pub struct CsvFormat {
options: CsvOptions,
}
Character Separated Value FileFormat implementation.
Fields
options: CsvOptions
Implementations
impl CsvFormat
pub async fn read_to_delimited_chunks_from_stream<'a>(
    &self,
    stream: Pin<Box<dyn Stream<Item = Result<Bytes, DataFusionError>> + Send + 'a>>,
) -> Pin<Box<dyn Stream<Item = Result<Bytes, DataFusionError>> + Send + 'a>>
Convert a stream of bytes into a stream of [Bytes] containing newline-delimited CSV records, while accounting for \ and ".
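A hypothetical end-to-end sketch of this method (assuming the datafusion, bytes, futures, and tokio crates and the import paths below); the two input chunks deliberately split a record in half so the re-chunking is visible:

```rust
use bytes::Bytes;
use datafusion::datasource::file_format::csv::CsvFormat;
use datafusion::error::DataFusionError;
use futures::{stream, TryStreamExt};

#[tokio::main]
async fn main() -> Result<(), DataFusionError> {
    let format = CsvFormat::default();

    // Arbitrary byte chunks; the boundary falls in the middle of a record.
    let input = stream::iter(vec![
        Ok::<_, DataFusionError>(Bytes::from("a,b,c\n1,2")),
        Ok(Bytes::from(",3\n4,5,6\n")),
    ]);

    // Re-chunk the stream so each emitted `Bytes` holds whole, newline-delimited records.
    let rechunked = format
        .read_to_delimited_chunks_from_stream(Box::pin(input))
        .await;

    let chunks: Vec<Bytes> = rechunked.try_collect().await?;
    for chunk in &chunks {
        println!("{chunk:?}");
    }
    Ok(())
}
```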
pub fn with_options(self, options: CsvOptions) -> CsvFormat
Set the CSV options.
pub fn options(&self) -> &CsvOptions
Retrieve the CSV options.
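For example, a CsvFormat can be configured from a CsvOptions value up front and inspected later. A minimal sketch, assuming CsvOptions is the struct from datafusion::config with public delimiter and has_header fields:

```rust
use datafusion::config::CsvOptions;
use datafusion::datasource::file_format::csv::CsvFormat;

fn main() {
    // Start from the defaults and override a couple of fields.
    let options = CsvOptions {
        delimiter: b';',
        has_header: Some(false),
        ..Default::default()
    };

    let format = CsvFormat::default().with_options(options);

    // `options()` returns a reference to the stored CsvOptions.
    assert_eq!(format.options().delimiter, b';');
    assert_eq!(format.options().has_header, Some(false));
}
```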
pub fn with_schema_infer_max_rec(self, max_rec: usize) -> CsvFormat
Set a limit on the number of records to scan when inferring the schema.
- defaults to DEFAULT_SCHEMA_INFER_MAX_RECORD
pub fn with_has_header(self, has_header: bool) -> CsvFormat
Set to true to indicate that the first line is a header.
- defaults to true
pub fn with_truncated_rows(self, truncated_rows: bool) -> CsvFormat
pub fn with_null_regex(self, null_regex: Option<String>) -> CsvFormat
Set the regex used to match null values in the CSV reader.
- defaults to treating empty values as null
pub fn has_header(&self) -> Option<bool>
Returns Some(true) if the first line is a header, Some(false) if
it is not, and None if it is not specified.
pub fn with_comment(self, comment: Option<u8>) -> CsvFormat
Lines beginning with this byte are ignored.
pub fn with_delimiter(self, delimiter: u8) -> CsvFormat
The character separating values within a row.
- defaults to ','
pub fn with_quote(self, quote: u8) -> CsvFormat
The quote character in a row.
- defaults to '"'
pub fn with_escape(self, escape: Option<u8>) -> CsvFormat
The escape character in a row.
- defaults to None
pub fn with_terminator(self, terminator: Option<u8>) -> CsvFormat
The character used to indicate the end of a row.
- defaults to None (CRLF)
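The setters above chain as a builder. A hedged sketch using only methods listed on this page (the file layout described in the comments is hypothetical):

```rust
use datafusion::datasource::file_format::csv::CsvFormat;

fn main() {
    // A semicolon-delimited file with '#' comment lines, no header row,
    // backslash escapes, and the literal strings NULL/null treated as SQL NULL.
    let format = CsvFormat::default()
        .with_has_header(false)
        .with_delimiter(b';')
        .with_quote(b'"')
        .with_escape(Some(b'\\'))
        .with_comment(Some(b'#'))
        .with_terminator(None)
        .with_null_regex(Some("NULL|null".to_string()));

    assert_eq!(format.has_header(), Some(false));
}
```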
pub fn with_newlines_in_values(self, newlines_in_values: bool) -> CsvFormat
Specifies whether newlines in (quoted) values are supported.
Parsing newlines in quoted values may be affected by execution behaviour such as
parallel file scanning. Setting this to true ensures that newlines in values are
parsed successfully, which may reduce performance.
The default behaviour depends on the datafusion.catalog.newlines_in_values setting.
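As a sketch, the same behaviour can be requested per format or session-wide through the configuration key quoted above (SessionConfig::set_bool is assumed to accept that key):

```rust
use datafusion::datasource::file_format::csv::CsvFormat;
use datafusion::prelude::{SessionConfig, SessionContext};

fn main() {
    // Per-format: quoted values may contain embedded newlines.
    let format = CsvFormat::default().with_newlines_in_values(true);
    let _ = format;

    // Session-wide: the datafusion.catalog.newlines_in_values setting.
    let config = SessionConfig::new().set_bool("datafusion.catalog.newlines_in_values", true);
    let _ctx = SessionContext::new_with_config(config);
}
```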
pub fn with_file_compression_type(
    self,
    file_compression_type: FileCompressionType,
) -> CsvFormat
Set a FileCompressionType for the CSV files.
- defaults to FileCompressionType::UNCOMPRESSED
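For instance, gzipped CSV files can be read by pairing the format with GZIP decompression. A sketch, assuming the FileCompressionType import path below:

```rust
use datafusion::datasource::file_format::csv::CsvFormat;
use datafusion::datasource::file_format::file_compression_type::FileCompressionType;

fn main() {
    // Decompress .csv.gz objects before CSV parsing.
    let format = CsvFormat::default()
        .with_file_compression_type(FileCompressionType::GZIP);
    let _ = format;
}
```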
pub fn with_truncate_rows(self, truncate_rows: bool) -> CsvFormat
Set whether rows should be truncated to the column width
- defaults to false
impl CsvFormat
pub async fn infer_schema_from_stream(
    &self,
    state: &dyn Session,
    records_to_read: usize,
    stream: impl Stream<Item = Result<Bytes, DataFusionError>>,
) -> Result<(Schema, usize), DataFusionError>
Return the inferred schema and the number of lines read, reading up to records_to_read records from a stream of delimited chunks.
This method can handle CSV files with different numbers of columns. The inferred schema will be the union of all columns found across all files. Files with fewer columns will have missing columns filled with null values.
Example
If you have two CSV files:
file1.csv: col1,col2,col3
file2.csv: col1,col2,col3,col4,col5
The inferred schema will contain all 5 columns, with files that don’t have columns 4 and 5 having null values for those columns.
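A hypothetical sketch of the scenario above, feeding two in-memory chunks (standing in for the two files) to infer_schema_from_stream via a SessionContext's state; exact per-chunk header handling may differ between versions:

```rust
use bytes::Bytes;
use datafusion::datasource::file_format::csv::CsvFormat;
use datafusion::error::DataFusionError;
use datafusion::prelude::SessionContext;
use futures::stream;

#[tokio::main]
async fn main() -> Result<(), DataFusionError> {
    let ctx = SessionContext::new();
    let state = ctx.state();

    // One chunk per "file", with different column counts.
    let chunks = stream::iter(vec![
        Ok::<_, DataFusionError>(Bytes::from("col1,col2,col3\n1,2,3\n")),
        Ok(Bytes::from("col1,col2,col3,col4,col5\n1,2,3,4,5\n")),
    ]);

    let format = CsvFormat::default().with_has_header(true);
    let (schema, records_read) = format
        .infer_schema_from_stream(&state, 100, chunks)
        .await?;

    // Expect the union of columns (col1..col5) per the description above.
    println!("inferred {} columns from {records_read} records", schema.fields().len());
    Ok(())
}
```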
Trait Implementations
impl FileFormat for CsvFormat
fn as_any(&self) -> &(dyn Any + 'static)
Returns the format as Any so that it can be downcast to a specific implementation.
fn get_ext_with_compression(
    &self,
    file_compression_type: &FileCompressionType,
) -> Result<String, DataFusionError>
fn compression_type(&self) -> Option<FileCompressionType>
fn infer_schema<'life0, 'life1, 'life2, 'life3, 'async_trait>(
    &'life0 self,
    state: &'life1 dyn Session,
    store: &'life2 Arc<dyn ObjectStore>,
    objects: &'life3 [ObjectMeta],
) -> Pin<Box<dyn Future<Output = Result<Arc<Schema>, DataFusionError>> + Send + 'async_trait>>
where
    'life0: 'async_trait,
    'life1: 'async_trait,
    'life2: 'async_trait,
    'life3: 'async_trait,
    CsvFormat: 'async_trait,
fn infer_stats<'life0, 'life1, 'life2, 'life3, 'async_trait>(
    &'life0 self,
    _state: &'life1 dyn Session,
    _store: &'life2 Arc<dyn ObjectStore>,
    table_schema: Arc<Schema>,
    _object: &'life3 ObjectMeta,
) -> Pin<Box<dyn Future<Output = Result<Statistics, DataFusionError>> + Send + 'async_trait>>
where
    'life0: 'async_trait,
    'life1: 'async_trait,
    'life2: 'async_trait,
    'life3: 'async_trait,
    CsvFormat: 'async_trait,
fn create_physical_plan<'life0, 'life1, 'async_trait>(
    &'life0 self,
    state: &'life1 dyn Session,
    conf: FileScanConfig,
) -> Pin<Box<dyn Future<Output = Result<Arc<dyn ExecutionPlan>, DataFusionError>> + Send + 'async_trait>>
where
    'life0: 'async_trait,
    'life1: 'async_trait,
    CsvFormat: 'async_trait,
fn create_writer_physical_plan<'life0, 'life1, 'async_trait>(
    &'life0 self,
    input: Arc<dyn ExecutionPlan>,
    state: &'life1 dyn Session,
    conf: FileSinkConfig,
    order_requirements: Option<LexRequirement>,
) -> Pin<Box<dyn Future<Output = Result<Arc<dyn ExecutionPlan>, DataFusionError>> + Send + 'async_trait>>
where
    'life0: 'async_trait,
    'life1: 'async_trait,
    CsvFormat: 'async_trait,
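In practice these trait methods are usually driven by the listing-table machinery rather than called directly. A hedged end-to-end sketch (the "data/" directory and table name are hypothetical; infer_schema runs at registration and create_physical_plan when the table is scanned):

```rust
use std::sync::Arc;

use datafusion::datasource::file_format::csv::CsvFormat;
use datafusion::datasource::listing::ListingOptions;
use datafusion::error::Result;
use datafusion::prelude::SessionContext;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();

    // Wrap CsvFormat in ListingOptions so it is used for schema inference and scans.
    let format = CsvFormat::default().with_has_header(true);
    let options = ListingOptions::new(Arc::new(format)).with_file_extension(".csv");

    ctx.register_listing_table("example", "data/", options, None, None)
        .await?;

    let df = ctx.sql("SELECT * FROM example LIMIT 5").await?;
    df.show().await?;
    Ok(())
}
```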
fn file_source(&self) -> Arc<dyn FileSource>
Return the related FileSource, such as CsvSource, JsonSource, etc.
Auto Trait Implementations
impl Freeze for CsvFormat
impl RefUnwindSafe for CsvFormat
impl Send for CsvFormat
impl Sync for CsvFormat
impl Unpin for CsvFormat
impl UnwindSafe for CsvFormat
Blanket Implementations
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
impl<T> Instrument for T
fn instrument(self, span: Span) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true.
Converts self into a Right variant of Either<Self, Self> otherwise.
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self> otherwise.