Struct ParquetSource
pub struct ParquetSource {
    pub(crate) table_parquet_options: TableParquetOptions,
    pub(crate) metrics: ExecutionPlanMetricsSet,
    pub(crate) table_schema: Option<TableSchema>,
    pub(crate) predicate: Option<Arc<dyn PhysicalExpr>>,
    pub(crate) parquet_file_reader_factory: Option<Arc<dyn ParquetFileReaderFactory>>,
    pub(crate) schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
    pub(crate) batch_size: Option<usize>,
    pub(crate) metadata_size_hint: Option<usize>,
    pub(crate) projected_statistics: Option<Statistics>,
    pub(crate) encryption_factory: Option<Arc<dyn EncryptionFactory>>,
}

Execution plan for reading one or more Parquet files.

            ▲
            │
            │  Produce a stream of
            │  RecordBatches
            │
┌───────────────────────┐
│                       │
│     DataSourceExec    │
│                       │
└───────────────────────┘
            ▲
            │  Asynchronously read from one
            │  or more parquet files via
            │  ObjectStore interface
            │
            │
  .───────────────────.
 │                     )
 │`───────────────────'│
 │    ObjectStore      │
 │.───────────────────.│
 │                     )
  `───────────────────'

§Example: Create a DataSourceExec


// `object_store_url`, `file_schema`, and `predicate` are assumed to be in scope
let source = Arc::new(
    ParquetSource::default()
        .with_predicate(predicate)
);
// Create a DataSourceExec for reading `file1.parquet` with a file size of 100MB
let config = FileScanConfigBuilder::new(object_store_url, file_schema, source)
    .with_file(PartitionedFile::new("file1.parquet", 100 * 1024 * 1024))
    .build();
let exec = DataSourceExec::from_data_source(config);

§Features

Supports the following optimizations:

  • Concurrent reads: reads from one or more files in parallel as multiple partitions, including concurrently reading multiple row groups from a single file.

  • Predicate push down: skips row groups, pages, rows based on metadata and late materialization. See “Predicate Pushdown” below.

  • Projection pushdown: reads and decodes only the columns required.

  • Limit pushdown: stop execution early after some number of rows are read.

  • Custom readers: customize reading parquet files, e.g. to cache metadata, coalesce I/O operations, etc. See ParquetFileReaderFactory for more details.

  • Schema evolution: read parquet files with different schemas into a unified table schema. See SchemaAdapterFactory for more details.

  • metadata_size_hint: controls the number of bytes read from the end of the file in the initial I/O when the default ParquetFileReaderFactory is used. If a custom reader is used, it supplies the metadata directly and this parameter is ignored. See ParquetSource::with_metadata_size_hint for more details, and the configuration sketch after this list.

  • User provided ParquetAccessPlans to skip row groups and/or pages based on external information. See “Implementing External Indexes” below.
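
The following sketch is not part of the original documentation; it shows how several of these knobs can be set on a ParquetSource (all three builder methods are documented under “Implementations” below):

// A minimal sketch, assuming `ParquetSource` is in scope
let source = ParquetSource::default()
    // request the last 512 KiB of the file in the initial read so the footer
    // and page index usually arrive in a single request
    .with_metadata_size_hint(512 * 1024)
    // evaluate the predicate inside the parquet decoder (late materialization)
    .with_pushdown_filters(true)
    // allow the decoder to reorder predicate expressions to reduce evaluation cost
    .with_reorder_filters(true);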

§Predicate Pushdown

DataSourceExec uses the provided PhysicalExpr predicate as a filter to skip reading unnecessary data and improve query performance using several techniques:

  • Row group pruning: skips entire row groups based on min/max statistics found in ParquetMetaData and any Bloom filters that are present.

  • Page pruning: skips individual pages within a ColumnChunk using the Parquet PageIndex, if present.

  • Row filtering: skips rows within a page using a form of late materialization. When possible, predicates are applied by the parquet decoder during decode (see ArrowPredicate and RowFilter for more details). This is only enabled if ParquetScanOptions::pushdown_filters is set to true.

Note: If the predicate cannot be used to accelerate the scan, it is ignored (no error is raised on predicate evaluation errors).
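
As a concrete (hedged) illustration of supplying such a predicate, the sketch below builds a PhysicalExpr for `a > 5` using helpers from the datafusion-physical-expr crate; the import paths are assumptions that may differ between DataFusion versions:

use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema};
use datafusion::logical_expr::Operator;
use datafusion::physical_expr::expressions::{binary, col, lit};

// schema of the parquet file: a single Int32 column "a"
let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
// physical expression equivalent to `a > 5`
let predicate = binary(col("a", &schema)?, Operator::Gt, lit(5i32), &schema)?;
// attach it so row groups, pages, and rows that cannot match are skipped
let source = Arc::new(ParquetSource::default().with_predicate(predicate));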

§Example: rewriting DataSourceExec

You can modify a DataSourceExec using ParquetSource, for example to change files or add a predicate.


// Split a single DataSourceExec into multiple DataSourceExecs, one for each file
// (`parquet_exec()` is assumed to return an existing parquet DataSourceExec)
let exec = parquet_exec();
let data_source = exec.data_source();
let base_config = data_source.as_any().downcast_ref::<FileScanConfig>().unwrap();
let existing_file_groups = &base_config.file_groups;
let new_execs = existing_file_groups
    .iter()
    .map(|file_group| {
        // create a new exec by copying the existing exec's source config
        let new_config = FileScanConfigBuilder::from(base_config.clone())
            .with_file_groups(vec![file_group.clone()])
            .build();
        DataSourceExec::from_data_source(new_config)
    })
    .collect::<Vec<_>>();

§Implementing External Indexes

It is possible to restrict which row groups, and which selections within those row groups, the DataSourceExec will consider by providing an initial ParquetAccessPlan as an extension on PartitionedFile. This can be used to implement external indexes on top of parquet files and select only portions of the files.

The DataSourceExec will try to reduce any provided ParquetAccessPlan further based on the contents of ParquetMetaData and other settings.

§Example of providing a ParquetAccessPlan


// create an access plan to scan row groups 0, 1 and 3 and skip row groups 2 and 4
let mut access_plan = ParquetAccessPlan::new_all(5);
access_plan.skip(2);
access_plan.skip(4);
// provide the plan as an extension to the FileScanConfig
let partitioned_file = PartitionedFile::new("my_file.parquet", 1234)
    .with_extensions(Arc::new(access_plan));
// create a FileScanConfig to scan this file
let config = FileScanConfigBuilder::new(
        ObjectStoreUrl::local_filesystem(),
        schema(),
        Arc::new(ParquetSource::default()),
    )
    .with_file(partitioned_file)
    .build();
// this parquet DataSourceExec will not even try to read row groups 2 and 4.
// Additional pruning based on predicates may also happen
let exec = DataSourceExec::from_data_source(config);
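
An access plan can also select ranges of rows within a row group. The sketch below is an illustration that assumes the ParquetAccessPlan::scan_selection method and the RowSelection / RowSelector types from the parquet crate's arrow_reader module:

use parquet::arrow::arrow_reader::{RowSelection, RowSelector};

let mut access_plan = ParquetAccessPlan::new_all(2);
// within row group 1, skip the first 100 rows and scan the next 50
let selection = RowSelection::from(vec![
    RowSelector::skip(100),
    RowSelector::select(50),
]);
access_plan.scan_selection(1, selection);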

For a complete example, see the advanced_parquet_index example.

§Execution Overview

  • Step 1: DataSourceExec::execute is called, returning a FileStream configured to open parquet files with a ParquetOpener.

  • Step 2: When the stream is polled, the ParquetOpener is called to open the file.

  • Step 3: The ParquetOpener gets the ParquetMetaData (file metadata) via ParquetFileReaderFactory, creating a ParquetAccessPlan by applying predicates to metadata. The plan and projections are used to determine what pages must be read.

  • Step 4: The stream begins reading data, fetching the required parquet pages and incrementally decoding them, applying any row filters (see Self::with_pushdown_filters).

  • Step 5: As each RecordBatch is read, it may be adapted by a SchemaAdapter to match the table schema. By default missing columns are filled with nulls, but this can be customized via SchemaAdapterFactory.
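
To make these steps concrete, here is a minimal sketch (not from the original documentation) of driving such a plan to completion; it assumes a SessionContext named `ctx` and the `exec` built in the earlier example:

use datafusion::physical_plan::collect;

// `collect` calls DataSourceExec::execute for each partition and drains the
// resulting RecordBatch streams into a Vec
let task_ctx = ctx.task_ctx();
let batches = collect(exec, task_ctx).await?;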

Fields§

table_parquet_options: TableParquetOptions
metrics: ExecutionPlanMetricsSet
table_schema: Option<TableSchema>
predicate: Option<Arc<dyn PhysicalExpr>>
parquet_file_reader_factory: Option<Arc<dyn ParquetFileReaderFactory>>
schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>
batch_size: Option<usize>
metadata_size_hint: Option<usize>
projected_statistics: Option<Statistics>
encryption_factory: Option<Arc<dyn EncryptionFactory>>

Implementations§

impl ParquetSource

pub fn new(table_parquet_options: TableParquetOptions) -> ParquetSource

Create a new ParquetSource to read the data specified in the file scan configuration with the provided TableParquetOptions. If default values are going to be used, use ParquetSource::default() instead.

pub fn with_metadata_size_hint(self, metadata_size_hint: usize) -> ParquetSource

Set the metadata size hint

This value determines how many bytes at the end of the file the default ParquetFileReaderFactory will request in the initial IO. If this is too small, the ParquetSource will need to make additional IO requests to read the footer.

pub fn with_predicate(&self, predicate: Arc<dyn PhysicalExpr>) -> ParquetSource

Set predicate information

pub fn with_encryption_factory( self, encryption_factory: Arc<dyn EncryptionFactory>, ) -> ParquetSource

Set the encryption factory to use to generate file decryption properties

pub fn table_parquet_options(&self) -> &TableParquetOptions

Options passed to the parquet reader for this scan

pub fn predicate(&self) -> Option<&Arc<dyn PhysicalExpr>>

👎Deprecated since 50.2.0: use filter instead

Optional predicate.

pub fn parquet_file_reader_factory( &self, ) -> Option<&Arc<dyn ParquetFileReaderFactory>>

Return the optional file reader factory.

pub fn with_parquet_file_reader_factory( self, parquet_file_reader_factory: Arc<dyn ParquetFileReaderFactory>, ) -> ParquetSource

Optional user defined parquet file reader factory.

pub fn with_pushdown_filters(self, pushdown_filters: bool) -> ParquetSource

If true, the predicate will be used during the parquet scan. Defaults to false.

pub fn with_reorder_filters(self, reorder_filters: bool) -> ParquetSource

If true, the RowFilter made by pushdown_filters may try to minimize the cost of filter evaluation by reordering the predicate Exprs. If false, the predicates are applied in the same order as specified in the query. Defaults to false.

pub fn with_enable_page_index(self, enable_page_index: bool) -> ParquetSource

If enabled, the reader will read the page index. This is used to optimize filter pushdown via RowSelector and RowFilter by eliminating unnecessary IO and decoding.

pub fn with_bloom_filter_on_read( self, bloom_filter_on_read: bool, ) -> ParquetSource

If enabled, the reader will use bloom filters (when present) to skip row groups that cannot match the predicate.

pub fn with_bloom_filter_on_write( self, enable_bloom_filter_on_write: bool, ) -> ParquetSource

If enabled, the writer will write bloom filters when writing parquet files.

pub fn max_predicate_cache_size(&self) -> Option<usize>

Return the maximum predicate cache size, in bytes, used when pushdown_filters is enabled.

pub fn apply_schema_adapter( self, conf: &FileScanConfig, ) -> Result<Arc<dyn FileSource>, DataFusionError>

Applies schema adapter factory from the FileScanConfig if present.

§Arguments
  • conf - FileScanConfig that may contain a schema adapter factory
§Returns

The converted FileSource with the schema adapter factory applied, if one was provided.

Trait Implementations§

impl Clone for ParquetSource

fn clone(&self) -> ParquetSource

Returns a duplicate of the value.

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source.

impl Debug for ParquetSource

fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error>

Formats the value using the given formatter.

impl Default for ParquetSource

fn default() -> ParquetSource

Returns the “default value” for a type.

impl FileSource for ParquetSource

fn create_file_opener( &self, object_store: Arc<dyn ObjectStore>, base_config: &FileScanConfig, partition: usize, ) -> Arc<dyn FileOpener>

Creates a dyn FileOpener based on the given parameters.

fn as_any(&self) -> &(dyn Any + 'static)

Returns self as Any, so the source can be downcast to its concrete type.

fn filter(&self) -> Option<Arc<dyn PhysicalExpr>>

Returns the filter expression that will be applied during the file scan.
fn with_batch_size(&self, batch_size: usize) -> Arc<dyn FileSource>

Initialize new type with batch size configuration
fn with_schema(&self, schema: TableSchema) -> Arc<dyn FileSource>

Initialize new instance with a new schema
fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource>

Initialize new instance with projected statistics
fn with_projection(&self, _config: &FileScanConfig) -> Arc<dyn FileSource>

Initialize new instance with projection information
fn metrics(&self) -> &ExecutionPlanMetricsSet

Return execution plan metrics
fn statistics(&self) -> Result<Statistics, DataFusionError>

Return projected statistics
fn file_type(&self) -> &str

String representation of file source such as “csv”, “json”, “parquet”
fn fmt_extra( &self, t: DisplayFormatType, f: &mut Formatter<'_>, ) -> Result<(), Error>

Format FileType specific information
fn try_pushdown_filters( &self, filters: Vec<Arc<dyn PhysicalExpr>>, config: &ConfigOptions, ) -> Result<FilterPushdownPropagation<Arc<dyn FileSource>>, DataFusionError>

Try to push down filters into this FileSource. See ExecutionPlan::handle_child_pushdown_result for more details.
fn with_schema_adapter_factory( &self, schema_adapter_factory: Arc<dyn SchemaAdapterFactory>, ) -> Result<Arc<dyn FileSource>, DataFusionError>

Set optional schema adapter factory.
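
As a hedged example, the default factory can be attached explicitly (equivalent to the implicit default); the DefaultSchemaAdapterFactory name and its import path are assumptions that may vary by DataFusion version:

use datafusion::datasource::schema_adapter::DefaultSchemaAdapterFactory;

// the FileSource trait must be in scope to call this method on ParquetSource
let source = ParquetSource::default()
    .with_schema_adapter_factory(Arc::new(DefaultSchemaAdapterFactory))?;
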
fn schema_adapter_factory(&self) -> Option<Arc<dyn SchemaAdapterFactory>>

Returns the current schema adapter factory, if set.

fn repartitioned( &self, target_partitions: usize, repartition_file_min_size: usize, output_ordering: Option<LexOrdering>, config: &FileScanConfig, ) -> Result<Option<FileScanConfig>, DataFusionError>

If supported by the FileSource, redistribute files across partitions according to their size. Allows custom file formats to implement their own repartitioning logic.

impl From<ParquetSource> for Arc<dyn FileSource>

Allows easy conversion from ParquetSource to Arc<dyn FileSource>

fn from(source: ParquetSource) -> Arc<dyn FileSource>

Converts to this type from the input type.
