DFParquetMetadata

Struct DFParquetMetadata 

Source
pub struct DFParquetMetadata<'a> {
    store: &'a (dyn ObjectStore + 'static),
    object_meta: &'a ObjectMeta,
    metadata_size_hint: Option<usize>,
    decryption_properties: Option<Arc<FileDecryptionProperties>>,
    file_metadata_cache: Option<Arc<dyn FileMetadataCache<Extra = ObjectMeta>>>,
    pub coerce_int96: Option<TimeUnit>,
}
Expand description

Handles fetching Parquet file schema, metadata and statistics from object store.

This component is exposed for low level integrations through ParquetFileReaderFactory.

Fields§

§store: &'a (dyn ObjectStore + 'static)§object_meta: &'a ObjectMeta§metadata_size_hint: Option<usize>§decryption_properties: Option<Arc<FileDecryptionProperties>>§file_metadata_cache: Option<Arc<dyn FileMetadataCache<Extra = ObjectMeta>>>§coerce_int96: Option<TimeUnit>

Time unit to coerce INT96 timestamps to.

Implementations§

Source§

impl<'a> DFParquetMetadata<'a>

Source

pub fn new( store: &'a (dyn ObjectStore + 'static), object_meta: &'a ObjectMeta, ) -> DFParquetMetadata<'a>

Source

pub fn with_metadata_size_hint( self, metadata_size_hint: Option<usize>, ) -> DFParquetMetadata<'a>

Set the metadata size hint.

Source

pub fn with_decryption_properties( self, decryption_properties: Option<Arc<FileDecryptionProperties>>, ) -> DFParquetMetadata<'a>

Set the decryption properties.

Source

pub fn with_file_metadata_cache( self, file_metadata_cache: Option<Arc<dyn FileMetadataCache<Extra = ObjectMeta>>>, ) -> DFParquetMetadata<'a>

Set the file metadata cache.

Source

pub fn with_coerce_int96( self, time_unit: Option<TimeUnit>, ) -> DFParquetMetadata<'a>

Set the time unit to coerce INT96 timestamps to.

Source

pub async fn fetch_metadata( &self, ) -> Result<Arc<ParquetMetaData>, DataFusionError>

Fetch parquet metadata from the remote object store

Source

pub async fn fetch_schema(&self) -> Result<Schema, DataFusionError>

Read and parse the schema of the Parquet file

Source

pub async fn fetch_statistics( &self, table_schema: &Arc<Schema>, ) -> Result<Statistics, DataFusionError>

Fetch the metadata from the Parquet file via Self::fetch_metadata, then convert the statistics in that metadata using Self::statistics_from_parquet_metadata.

Source

pub fn statistics_from_parquet_metadata( metadata: &ParquetMetaData, table_schema: &Arc<Schema>, ) -> Result<Statistics, DataFusionError>

Convert statistics in [ParquetMetaData] into Statistics using [StatisticsConverter]

The statistics are calculated for each column in the table schema using the row group statistics in the parquet metadata.

§Key behaviors:
  1. Extracts row counts and byte sizes from all row groups
  2. Applies schema type coercions to align file schema with table schema
  3. Collects and aggregates statistics across row groups when available
§When there are no statistics:

If the Parquet file doesn’t contain any statistics (has_statistics is false), the function returns a Statistics object with:

  • Exact row count
  • Exact byte size
  • All column statistics marked as unknown via Statistics::unknown_column(&table_schema)
§When only some columns have statistics:

For columns with statistics:

  • Min/max values are properly extracted and represented as Precision::Exact
  • Null counts are calculated by summing across row groups

For columns without statistics:

  • For min/max, there are two situations:
    1. If the column isn’t in the Arrow schema, min/max values are set to Precision::Absent
    2. If the column is in the Arrow schema but not in the Parquet schema (due to schema evolution), min/max values are set to Precision::Exact(null)
  • Null counts are set to Precision::Exact(num_rows) (conservatively assuming all values could be null)

Trait Implementations§

Source§

impl<'a> Debug for DFParquetMetadata<'a>

Source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error>

Formats the value using the given formatter. Read more

Auto Trait Implementations§

§

impl<'a> Freeze for DFParquetMetadata<'a>

§

impl<'a> !RefUnwindSafe for DFParquetMetadata<'a>

§

impl<'a> Send for DFParquetMetadata<'a>

§

impl<'a> Sync for DFParquetMetadata<'a>

§

impl<'a> Unpin for DFParquetMetadata<'a>

§

impl<'a> !UnwindSafe for DFParquetMetadata<'a>

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

§

impl<T> Instrument for T

§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided [Span], returning an Instrumented wrapper. Read more
§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> IntoEither for T

Source§

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
§

impl<T> PolicyExt for T
where T: ?Sized,

§

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns [Action::Follow] only if self and other return Action::Follow. Read more
§

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns [Action::Follow] if either self or other returns Action::Follow. Read more
Source§

impl<T> Same for T

Source§

type Output = T

Should always be Self
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

§

fn vzip(self) -> V

§

impl<T> WithSubscriber for T

§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a [WithDispatch] wrapper. Read more
§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a [WithDispatch] wrapper. Read more
§

impl<T> ErasedDestructor for T
where T: 'static,