datafusion_datasource_parquet/metrics.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

use datafusion_physical_plan::metrics::{
    Count, ExecutionPlanMetricsSet, MetricBuilder, MetricType, PruningMetrics, Time,
};

/// Stores metrics about the parquet execution for a particular parquet file.
///
/// This component is subject to **change** in the near future and is exposed for low-level
/// integrations through [`ParquetFileReaderFactory`].
///
/// [`ParquetFileReaderFactory`]: super::ParquetFileReaderFactory
#[derive(Debug, Clone)]
pub struct ParquetFileMetrics {
    /// Number of file **ranges** pruned or matched by partition or file level statistics.
    /// Pruning of files often happens at planning time but may happen at execution time
    /// if dynamic filters (e.g. from a join) result in additional pruning.
    ///
    /// This does **not** necessarily equal the number of files pruned:
    /// files may be scanned in sub-ranges to increase parallelism,
    /// in which case this will represent the number of sub-ranges pruned, not the number of files.
    /// The number of files pruned will always be less than or equal to this number.
    ///
    /// A single file may have some ranges that are pruned and some that are not.
    /// For example, with a query like `ORDER BY col LIMIT 10`, the TopK dynamic filter
    /// pushdown optimization may fill up the TopK heap when reading the first part of a file,
    /// then skip the second part if file statistics indicate it cannot contain rows
    /// that would be in the TopK.
    pub files_ranges_pruned_statistics: PruningMetrics,
    /// Number of times the predicate could not be evaluated
    pub predicate_evaluation_errors: Count,
    /// Number of row groups whose bloom filters were checked, tracked with matched/pruned counts
    pub row_groups_pruned_bloom_filter: PruningMetrics,
    /// Number of row groups whose statistics were checked, tracked with matched/pruned counts
    pub row_groups_pruned_statistics: PruningMetrics,
    /// Total number of bytes scanned
    pub bytes_scanned: Count,
    /// Total rows filtered out by predicates pushed into the parquet scan
    pub pushdown_rows_pruned: Count,
    /// Total rows that passed predicates pushed into the parquet scan
    pub pushdown_rows_matched: Count,
    /// Total time spent evaluating row-level pushdown filters
    pub row_pushdown_eval_time: Time,
    /// Total time spent evaluating row group-level statistics filters
    pub statistics_eval_time: Time,
    /// Total time spent evaluating row group Bloom Filters
    pub bloom_filter_eval_time: Time,
    /// Total rows filtered or matched by the parquet page index
    pub page_index_rows_pruned: PruningMetrics,
    /// Total time spent evaluating parquet page index filters
    pub page_index_eval_time: Time,
    /// Total time spent reading and parsing metadata from the footer
    pub metadata_load_time: Time,
    /// Predicate Cache: number of records read directly from the inner reader.
    /// This is the number of rows decoded while evaluating predicates.
    pub predicate_cache_inner_records: Count,
    /// Predicate Cache: number of records read from the cache. This is the
    /// number of rows that were stored in the cache after evaluating predicates
    /// and then reused for the output.
    pub predicate_cache_records: Count,
}

impl ParquetFileMetrics {
    /// Create new metrics
    pub fn new(
        partition: usize,
        filename: &str,
        metrics: &ExecutionPlanMetricsSet,
    ) -> Self {
        // -----------------------
        // 'summary' level metrics
        // -----------------------
        let row_groups_pruned_bloom_filter = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .with_type(MetricType::SUMMARY)
            .pruning_metrics("row_groups_pruned_bloom_filter", partition);

        let row_groups_pruned_statistics = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .with_type(MetricType::SUMMARY)
            .pruning_metrics("row_groups_pruned_statistics", partition);

        let page_index_rows_pruned = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .with_type(MetricType::SUMMARY)
            .pruning_metrics("page_index_rows_pruned", partition);

        let bytes_scanned = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .with_type(MetricType::SUMMARY)
            .counter("bytes_scanned", partition);

        let metadata_load_time = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .with_type(MetricType::SUMMARY)
            .subset_time("metadata_load_time", partition);

        let files_ranges_pruned_statistics = MetricBuilder::new(metrics)
            .with_type(MetricType::SUMMARY)
            .pruning_metrics("files_ranges_pruned_statistics", partition);

        // -----------------------
        // 'dev' level metrics
        // -----------------------
        let predicate_evaluation_errors = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .counter("predicate_evaluation_errors", partition);

        let pushdown_rows_pruned = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .counter("pushdown_rows_pruned", partition);
        let pushdown_rows_matched = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .counter("pushdown_rows_matched", partition);

        let row_pushdown_eval_time = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .subset_time("row_pushdown_eval_time", partition);
        let statistics_eval_time = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .subset_time("statistics_eval_time", partition);
        let bloom_filter_eval_time = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .subset_time("bloom_filter_eval_time", partition);

        let page_index_eval_time = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .subset_time("page_index_eval_time", partition);

        let predicate_cache_inner_records = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .counter("predicate_cache_inner_records", partition);

        let predicate_cache_records = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .counter("predicate_cache_records", partition);

        Self {
            files_ranges_pruned_statistics,
            predicate_evaluation_errors,
            row_groups_pruned_bloom_filter,
            row_groups_pruned_statistics,
            bytes_scanned,
            pushdown_rows_pruned,
            pushdown_rows_matched,
            row_pushdown_eval_time,
            page_index_rows_pruned,
            statistics_eval_time,
            bloom_filter_eval_time,
            page_index_eval_time,
            metadata_load_time,
            predicate_cache_inner_records,
            predicate_cache_records,
        }
    }
}
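
// The test below is an illustrative sketch (not part of the upstream test
// suite) showing how a reader integration might record values against these
// per-file metrics. It relies only on `ExecutionPlanMetricsSet::new`,
// `Count::add`, and `Count::value` from `datafusion_physical_plan::metrics`;
// the file name and byte counts are made up for the example.
#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[test]
    fn records_bytes_scanned_per_file() {
        let metrics = ExecutionPlanMetricsSet::new();
        // Hypothetical partition index and file name, for illustration only.
        let file_metrics = ParquetFileMetrics::new(0, "part-0.parquet", &metrics);

        // A reader would typically add the length of each byte range it fetches.
        file_metrics.bytes_scanned.add(1024);
        file_metrics.bytes_scanned.add(512);

        assert_eq!(file_metrics.bytes_scanned.value(), 1536);
    }
}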