datafusion_datasource_parquet/metrics.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use datafusion_physical_plan::metrics::{
    Count, ExecutionPlanMetricsSet, MetricBuilder, MetricType, PruningMetrics, Time,
};

/// Stores metrics about the parquet execution for a particular parquet file.
///
/// This component is subject to **change** in the near future and is exposed for low level
/// integrations through [`ParquetFileReaderFactory`].
///
/// [`ParquetFileReaderFactory`]: super::ParquetFileReaderFactory
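///
/// # Example
///
/// A minimal sketch of creating per-file metrics and recording into them.
/// It assumes the `Count::add` and `Time::timer` APIs from
/// [`datafusion_physical_plan::metrics`]; the partition index and file name
/// are illustrative:
///
/// ```ignore
/// use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
///
/// let metrics = ExecutionPlanMetricsSet::new();
/// let file_metrics = ParquetFileMetrics::new(0, "data/part-0.parquet", &metrics);
///
/// // Counters are cheap, shared handles
/// file_metrics.bytes_scanned.add(4096);
///
/// // Timers record the elapsed time when the guard is dropped
/// {
///     let _timer = file_metrics.metadata_load_time.timer();
///     // ... read and parse the footer here ...
/// }
/// ```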
#[derive(Debug, Clone)]
pub struct ParquetFileMetrics {
    /// Number of file **ranges** pruned or matched by partition or file level statistics.
    /// Pruning of files often happens at planning time but may happen at execution time
    /// if dynamic filters (e.g. from a join) result in additional pruning.
    ///
    /// This does **not** necessarily equal the number of files pruned:
    /// files may be scanned in sub-ranges to increase parallelism,
    /// in which case this represents the number of sub-ranges pruned, not the number of files.
    /// The number of files pruned will always be less than or equal to this number.
    ///
    /// A single file may have some ranges that are pruned and others that are not.
    /// For example, with a query like `ORDER BY col LIMIT 10`, the TopK dynamic filter
    /// pushdown optimization may fill up the TopK heap when reading the first part of a file,
    /// then skip the second part if file statistics indicate it cannot contain rows
    /// that would be in the TopK.
    pub files_ranges_pruned_statistics: PruningMetrics,
    /// Number of times the predicate could not be evaluated
    pub predicate_evaluation_errors: Count,
    /// Number of row groups whose bloom filters were checked, tracked with matched/pruned counts
    pub row_groups_pruned_bloom_filter: PruningMetrics,
    /// Number of row groups whose statistics were checked, tracked with matched/pruned counts
    pub row_groups_pruned_statistics: PruningMetrics,
    /// Total number of bytes scanned
    pub bytes_scanned: Count,
    /// Total rows filtered out by predicates pushed into the parquet scan
    pub pushdown_rows_pruned: Count,
    /// Total rows that passed predicates pushed into the parquet scan
    pub pushdown_rows_matched: Count,
    /// Total time spent evaluating row-level pushdown filters
    pub row_pushdown_eval_time: Time,
    /// Total time spent evaluating row group-level statistics filters
    pub statistics_eval_time: Time,
    /// Total time spent evaluating row group Bloom Filters
    pub bloom_filter_eval_time: Time,
    /// Total rows filtered or matched by the parquet page index
    pub page_index_rows_pruned: PruningMetrics,
    /// Total time spent evaluating parquet page index filters
    pub page_index_eval_time: Time,
    /// Total time spent reading and parsing metadata from the footer
    pub metadata_load_time: Time,
    /// Predicate Cache: number of records read directly from the inner reader.
    /// This is the number of rows decoded while evaluating predicates.
    pub predicate_cache_inner_records: Count,
    /// Predicate Cache: number of records read from the cache. This is the
    /// number of rows that were stored in the cache while evaluating predicates
    /// and then reused for the output.
    pub predicate_cache_records: Count,
}

impl ParquetFileMetrics {
    /// Create new metrics
    pub fn new(
        partition: usize,
        filename: &str,
        metrics: &ExecutionPlanMetricsSet,
    ) -> Self {
        // -----------------------
        // 'summary' level metrics
        // -----------------------
        let row_groups_pruned_bloom_filter = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .with_type(MetricType::SUMMARY)
            .pruning_metrics("row_groups_pruned_bloom_filter", partition);

        let row_groups_pruned_statistics = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .with_type(MetricType::SUMMARY)
            .pruning_metrics("row_groups_pruned_statistics", partition);

        let page_index_rows_pruned = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .with_type(MetricType::SUMMARY)
            .pruning_metrics("page_index_rows_pruned", partition);

        let bytes_scanned = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .with_type(MetricType::SUMMARY)
            .counter("bytes_scanned", partition);

        let metadata_load_time = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .with_type(MetricType::SUMMARY)
            .subset_time("metadata_load_time", partition);

        // Note: unlike the metrics above, this metric is not labeled with the filename
        let files_ranges_pruned_statistics = MetricBuilder::new(metrics)
            .with_type(MetricType::SUMMARY)
            .pruning_metrics("files_ranges_pruned_statistics", partition);

        // -----------------------
        // 'dev' level metrics
        // -----------------------
        let predicate_evaluation_errors = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .counter("predicate_evaluation_errors", partition);

        let pushdown_rows_pruned = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .counter("pushdown_rows_pruned", partition);
        let pushdown_rows_matched = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .counter("pushdown_rows_matched", partition);

        let row_pushdown_eval_time = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .subset_time("row_pushdown_eval_time", partition);
        let statistics_eval_time = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .subset_time("statistics_eval_time", partition);
        let bloom_filter_eval_time = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .subset_time("bloom_filter_eval_time", partition);

        let page_index_eval_time = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .subset_time("page_index_eval_time", partition);

        let predicate_cache_inner_records = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .counter("predicate_cache_inner_records", partition);

        let predicate_cache_records = MetricBuilder::new(metrics)
            .with_new_label("filename", filename.to_string())
            .counter("predicate_cache_records", partition);

        Self {
            files_ranges_pruned_statistics,
            predicate_evaluation_errors,
            row_groups_pruned_bloom_filter,
            row_groups_pruned_statistics,
            bytes_scanned,
            pushdown_rows_pruned,
            pushdown_rows_matched,
            row_pushdown_eval_time,
            page_index_rows_pruned,
            statistics_eval_time,
            bloom_filter_eval_time,
            page_index_eval_time,
            metadata_load_time,
            predicate_cache_inner_records,
            predicate_cache_records,
        }
    }
}
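
// A minimal smoke-test sketch: it assumes the `Count::add` / `Count::value`
// APIs from `datafusion_physical_plan::metrics`; the file names and
// partition indexes are illustrative.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn per_file_metrics_record_independently() {
        let metrics = ExecutionPlanMetricsSet::new();
        let a = ParquetFileMetrics::new(0, "a.parquet", &metrics);
        let b = ParquetFileMetrics::new(1, "b.parquet", &metrics);

        // Metrics registered against the same set are still recorded
        // per partition / file
        a.bytes_scanned.add(100);
        b.bytes_scanned.add(200);

        assert_eq!(a.bytes_scanned.value(), 100);
        assert_eq!(b.bytes_scanned.value(), 200);
    }
}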