datafusion_optimizer/plan_signature.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::{
19 collections::hash_map::DefaultHasher,
20 hash::{Hash, Hasher},
21 num::NonZeroUsize,
22};
23
24use datafusion_common::tree_node::TreeNodeRecursion;
25use datafusion_expr::LogicalPlan;
26
27/// Non-unique identifier of a [`LogicalPlan`].
28///
29/// See [`LogicalPlanSignature::new`] for details.
30#[derive(Clone, Copy, PartialEq, Eq, Hash)]
31pub struct LogicalPlanSignature {
32 node_number: NonZeroUsize,
33 plan_hash: u64,
34}
35
36impl LogicalPlanSignature {
37 /// Returns [`LogicalPlanSignature`] of the given [`LogicalPlan`].
38 ///
39 /// It is a kind of [`LogicalPlan`] hashing with stronger guarantees.
40 ///
41 /// # Guarantees
42 ///
43 /// Consider two [`LogicalPlan`]s `p1` and `p2`.
44 ///
45 /// If `p1` and `p2` have a different number of [`LogicalPlan`]s, then
46 /// they will have different [`LogicalPlanSignature`]s.
47 ///
48 /// If `p1` and `p2` have a different [`Hash`], then
49 /// they will have different [`LogicalPlanSignature`]s.
50 ///
51 /// # Caveats
52 ///
53 /// The intention of [`LogicalPlanSignature`] is to have a lower chance
54 /// of hash collisions.
55 ///
56 /// There exist different [`LogicalPlan`]s with the same
57 /// [`LogicalPlanSignature`].
58 ///
59 /// When two [`LogicalPlan`]s differ only in metadata, then they will have
60 /// the same [`LogicalPlanSignature`]s (due to hash implementation in
61 /// [`LogicalPlan`]).
62 pub fn new(plan: &LogicalPlan) -> Self {
63 let mut hasher = DefaultHasher::new();
64 plan.hash(&mut hasher);
65
66 Self {
67 node_number: get_node_number(plan),
68 plan_hash: hasher.finish(),
69 }
70 }
71}
72
73/// Get total number of [`LogicalPlan`]s in the plan.
74fn get_node_number(plan: &LogicalPlan) -> NonZeroUsize {
75 let mut node_number = 0;
76 plan.apply_with_subqueries(|_plan| {
77 node_number += 1;
78 Ok(TreeNodeRecursion::Continue)
79 })
80 // Closure always return Ok
81 .unwrap();
82 // Visitor must have at least visited the root,
83 // so v.node_number is at least 1.
84 NonZeroUsize::new(node_number).unwrap()
85}
86
87#[cfg(test)]
88mod tests {
89 use std::sync::Arc;
90
91 use datafusion_common::{DFSchema, Result};
92 use datafusion_expr::{lit, LogicalPlan};
93
94 use crate::plan_signature::get_node_number;
95
96 #[test]
97 fn node_number_for_some_plan() -> Result<()> {
98 let schema = Arc::new(DFSchema::empty());
99
100 let one_node_plan =
101 Arc::new(LogicalPlan::EmptyRelation(datafusion_expr::EmptyRelation {
102 produce_one_row: false,
103 schema: Arc::clone(&schema),
104 }));
105
106 assert_eq!(1, get_node_number(&one_node_plan).get());
107
108 let two_node_plan = Arc::new(LogicalPlan::Projection(
109 datafusion_expr::Projection::try_new(vec![lit(1), lit(2)], one_node_plan)?,
110 ));
111
112 assert_eq!(2, get_node_number(&two_node_plan).get());
113
114 let five_node_plan = Arc::new(LogicalPlan::Union(datafusion_expr::Union {
115 inputs: vec![Arc::clone(&two_node_plan), two_node_plan],
116 schema,
117 }));
118
119 assert_eq!(5, get_node_number(&five_node_plan).get());
120
121 Ok(())
122 }
123}