datafusion/
lib.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#![doc(
19    html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
20    html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
21)]
22#![cfg_attr(docsrs, feature(doc_cfg))]
23// Make sure fast / cheap clones on Arc are explicit:
24// https://github.com/apache/datafusion/issues/11143
25//
26// Eliminate unnecessary function calls (some may not be cheap) due to `xxx_or`
27// for performance. Also avoid abusing `xxx_or_else` for readability:
28// https://github.com/apache/datafusion/issues/15802
29#![cfg_attr(
30    not(test),
31    deny(
32        clippy::clone_on_ref_ptr,
33        clippy::or_fun_call,
34        clippy::unnecessary_lazy_evaluations
35    )
36)]
37#![warn(missing_docs, clippy::needless_borrow)]
38
39//! [DataFusion] is an extensible query engine written in Rust that
40//! uses [Apache Arrow] as its in-memory format. DataFusion's target users are
41//! developers building fast and feature rich database and analytic systems,
42//! customized to particular workloads. Please see the [DataFusion website] for
43//! additional documentation, [use cases] and examples.
44//!
45//! "Out of the box," DataFusion offers [SQL] and [`DataFrame`] APIs,
46//! excellent [performance], built-in support for CSV, Parquet, JSON, and Avro,
47//! extensive customization, and a great community.
48//! [Python Bindings] are also available.
49//!
50//! DataFusion features a full query planner, a columnar, streaming, multi-threaded,
51//! vectorized execution engine, and partitioned data sources. You can
52//! customize DataFusion at almost all points including additional data sources,
53//! query languages, functions, custom operators and more.
54//! See the [Architecture] section below for more details.
55//!
56//! [DataFusion]: https://datafusion.apache.org/
57//! [DataFusion website]: https://datafusion.apache.org
58//! [Apache Arrow]: https://arrow.apache.org
59//! [use cases]: https://datafusion.apache.org/user-guide/introduction.html#use-cases
60//! [SQL]: https://datafusion.apache.org/user-guide/sql/index.html
61//! [`DataFrame`]: dataframe::DataFrame
62//! [performance]: https://benchmark.clickhouse.com/
63//! [Python Bindings]: https://github.com/apache/datafusion-python
64//! [Architecture]: #architecture
65//!
66//! # Examples
67//!
68//! The main entry point for interacting with DataFusion is the
69//! [`SessionContext`]. [`Expr`]s represent expressions such as `a + b`.
70//!
71//! [`SessionContext`]: execution::context::SessionContext
72//!
73//! ## DataFrame
74//!
75//! To execute a query against data stored
76//! in a CSV file using a [`DataFrame`]:
77//!
78//! ```rust
79//! # use datafusion::prelude::*;
80//! # use datafusion::error::Result;
81//! # use datafusion::functions_aggregate::expr_fn::min;
82//! # use datafusion::arrow::array::RecordBatch;
83//!
84//! # #[tokio::main]
85//! # async fn main() -> Result<()> {
86//! let ctx = SessionContext::new();
87//!
88//! // create the dataframe
89//! let df = ctx
90//!     .read_csv("tests/data/example.csv", CsvReadOptions::new())
91//!     .await?;
92//!
93//! // create a plan
94//! let df = df
95//!     .filter(col("a").lt_eq(col("b")))?
96//!     .aggregate(vec![col("a")], vec![min(col("b"))])?
97//!     .limit(0, Some(100))?;
98//!
99//! // execute the plan
100//! let results: Vec<RecordBatch> = df.collect().await?;
101//!
102//! // format the results
103//! let pretty_results =
104//!     arrow::util::pretty::pretty_format_batches(&results)?.to_string();
105//!
106//! let expected = vec![
107//!     "+---+----------------+",
108//!     "| a | min(?table?.b) |",
109//!     "+---+----------------+",
110//!     "| 1 | 2              |",
111//!     "+---+----------------+",
112//! ];
113//!
114//! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected);
115//! # Ok(())
116//! # }
117//! ```
118//!
119//! ## SQL
120//!
121//! To execute a query against a CSV file using [SQL]:
122//!
123//! ```
124//! # use datafusion::prelude::*;
125//! # use datafusion::error::Result;
126//! # use datafusion::arrow::array::RecordBatch;
127//!
128//! # #[tokio::main]
129//! # async fn main() -> Result<()> {
130//! let ctx = SessionContext::new();
131//!
132//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new())
133//!     .await?;
134//!
135//! // create a plan
136//! let df = ctx
137//!     .sql("SELECT a, MIN(b) FROM example WHERE a <= b GROUP BY a LIMIT 100")
138//!     .await?;
139//!
140//! // execute the plan
141//! let results: Vec<RecordBatch> = df.collect().await?;
142//!
143//! // format the results
144//! let pretty_results =
145//!     arrow::util::pretty::pretty_format_batches(&results)?.to_string();
146//!
147//! let expected = vec![
148//!     "+---+----------------+",
149//!     "| a | min(example.b) |",
150//!     "+---+----------------+",
151//!     "| 1 | 2              |",
152//!     "+---+----------------+",
153//! ];
154//!
155//! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected);
156//! # Ok(())
157//! # }
158//! ```
159//!
160//! ## More Examples
161//!
162//! There are many additional annotated examples of using DataFusion in the [datafusion-examples] directory.
163//!
164//! [datafusion-examples]: https://github.com/apache/datafusion/tree/main/datafusion-examples
165//!
166//! # Architecture
167//!
168//! <!-- NOTE: The goal of this section is to provide a high level
169//! overview of how DataFusion is organized and then link to other
170//! sections of the docs with more details -->
171//!
172//! You can find a formal description of DataFusion's architecture in our
173//! [SIGMOD 2024 Paper].
174//!
175//! [SIGMOD 2024 Paper]: https://dl.acm.org/doi/10.1145/3626246.3653368
176//!
177//! ## Design Goals
178//! DataFusion's Architecture Goals are:
179//!
180//! 1. Work “out of the box”: Provide a very fast, world class query engine with
181//!    minimal setup or required configuration.
182//!
183//! 2. Customizable everything: All behavior should be customizable by
184//!    implementing traits.
185//!
186//! 3. Architecturally boring 🥱: Follow industrial best practice rather than
187//!    trying cutting edge, but unproven, techniques.
188//!
189//! With these principles, users start with a basic, high-performance engine
190//! and specialize it over time to suit their needs and available engineering
191//! capacity.
192//!
193//! ## Overview Presentations
194//!
195//! The following presentations offer high level overviews of the
196//! different components and how they interact together.
197//!
198//! - [Apr 2023]: The Apache DataFusion Architecture talks
199//!   - _Query Engine_: [recording](https://youtu.be/NVKujPxwSBA) and [slides](https://docs.google.com/presentation/d/1D3GDVas-8y0sA4c8EOgdCvEjVND4s2E7I6zfs67Y4j8/edit#slide=id.p)
200//!   - _Logical Plan and Expressions_: [recording](https://youtu.be/EzZTLiSJnhY) and [slides](https://docs.google.com/presentation/d/1ypylM3-w60kVDW7Q6S99AHzvlBgciTdjsAfqNP85K30)
201//!   - _Physical Plan and Execution_: [recording](https://youtu.be/2jkWU3_w6z0) and [slides](https://docs.google.com/presentation/d/1cA2WQJ2qg6tx6y4Wf8FH2WVSm9JQ5UgmBWATHdik0hg)
202//! - [July 2022]: DataFusion and Arrow: Supercharge Your Data Analytical Tool with a Rusty Query Engine: [recording](https://www.youtube.com/watch?v=Rii1VTn3seQ) and [slides](https://docs.google.com/presentation/d/1q1bPibvu64k2b7LPi7Yyb0k3gA1BiUYiUbEklqW1Ckc/view#slide=id.g11054eeab4c_0_1165)
203//! - [March 2021]: The DataFusion architecture is described in _Query Engine Design and the Rust-Based DataFusion in Apache Arrow_: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts [~ 15 minutes in](https://www.youtube.com/watch?v=K6eCAVEk4kU&t=875s)) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934)
204//! - [February 2021]: How DataFusion is used within the Ballista Project is described in _Ballista: Distributed Compute with Rust and Apache Arrow_: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ)
205//!
206//! ## Customization and Extension
207//!
208//! DataFusion is designed to be highly extensible, so you can
209//! start with a working, full featured engine, and then
210//! specialize any behavior for your use case. For example,
211//! some projects may add custom [`ExecutionPlan`] operators, or create their own
212//! query language that directly creates [`LogicalPlan`] rather than using the
213//! built in SQL planner, [`SqlToRel`].
214//!
215//! In order to achieve this, DataFusion supports extension at many points:
216//!
217//! * read from any datasource ([`TableProvider`])
218//! * define your own catalogs, schemas, and table lists ([`catalog`] and [`CatalogProvider`])
219//! * build your own query language or plans ([`LogicalPlanBuilder`])
220//! * declare and use user-defined functions ([`ScalarUDF`], [`AggregateUDF`], and [`WindowUDF`])
221//! * add custom plan rewrite passes ([`AnalyzerRule`], [`OptimizerRule`] and [`PhysicalOptimizerRule`])
222//! * extend the planner to use user-defined logical and physical nodes ([`QueryPlanner`])
223//!
224//! You can find examples of each of them in the [datafusion-examples] directory.
225//!
226//! [`TableProvider`]: crate::datasource::TableProvider
227//! [`CatalogProvider`]: crate::catalog::CatalogProvider
228//! [`LogicalPlanBuilder`]: datafusion_expr::logical_plan::builder::LogicalPlanBuilder
229//! [`ScalarUDF`]: crate::logical_expr::ScalarUDF
230//! [`AggregateUDF`]: crate::logical_expr::AggregateUDF
231//! [`WindowUDF`]: crate::logical_expr::WindowUDF
232//! [`QueryPlanner`]: execution::context::QueryPlanner
233//! [`OptimizerRule`]: datafusion_optimizer::optimizer::OptimizerRule
234//! [`AnalyzerRule`]:  datafusion_optimizer::analyzer::AnalyzerRule
235//! [`PhysicalOptimizerRule`]: datafusion_physical_optimizer::PhysicalOptimizerRule
236//!
237//! ## Query Planning and Execution Overview
238//!
239//! ### SQL
240//!
241//! ```text
242//!                 Parsed with            SqlToRel creates
243//!                 sqlparser              initial plan
244//! ┌───────────────┐           ┌─────────┐             ┌─────────────┐
245//! │   SELECT *    │           │Query {  │             │Project      │
246//! │   FROM ...    │──────────▶│..       │────────────▶│  TableScan  │
247//! │               │           │}        │             │    ...      │
248//! └───────────────┘           └─────────┘             └─────────────┘
249//!
250//!   SQL String                 sqlparser               LogicalPlan
251//!                              AST nodes
252//! ```
253//!
254//! 1. The query string is parsed to an Abstract Syntax Tree (AST)
255//!    [`Statement`] using [sqlparser].
256//!
257//! 2. The AST is converted to a [`LogicalPlan`] and logical expressions
258//!    [`Expr`]s to compute the desired result by [`SqlToRel`]. This phase
259//!    also includes name and type resolution ("binding").
260//!
261//! [`Statement`]: https://docs.rs/sqlparser/latest/sqlparser/ast/enum.Statement.html
262//!
263//! ### DataFrame
264//!
265//! When executing plans using the [`DataFrame`] API, the process is
266//! the same as with SQL, except the DataFrame API builds the
267//! [`LogicalPlan`] directly using [`LogicalPlanBuilder`]. Systems
268//! that have their own custom query languages typically also build
269//! [`LogicalPlan`] directly.
270//!
271//! ### Planning
272//!
273//! ```text
274//!             AnalyzerRules and      PhysicalPlanner          PhysicalOptimizerRules
275//!             OptimizerRules         creates ExecutionPlan    improve performance
276//!             rewrite plan
277//! ┌─────────────┐        ┌─────────────┐      ┌─────────────────┐        ┌─────────────────┐
278//! │Project      │        │Project(x, y)│      │ProjectExec      │        │ProjectExec      │
279//! │  TableScan  │──...──▶│  TableScan  │─────▶│  ...            │──...──▶│  ...            │
280//! │    ...      │        │    ...      │      │   DataSourceExec│        │   DataSourceExec│
281//! └─────────────┘        └─────────────┘      └─────────────────┘        └─────────────────┘
282//!
283//!  LogicalPlan            LogicalPlan         ExecutionPlan             ExecutionPlan
284//! ```
285//!
286//! To process large datasets with many rows as efficiently as
287//! possible, significant effort is spent planning and
288//! optimizing, in the following manner:
289//!
290//! 1. The [`LogicalPlan`] is checked and rewritten to enforce
291//!    semantic rules, such as type coercion, by [`AnalyzerRule`]s
292//!
293//! 2. The [`LogicalPlan`] is rewritten by [`OptimizerRule`]s, such as
294//!    projection and filter pushdown, to improve its efficiency.
295//!
296//! 3. The [`LogicalPlan`] is converted to an [`ExecutionPlan`] by a
297//!    [`PhysicalPlanner`]
298//!
299//! 4. The [`ExecutionPlan`] is rewritten by
300//!    [`PhysicalOptimizerRule`]s, such as sort and join selection, to
301//!    improve its efficiency.
302//!
303//! ## Data Sources
304//!
305//! ```text
306//! Planning       │
307//! requests       │            TableProvider::scan
308//! information    │            creates an
309//! such as schema │            ExecutionPlan
310//!                │
311//!                ▼
312//!   ┌─────────────────────────┐         ┌───────────────┐
313//!   │                         │         │               │
314//!   │impl TableProvider       │────────▶│DataSourceExec │
315//!   │                         │         │               │
316//!   └─────────────────────────┘         └───────────────┘
317//!         TableProvider
318//!         (built in or user provided)    ExecutionPlan
319//! ```
320//!
321//! A [`TableProvider`] provides information for planning and
322//! an [`ExecutionPlan`] for execution. DataFusion includes two built-in
323//! table providers that support common file formats and require no runtime services,
324//! [`ListingTable`] and [`MemTable`]. You can add support for any other data
325//! source and/or file formats by implementing the [`TableProvider`] trait.
326//!
327//! See also:
328//!
329//! 1. [`ListingTable`]: Reads data from one or more Parquet, JSON, CSV, or AVRO
330//!    files in one or more local or remote directories. Supports HIVE style
331//!    partitioning, optional compression, directly reading from remote
332//!    object store, file metadata caching, and more.
333//!
334//! 2. [`MemTable`]: Reads data from in memory [`RecordBatch`]es.
335//!
336//! 3. [`StreamingTable`]: Reads data from potentially unbounded inputs.
337//!
338//! [`ListingTable`]: crate::datasource::listing::ListingTable
339//! [`MemTable`]: crate::datasource::memory::MemTable
340//! [`StreamingTable`]: crate::catalog::streaming::StreamingTable
341//!
342//! ## Plan Representations
343//!
344//! ### Logical Plans
345//! Logical planning yields [`LogicalPlan`] nodes and [`Expr`]
346//! representing expressions which are [`Schema`] aware and represent statements
347//! independent of how they are physically executed.
348//! A [`LogicalPlan`] is a Directed Acyclic Graph (DAG) of other
349//! [`LogicalPlan`]s, each potentially containing embedded [`Expr`]s.
350//!
351//! [`LogicalPlan`]s can be rewritten with [`TreeNode`] API, see the
352//! [`tree_node module`] for more details.
353//!
354//! [`Expr`]s can also be rewritten with [`TreeNode`] API and simplified using
355//! [`ExprSimplifier`]. Examples of working with and executing [`Expr`]s can be
356//! found in the [`expr_api`.rs] example
357//!
358//! [`TreeNode`]: datafusion_common::tree_node::TreeNode
359//! [`tree_node module`]: datafusion_expr::logical_plan::tree_node
360//! [`ExprSimplifier`]: crate::optimizer::simplify_expressions::ExprSimplifier
361//! [`expr_api`.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs
362//!
363//! ### Physical Plans
364//!
365//! An [`ExecutionPlan`] (sometimes referred to as a "physical plan")
366//! is a plan that can be executed against data. It is a DAG of other
367//! [`ExecutionPlan`]s each potentially containing expressions that implement the
368//! [`PhysicalExpr`] trait.
369//!
370//! Compared to a [`LogicalPlan`], an [`ExecutionPlan`] has additional concrete
371//! information about how to perform calculations (e.g. hash vs merge
372//! join), and how data flows during execution (e.g. partitioning and
373//! sortedness).
374//!
375//! [cp_solver] performs range propagation analysis on [`PhysicalExpr`]s and
376//! [`PruningPredicate`] can prove certain boolean [`PhysicalExpr`]s used for
377//! filtering can never be `true` using additional statistical information.
378//!
379//! [cp_solver]: crate::physical_expr::intervals::cp_solver
380//! [`PruningPredicate`]: datafusion_physical_optimizer::pruning::PruningPredicate
381//! [`PhysicalExpr`]: crate::physical_plan::PhysicalExpr
382//!
383//! ## Execution
384//!
385//! ```text
386//!            ExecutionPlan::execute             Calling next() on the
387//!            produces a stream                  stream produces the data
388//!
389//! ┌────────────────┐      ┌─────────────────────────┐         ┌────────────┐
390//! │ProjectExec     │      │impl                     │    ┌───▶│RecordBatch │
391//! │  ...           │─────▶│SendableRecordBatchStream│────┤    └────────────┘
392//! │  DataSourceExec│      │                         │    │    ┌────────────┐
393//! └────────────────┘      └─────────────────────────┘    ├───▶│RecordBatch │
394//!               ▲                                        │    └────────────┘
395//! ExecutionPlan │                                        │         ...
396//!               │                                        │
397//!               │                                        │    ┌────────────┐
398//!             PhysicalOptimizerRules                     ├───▶│RecordBatch │
399//!             request information                        │    └────────────┘
400//!             such as partitioning                       │    ┌ ─ ─ ─ ─ ─ ─
401//!                                                        └───▶ None        │
402//!                                                             └ ─ ─ ─ ─ ─ ─
403//! ```
404//!
405//! [`ExecutionPlan`]s process data using the [Apache Arrow] memory
406//! format, making heavy use of functions from the [arrow]
407//! crate. Values are represented with [`ColumnarValue`], which are either
408//! [`ScalarValue`] (single constant values) or [`ArrayRef`] (Arrow
409//! Arrays).
410//!
411//! Calling [`execute`] produces 1 or more partitions of data,
412//! as a [`SendableRecordBatchStream`], which implements a pull based execution
413//! API. Calling [`next()`]`.await` will incrementally compute and return the next
414//! [`RecordBatch`]. Balanced parallelism is achieved using [Volcano style]
415//! "Exchange" operations implemented by [`RepartitionExec`].
416//!
417//! While some recent research such as [Morsel-Driven Parallelism] describes challenges
418//! with the pull style Volcano execution model on NUMA architectures, in practice DataFusion achieves
419//! scalability similar to systems that use push driven schedulers [such as DuckDB].
420//! See the [DataFusion paper in SIGMOD 2024] for more details.
421//!
422//! [`execute`]: physical_plan::ExecutionPlan::execute
423//! [`SendableRecordBatchStream`]: crate::physical_plan::SendableRecordBatchStream
424//! [`ColumnarValue`]: datafusion_expr::ColumnarValue
425//! [`ScalarValue`]: crate::scalar::ScalarValue
426//! [`ArrayRef`]: arrow::array::ArrayRef
427//! [`Stream`]: futures::stream::Stream
428//!
429//! See the [implementors of `ExecutionPlan`] for a list of physical operators available.
430//!
431//! [`RepartitionExec`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/repartition/struct.RepartitionExec.html
432//! [Volcano style]: https://doi.org/10.1145/93605.98720
433//! [Morsel-Driven Parallelism]: https://db.in.tum.de/~leis/papers/morsels.pdf
434//! [DataFusion paper in SIGMOD 2024]: https://github.com/apache/datafusion/files/15149988/DataFusion_Query_Engine___SIGMOD_2024-FINAL-mk4.pdf
435//! [such as DuckDB]: https://github.com/duckdb/duckdb/issues/1583
436//! [implementors of `ExecutionPlan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#implementors
437//!
438//! ## Streaming Execution
439//!
440//! DataFusion is a "streaming" query engine which means [`ExecutionPlan`]s incrementally
441//! read from their input(s) and compute output one [`RecordBatch`] at a time
442//! by continually polling [`SendableRecordBatchStream`]s. Output and
443//! intermediate [`RecordBatch`]s each have approximately `batch_size` rows,
444//! which amortizes per-batch overhead of execution.
445//!
446//! Note that certain operations, sometimes called "pipeline breakers",
447//! (for example full sorts or hash aggregations) are fundamentally non streaming and
448//! must read their input fully before producing **any** output. As much as possible,
449//! other operators read a single [`RecordBatch`] from their input to produce a
450//! single [`RecordBatch`] as output.
451//!
452//! For example, given this SQL:
453//!
454//! ```sql
455//! SELECT name FROM 'data.parquet' WHERE id > 10
456//! ```
457//!
458//! A simplified DataFusion execution plan is shown below. It first reads
459//! data from the Parquet file, then applies the filter, then the projection,
460//! and finally produces output. Each step processes one [`RecordBatch`] at a
461//! time. Multiple batches are processed concurrently on different CPU cores
462//! for plans with multiple partitions.
463//!
464//! ```text
465//! ┌─────────────┐    ┌──────────────┐    ┌────────────────┐    ┌──────────────────┐    ┌──────────┐
466//! │ Parquet     │───▶│ DataSource   │───▶│ FilterExec     │───▶│ ProjectionExec   │───▶│ Results  │
467//! │ File        │    │              │    │                │    │                  │    │          │
468//! └─────────────┘    └──────────────┘    └────────────────┘    └──────────────────┘    └──────────┘
469//!                    (reads data)        (id > 10)             (keeps "name" col)
470//!                    RecordBatch ───▶    RecordBatch ────▶     RecordBatch ────▶        RecordBatch
471//! ```
472//!
473//! DataFusion uses the classic "pull" based control flow (explained more in the
474//! next section) to implement streaming execution. As an example,
475//! consider the following SQL query:
476//!
477//! ```sql
478//! SELECT date_trunc('month', time) FROM data WHERE id IN (10,20,30);
479//! ```
480//!
481//! The diagram below shows the call sequence when a consumer calls [`next()`] to
482//! get the next [`RecordBatch`] of output. While it is possible that some
483//! steps run on different threads, typically tokio will use the same thread
484//! that called [`next()`] to read from the input, apply the filter, and
485//! return the results without interleaving any other operations. This results
486//! in excellent cache locality as the same CPU core that produces the data often
487//! consumes it immediately as well.
488//!
489//! ```text
490//!
491//! Step 3: FilterExec calls next()       Step 2: ProjectionExec calls
492//!         on input Stream                  next() on input Stream
493//!         β”Œ ─ ─ ─ ─ ─ ─ ─ ─ ─      β”Œ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐
494//!                            β”‚                                               Step 1: Consumer
495//!         β–Ό                        β–Ό                           β”‚               calls next()
496//! ┏━━━━━━━━━━━━━━━━┓     ┏━━━━━┻━━━━━━━━━━━━━┓      ┏━━━━━━━━━━━━━━━━━━━━━━━━┓
497//! ┃                ┃     ┃                   ┃      ┃                        β—€ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
498//! ┃  DataSource    ┃     ┃                   ┃      ┃                        ┃
499//! ┃    (e.g.       ┃     ┃    FilterExec     ┃      ┃     ProjectionExec     ┃
500//! ┃ ParquetSource) ┃     ┃id IN (10, 20, 30) ┃      ┃date_trunc('month',time)┃
501//! ┃                ┃     ┃                   ┃      ┃                        ┣ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ β–Ά
502//! ┃                ┃     ┃                   ┃      ┃                        ┃
503//! ┗━━━━━━━━━━━━━━━━┛     ┗━━━━━━━━━━━┳━━━━━━━┛      ┗━━━━━━━━━━━━━━━━━━━━━━━━┛
504//!         β”‚                  β–²                                 β–²          Step 6: ProjectionExec
505//!                            β”‚     β”‚                           β”‚        computes date_trunc into a
506//!         β”” ─ ─ ─ ─ ─ ─ ─ ─ ─       ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─          new RecordBatch returned
507//!              β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”                β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”          from client
508//!              β”‚     RecordBatch     β”‚                β”‚ RecordBatch β”‚
509//!              β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜                β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
510//!
511//!           Step 4: DataSource returns a        Step 5: FilterExec returns a new
512//!                single RecordBatch            RecordBatch with only matching rows
513//! ```
514//!
515//! [`next()`]: futures::StreamExt::next
516//!
517//! ## Thread Scheduling, CPU / IO Thread Pools, and [Tokio] [`Runtime`]s
518//!
519//! DataFusion automatically runs each plan with multiple CPU cores using
520//! a [Tokio] [`Runtime`] as a thread pool. While tokio is most commonly used
521//! for asynchronous network I/O, the combination of an efficient, work-stealing
522//! scheduler, and first class compiler support for automatic continuation
523//! generation (`async`) also makes it a compelling choice for CPU intensive
524//! applications as explained in the [Using Rustlang’s Async Tokio
525//! Runtime for CPU-Bound Tasks] blog.
526//!
527//! The number of cores used is determined by the `target_partitions`
528//! configuration setting, which defaults to the number of CPU cores.
529//! While preparing for execution, DataFusion tries to create this many distinct
530//! `async` [`Stream`]s for each [`ExecutionPlan`].
531//! The [`Stream`]s for certain [`ExecutionPlan`]s, such as [`RepartitionExec`]
532//! and [`CoalescePartitionsExec`], spawn [Tokio] [`task`]s, that run on
533//! threads managed by the [`Runtime`].
534//! Many DataFusion [`Stream`]s perform CPU intensive processing.
535//!
536//! ### Cooperative Scheduling
537//!
538//! DataFusion uses cooperative scheduling, which means that each [`Stream`]
539//! is responsible for yielding control back to the [`Runtime`] after
540//! some amount of work is done. Please see the [`coop`] module documentation
541//! for more details.
542//!
543//! [`coop`]: datafusion_physical_plan::coop
544//!
545//! ### Network I/O and CPU intensive tasks
546//!
547//! Using `async` for CPU intensive tasks makes it easy for [`TableProvider`]s
548//! to perform network I/O using standard Rust `async` during execution.
549//! However, this design also makes it very easy to mix CPU intensive and latency
550//! sensitive I/O work on the same thread pool ([`Runtime`]).
551//! Using the same (default) [`Runtime`] is convenient, and often works well for
552//! initial development and processing local files, but it can lead to problems
553//! under load and/or when reading from network sources such as AWS S3.
554//!
555//! ### Optimizing Latency: Throttled CPU / IO under Highly Concurrent Load
556//!
557//! If your system does not fully utilize either the CPU or network bandwidth
558//! during execution, or you see significantly higher tail (e.g. p99) latencies
559//! responding to network requests, **it is likely you need to use a different
560//! [`Runtime`] for DataFusion plans**. The [thread_pools example]
561//! has an example of how to do so.
562//!
563//! As shown below, using the same [`Runtime`] for both CPU intensive processing
564//! and network requests can introduce significant delays in responding to
565//! those network requests. Delays in processing network requests can and do
566//! lead network flow control to throttle the available bandwidth in response.
567//! This effect can be especially pronounced when running multiple queries
568//! concurrently.
569//!
570//! ```text
571//!                                                                          Legend
572//!
573//!                                                                          ┏━━━━━━┓
574//!                            Processing network request                    ┃      ┃  CPU bound work
575//!                            is delayed due to processing                  ┗━━━━━━┛
//!                            CPU bound work                                ┌─┐
//!                                                                          │ │       Network request
//!                                         ││                               └─┘       processing
//!
//!                                         ││
//!                                ─ ─ ─ ─ ─  ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
//!                               │                                            │
//!
//!                               ▼                                            ▼
//! ┌─────────────┐           ┌─┐┌─┐┏━━━━━━━━━━━━━━━━━━━┓┏━━━━━━━━━━━━━━━━━━━┓┌─┐
//! │             │thread 1   │ ││ │┃     Decoding      ┃┃     Filtering     ┃│ │
//! │             │           └─┘└─┘┗━━━━━━━━━━━━━━━━━━━┛┗━━━━━━━━━━━━━━━━━━━┛└─┘
//! │             │           ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
//! │Tokio Runtime│thread 2   ┃   Decoding   ┃     Filtering     ┃   Decoding   ┃       ...
//! │(thread pool)│           ┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛
//! │             │     ...                               ...
//! │             │           ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓┌─┐ ┏━━━━━━━━━━━━━━┓
//! │             │thread N   ┃     Decoding      ┃     Filtering     ┃│ │ ┃   Decoding   ┃
//! └─────────────┘           ┗━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┛└─┘ ┗━━━━━━━━━━━━━━┛
//!                           ─────────────────────────────────────────────────────────────▶
596//!                                                                                           time
597//! ```
598//!
599//! The bottleneck resulting from network throttling can be avoided
600//! by using separate [`Runtime`]s for the different types of work, as shown
601//! in the diagram below.
602//!
603//! ```text
604//!                    A separate thread pool processes network       Legend
605//!                    requests, reducing the latency for
606//!                    processing each request                        ┏━━━━━━┓
607//!                                                                   ┃      ┃  CPU bound work
//!                                         │                         ┗━━━━━━┛
//!                                          │                        ┌─┐
//!                               ┌ ─ ─ ─ ─ ┘                         │ │       Network request
//!                                  ┌ ─ ─ ─ ┘                        └─┘       processing
//!                               │
//!                               ▼  ▼
//! ┌─────────────┐           ┌─┐┌─┐┌─┐
//! │             │thread 1   │ ││ ││ │
//! │             │           └─┘└─┘└─┘
//! │Tokio Runtime│                                          ...
//! │(thread pool)│thread 2
//! │             │
//! │"IO Runtime" │     ...
//! │             │                                                   ┌─┐
//! │             │thread N                                           │ │
//! └─────────────┘                                                   └─┘
//!                           ─────────────────────────────────────────────────────────────▶
625//!                                                                                           time
626//!
//! ┌─────────────┐           ┏━━━━━━━━━━━━━━━━━━━┓┏━━━━━━━━━━━━━━━━━━━┓
//! │             │thread 1   ┃     Decoding      ┃┃     Filtering     ┃
//! │             │           ┗━━━━━━━━━━━━━━━━━━━┛┗━━━━━━━━━━━━━━━━━━━┛
//! │Tokio Runtime│           ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
//! │(thread pool)│thread 2   ┃   Decoding   ┃     Filtering     ┃   Decoding   ┃       ...
//! │             │           ┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛
//! │ CPU Runtime │     ...                               ...
//! │             │           ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
//! │             │thread N   ┃     Decoding      ┃     Filtering     ┃   Decoding   ┃
//! └─────────────┘           ┗━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛
//!                          ─────────────────────────────────────────────────────────────▶
638//!                                                                                           time
639//! ```
640//!
//! Note that DataFusion does not use [`tokio::task::spawn_blocking`] for
//! CPU-bound work, because `spawn_blocking` is designed for blocking **IO**,
//! not for CPU bound tasks. Among other challenges, spawned blocking
644//! tasks can't yield waiting for input (can't call `await`) so they
645//! can't be used to limit the number of concurrent CPU bound tasks or
646//! keep the processing pipeline to the same core.
647//!
648//! [Tokio]:  https://tokio.rs
649//! [`Runtime`]: tokio::runtime::Runtime
650//! [thread_pools example]: https://github.com/apache/datafusion/tree/main/datafusion-examples/examples/thread_pools.rs
651//! [`task`]: tokio::task
//! [Using Rustlang’s Async Tokio Runtime for CPU-Bound Tasks]: https://thenewstack.io/using-rustlangs-async-tokio-runtime-for-cpu-bound-tasks/
653//! [`RepartitionExec`]: physical_plan::repartition::RepartitionExec
654//! [`CoalescePartitionsExec`]: physical_plan::coalesce_partitions::CoalescePartitionsExec
655//!
656//! ## State Management and Configuration
657//!
//! [`ConfigOptions`] contains options to control DataFusion's
659//! execution.
660//!
661//! [`ConfigOptions`]: datafusion_common::config::ConfigOptions
662//!
663//! The state required to execute queries is managed by the following
664//! structures:
665//!
666//! 1. [`SessionContext`]: State needed to create [`LogicalPlan`]s such
667//!    as the table definitions and the function registries.
668//!
669//! 2. [`TaskContext`]: State needed for execution such as the
670//!    [`MemoryPool`], [`DiskManager`], and [`ObjectStoreRegistry`].
671//!
672//! 3. [`ExecutionProps`]: Per-execution properties and data (such as
673//!    starting timestamps, etc).
674//!
675//! [`SessionContext`]: crate::execution::context::SessionContext
676//! [`TaskContext`]: crate::execution::context::TaskContext
677//! [`ExecutionProps`]: crate::execution::context::ExecutionProps
678//!
679//! ### Resource Management
680//!
681//! The amount of memory and temporary local disk space used by
682//! DataFusion when running a plan can be controlled using the
683//! [`MemoryPool`] and [`DiskManager`]. Other runtime options can be
684//! found on [`RuntimeEnv`].
685//!
686//! [`DiskManager`]: crate::execution::DiskManager
687//! [`MemoryPool`]: crate::execution::memory_pool::MemoryPool
688//! [`RuntimeEnv`]: crate::execution::runtime_env::RuntimeEnv
689//! [`ObjectStoreRegistry`]: crate::datasource::object_store::ObjectStoreRegistry
690//!
691//! ## Crate Organization
692//!
693//! Most users interact with DataFusion via this crate (`datafusion`), which re-exports
694//! all functionality needed to build and execute queries.
695//!
696//! There are three other crates that provide additional functionality that
697//! must be used directly:
698//! * [`datafusion_proto`]: Plan serialization and deserialization
699//! * [`datafusion_substrait`]: Support for the substrait plan serialization format
//! * [`datafusion_sqllogictest`]: The DataFusion SQL logic test runner
701//!
702//! [`datafusion_proto`]: https://crates.io/crates/datafusion-proto
703//! [`datafusion_substrait`]: https://crates.io/crates/datafusion-substrait
704//! [`datafusion_sqllogictest`]: https://crates.io/crates/datafusion-sqllogictest
705//!
706//! DataFusion is internally split into multiple sub crates to
707//! enforce modularity and improve compilation times. See the
708//! [list of modules](#modules) for all available sub-crates. Major ones are
709//!
710//! * [datafusion_common]: Common traits and types
711//! * [datafusion_catalog]: Catalog APIs such as [`SchemaProvider`] and [`CatalogProvider`]
712//! * [datafusion_datasource]: File and Data IO such as [`FileSource`] and [`DataSink`]
713//! * [datafusion_session]: [`Session`] and related structures
714//! * [datafusion_execution]: State and structures needed for execution
//! * [datafusion_expr]: [`LogicalPlan`], [`Expr`] and related logical planning structures
716//! * [datafusion_functions]: Scalar function packages
717//! * [datafusion_functions_aggregate]: Aggregate functions such as `MIN`, `MAX`, `SUM`, etc
718//! * [datafusion_functions_nested]: Scalar function packages for `ARRAY`s, `MAP`s and `STRUCT`s
719//! * [datafusion_functions_table]: Table Functions such as `GENERATE_SERIES`
720//! * [datafusion_functions_window]: Window functions such as `ROW_NUMBER`, `RANK`, etc
721//! * [datafusion_optimizer]: [`OptimizerRule`]s and [`AnalyzerRule`]s
722//! * [datafusion_physical_expr]: [`PhysicalExpr`] and related expressions
723//! * [datafusion_physical_plan]: [`ExecutionPlan`] and related expressions
//! * [datafusion_physical_optimizer]: [`PhysicalOptimizerRule`]s and related optimizations
725//! * [datafusion_sql]: SQL planner ([`SqlToRel`])
726//!
727//! [`SchemaProvider`]: datafusion_catalog::SchemaProvider
728//! [`CatalogProvider`]: datafusion_catalog::CatalogProvider
729//! [`Session`]: datafusion_session::Session
730//! [`FileSource`]: datafusion_datasource::file::FileSource
731//! [`DataSink`]: datafusion_datasource::sink::DataSink
732//!
733//! ## Citing DataFusion in Academic Papers
734//!
735//! You can use the following citation to reference DataFusion in academic papers:
736//!
737//! ```text
//! @inproceedings{lamb2024apache,
739//!   title={Apache Arrow DataFusion: A Fast, Embeddable, Modular Analytic Query Engine},
740//!   author={Lamb, Andrew and Shen, Yijie and Heres, Dani{\"e}l and Chakraborty, Jayjeet and Kabak, Mehmet Ozan and Hsieh, Liang-Chi and Sun, Chao},
741//!   booktitle={Companion of the 2024 International Conference on Management of Data},
742//!   pages={5--17},
743//!   year={2024}
744//! }
745//! ```
746//!
747//! [sqlparser]: https://docs.rs/sqlparser/latest/sqlparser
748//! [`SqlToRel`]: sql::planner::SqlToRel
749//! [`Expr`]: datafusion_expr::Expr
750//! [`LogicalPlan`]: datafusion_expr::LogicalPlan
751//! [`AnalyzerRule`]: datafusion_optimizer::analyzer::AnalyzerRule
752//! [`OptimizerRule`]: optimizer::optimizer::OptimizerRule
753//! [`ExecutionPlan`]: physical_plan::ExecutionPlan
754//! [`PhysicalPlanner`]: physical_planner::PhysicalPlanner
755//! [`PhysicalOptimizerRule`]: datafusion_physical_optimizer::PhysicalOptimizerRule
756//! [`Schema`]: arrow::datatypes::Schema
757//! [`PhysicalExpr`]: physical_plan::PhysicalExpr
758//! [`RecordBatch`]: arrow::array::RecordBatch
759//! [`RecordBatchReader`]: arrow::record_batch::RecordBatchReader
760//! [`Array`]: arrow::array::Array
761
/// DataFusion crate version.
///
/// The value is taken at compile time from the `CARGO_PKG_VERSION`
/// environment variable via `env!`.
pub const DATAFUSION_VERSION: &str = env!("CARGO_PKG_VERSION");
764
765extern crate core;
766
767#[cfg(feature = "sql")]
768extern crate sqlparser;
769
770pub mod dataframe;
771pub mod datasource;
772pub mod error;
773pub mod execution;
774pub mod physical_planner;
775pub mod prelude;
776pub mod scalar;
777
778// Re-export dependencies that are part of DataFusion public API (e.g. via DataFusionError)
779pub use arrow;
780pub use object_store;
781
782#[cfg(feature = "parquet")]
783pub use parquet;
784
785#[cfg(feature = "avro")]
786pub use datafusion_datasource_avro::apache_avro;
787
788// re-export DataFusion sub-crates at the top level. Use `pub use *`
789// so that the contents of the subcrates appears in rustdocs
790// for details, see https://github.com/apache/datafusion/issues/6648
791
/// Re-export of the [`datafusion_common`] crate.
///
/// Also exposes the `datafusion_common_runtime` crate as the nested
/// `runtime` module.
pub mod common {
    // Glob re-export so the sub-crate's items appear in this crate's rustdocs.
    pub use datafusion_common::*;

    /// Re-export of the [`datafusion_common_runtime`] crate.
    pub mod runtime {
        pub use datafusion_common_runtime::*;
    }
}
801
802// Backwards compatibility
803pub use common::config;
804
805// NB datafusion execution is re-exported in the `execution` module
806
/// Re-export of the [`datafusion_catalog`] crate.
///
/// All public items are glob re-exported (`pub use *`) so that they appear
/// in the rustdocs of this crate.
pub mod catalog {
    pub use datafusion_catalog::*;
}
811
/// Re-export of the [`datafusion_expr`] crate.
///
/// Note that this module is named `logical_expr`, while the crate it
/// re-exports is `datafusion_expr`.
pub mod logical_expr {
    pub use datafusion_expr::*;
}
816
/// Re-export of the [`datafusion_expr_common`] crate.
///
/// Note that this module is named `logical_expr_common`, while the crate it
/// re-exports is `datafusion_expr_common`.
pub mod logical_expr_common {
    pub use datafusion_expr_common::*;
}
821
/// Re-export of the [`datafusion_optimizer`] crate.
pub mod optimizer {
    // Glob re-export so the sub-crate's items appear in this crate's rustdocs.
    pub use datafusion_optimizer::*;
}
826
/// Re-export of the [`datafusion_physical_optimizer`] crate.
pub mod physical_optimizer {
    // Glob re-export so the sub-crate's items appear in this crate's rustdocs.
    pub use datafusion_physical_optimizer::*;
}
831
/// Re-export of the [`datafusion_physical_expr_common`] crate.
pub mod physical_expr_common {
    // Glob re-export so the sub-crate's items appear in this crate's rustdocs.
    pub use datafusion_physical_expr_common::*;
}
836
/// Re-export of the [`datafusion_physical_expr`] crate.
pub mod physical_expr {
    // Glob re-export so the sub-crate's items appear in this crate's rustdocs.
    pub use datafusion_physical_expr::*;
}
841
/// Re-export of the [`datafusion_physical_expr_adapter`] crate.
pub mod physical_expr_adapter {
    // Glob re-export so the sub-crate's items appear in this crate's rustdocs.
    pub use datafusion_physical_expr_adapter::*;
}
846
/// Re-export of the [`datafusion_physical_plan`] crate.
pub mod physical_plan {
    // Glob re-export so the sub-crate's items appear in this crate's rustdocs.
    pub use datafusion_physical_plan::*;
}
851
852// Reexport testing macros for compatibility
853pub use datafusion_common::assert_batches_eq;
854pub use datafusion_common::assert_batches_sorted_eq;
855
/// Re-export of the [`datafusion_sql`] crate.
///
/// This module only exists when the `sql` feature is enabled.
#[cfg(feature = "sql")]
pub mod sql {
    pub use datafusion_sql::*;
}
861
/// Re-export of the [`datafusion_functions`] crate.
pub mod functions {
    // Glob re-export so the sub-crate's items appear in this crate's rustdocs.
    pub use datafusion_functions::*;
}
866
/// Re-export of the [`datafusion_functions_nested`] crate, if the
/// "nested_expressions" feature is enabled.
///
/// The `functions_nested` module itself is always declared; only the glob
/// re-export inside it is feature-gated, so the module is empty when the
/// feature is disabled.
pub mod functions_nested {
    #[cfg(feature = "nested_expressions")]
    pub use datafusion_functions_nested::*;
}
872
/// Re-export of the [`datafusion_functions_aggregate`] crate.
pub mod functions_aggregate {
    // Glob re-export so the sub-crate's items appear in this crate's rustdocs.
    pub use datafusion_functions_aggregate::*;
}
877
/// Re-export of the [`datafusion_functions_window`] crate.
pub mod functions_window {
    // Glob re-export so the sub-crate's items appear in this crate's rustdocs.
    pub use datafusion_functions_window::*;
}
882
/// Re-export of the [`datafusion_functions_table`] crate.
pub mod functions_table {
    // Glob re-export so the sub-crate's items appear in this crate's rustdocs.
    pub use datafusion_functions_table::*;
}
887
/// Re-export of the variable provider for `@name` and `@@name` style runtime values.
///
/// Unlike the glob re-exports of whole sub-crates, only `VarProvider` and
/// `VarType` from `datafusion_expr::var_provider` are re-exported here.
pub mod variable {
    pub use datafusion_expr::var_provider::{VarProvider, VarType};
}
892
893#[cfg(not(target_arch = "wasm32"))]
894pub mod test;
895
896mod schema_equivalence;
897pub mod test_util;
898
// Run the code examples in the repository README as doctests. The item is
// gated on `cfg(doctest)` and registered under the name `readme_example_test`.
#[cfg(doctest)]
doc_comment::doctest!("../../../README.md", readme_example_test);
901
902// Instructions for Documentation Examples
903//
904// The following commands test the examples from the user guide as part of
905// `cargo test --doc`
906//
907// # Adding new tests:
908//
909// Simply add code like this to your .md file and ensure your md file is
910// included in the lists below.
911//
912// ```rust
913// <code here will be tested>
914// ```
915//
916// Note that sometimes it helps to author the doctest as a standalone program
917// first, and then copy it into the user guide.
918//
919// # Debugging Test Failures
920//
921// Unfortunately, the line numbers reported by doctest do not correspond to the
// line numbers in the .md files. Thus, if a doctest fails, use the name of
923// the test to find the relevant file in the list below, and then find the
924// example in that file to fix.
925//
926// For example, if `user_guide_expressions(line 123)` fails,
927// go to `docs/source/user-guide/expressions.md` to find the relevant problem.
928//
// Doctests for the top-level user guide pages (`docs/source/user-guide/*.md`).
// Each invocation pairs a markdown file with the test name that doctest
// failures are reported under.
#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/arrow-introduction.md",
    user_guide_arrow_introduction
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/concepts-readings-events.md",
    user_guide_concepts_readings_events
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/configs.md",
    user_guide_configs
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/crate-configuration.md",
    user_guide_crate_configuration
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/dataframe.md",
    user_guide_dataframe
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/example-usage.md",
    user_guide_example_usage
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/explain-usage.md",
    user_guide_explain_usage
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/expressions.md",
    user_guide_expressions
);

#[cfg(doctest)]
doc_comment::doctest!("../../../docs/source/user-guide/faq.md", user_guide_faq);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/introduction.md",
    user_guide_introduction
);
985
// Doctests for the CLI pages of the user guide (`docs/source/user-guide/cli/*.md`).
#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/cli/datasources.md",
    user_guide_cli_datasource
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/cli/installation.md",
    user_guide_cli_installation
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/cli/overview.md",
    user_guide_cli_overview
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/cli/usage.md",
    user_guide_cli_usage
);
1009
// Doctests for the "features" page of the user guide.
#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/features.md",
    user_guide_features
);
1015
// Doctests for the SQL reference pages of the user guide
// (`docs/source/user-guide/sql/*.md`).
#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/aggregate_functions.md",
    user_guide_sql_aggregate_functions
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/data_types.md",
    user_guide_sql_data_types
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/ddl.md",
    user_guide_sql_ddl
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/dml.md",
    user_guide_sql_dml
);
1039
// Doctests for the `explain.md` page of the SQL reference in the user guide.
// The test name mirrors the file name so that a failing doctest can be traced
// back to `docs/source/user-guide/sql/explain.md`.
#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/explain.md",
    user_guide_sql_explain
);
1045
// Doctests for further SQL reference pages of the user guide
// (`docs/source/user-guide/sql/*.md`).
#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/information_schema.md",
    user_guide_sql_information_schema
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/operators.md",
    user_guide_sql_operators
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/prepared_statements.md",
    user_guide_prepared_statements
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/scalar_functions.md",
    user_guide_sql_scalar_functions
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/select.md",
    user_guide_sql_select
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/special_functions.md",
    user_guide_sql_special_functions
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/subqueries.md",
    user_guide_sql_subqueries
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/window_functions.md",
    user_guide_sql_window_functions
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/format_options.md",
    user_guide_sql_format_options
);
1099
// Doctests for the library user guide (`docs/source/library-user-guide/`),
// including its `functions/` sub-directory.
#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/functions/adding-udfs.md",
    library_user_guide_functions_adding_udfs
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/functions/spark.md",
    library_user_guide_functions_spark
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/building-logical-plans.md",
    library_user_guide_building_logical_plans
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/catalogs.md",
    library_user_guide_catalogs
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/custom-table-providers.md",
    library_user_guide_custom_table_providers
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/extending-operators.md",
    library_user_guide_extending_operators
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/extensions.md",
    library_user_guide_extensions
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/index.md",
    library_user_guide_index
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/profiling.md",
    library_user_guide_profiling
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/query-optimizer.md",
    library_user_guide_query_optimizer
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/using-the-dataframe-api.md",
    library_user_guide_dataframe_api
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/using-the-sql-api.md",
    library_user_guide_sql_api
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/working-with-exprs.md",
    library_user_guide_working_with_exprs
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/upgrading.md",
    library_user_guide_upgrading
);
1183
// Doctests for the contributor guide (`docs/source/contributor-guide/`).
#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/contributor-guide/api-health.md",
    contributor_guide_api_health
);