
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

use std::any::Any;
use std::fmt::Debug;
use std::sync::Arc;

pub use crate::schema::SchemaProvider;
use datafusion_common::not_impl_err;
use datafusion_common::Result;

/// Represents a catalog, comprising a number of named schemas.
///
/// # Catalog Overview
///
/// To plan and execute queries, DataFusion needs a "Catalog" that provides
/// metadata such as which schemas and tables exist, their columns and data
/// types, and how to access the data.
///
/// The Catalog API consists of:
/// * [`CatalogProviderList`]: a collection of `CatalogProvider`s
/// * [`CatalogProvider`]: a collection of `SchemaProvider`s (sometimes called a "database" in other systems)
/// * [`SchemaProvider`]: a collection of `TableProvider`s (often called a "schema" in other systems)
/// * [`TableProvider`]: individual tables
///
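/// For example, looking up a table walks this hierarchy from the top down. A
/// minimal sketch (the `resolve_schema` helper below is illustrative, not a
/// DataFusion API):
///
/// ```
/// use std::sync::Arc;
/// use datafusion_catalog::{CatalogProvider, CatalogProviderList, SchemaProvider};
///
/// // Walk list -> catalog -> schema; looking up a table from the returned
/// // schema is the final step, via the `async` `SchemaProvider::table` method.
/// fn resolve_schema(
///     catalogs: &dyn CatalogProviderList,
///     catalog_name: &str,
///     schema_name: &str,
/// ) -> Option<Arc<dyn SchemaProvider>> {
///     let catalog: Arc<dyn CatalogProvider> = catalogs.catalog(catalog_name)?;
///     catalog.schema(schema_name)
/// }
/// ```
///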
/// # Implementing Catalogs
///
/// To implement a catalog, you implement at least one of the [`CatalogProviderList`],
/// [`CatalogProvider`] and [`SchemaProvider`] traits and register them
/// appropriately in the `SessionContext`.
///
/// DataFusion comes with a simple in-memory catalog implementation,
/// `MemoryCatalogProvider`, that is used by default and has no persistence.
/// DataFusion does not include more complex Catalog implementations because
/// catalog management is a key design choice for most data systems, and thus
/// it is unlikely that any general-purpose catalog implementation will work
/// well across many use cases.
///
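/// For example, a minimal read-only catalog that exposes a single, fixed schema
/// could look like the following sketch (`SingleSchemaCatalog` is a hypothetical
/// name, not a DataFusion type):
///
/// ```
/// use std::any::Any;
/// use std::sync::Arc;
/// use datafusion_catalog::{CatalogProvider, SchemaProvider};
///
/// #[derive(Debug)]
/// struct SingleSchemaCatalog {
///     schema_name: String,
///     schema: Arc<dyn SchemaProvider>,
/// }
///
/// // `register_schema` / `deregister_schema` keep their default
/// // "not implemented" behavior, so this catalog is read-only
/// impl CatalogProvider for SingleSchemaCatalog {
///     fn as_any(&self) -> &dyn Any {
///         self
///     }
///
///     fn schema_names(&self) -> Vec<String> {
///         vec![self.schema_name.clone()]
///     }
///
///     fn schema(&self, name: &str) -> Option<Arc<dyn SchemaProvider>> {
///         if name == self.schema_name {
///             Some(Arc::clone(&self.schema))
///         } else {
///             None
///         }
///     }
/// }
/// ```
///
/// Once implemented, a custom catalog is typically registered for use in
/// queries via `SessionContext::register_catalog`.
///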
/// # Implementing "Remote" catalogs
///
/// See [`remote_catalog`] for an end-to-end example of how to implement a
/// remote catalog.
///
/// Sometimes catalog information is stored remotely and requires a network call
/// to retrieve. For example, the [Delta Lake] table format stores table
/// metadata in files on S3 that must first be downloaded to discover what
/// schemas and tables exist.
///
/// [Delta Lake]: https://delta.io/
/// [`remote_catalog`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/remote_catalog.rs
///
/// The [`CatalogProvider`] can support this use case, but it takes some care.
/// The planning APIs in DataFusion are not `async` and thus network IO cannot
/// be performed "lazily" / "on demand" during query planning. The rationale for
/// this design is that using remote procedure calls for all catalog accesses
/// required for query planning would likely result in multiple network calls
/// per plan, resulting in very poor planning performance.
///
/// To implement [`CatalogProvider`] and [`SchemaProvider`] for remote catalogs,
/// you need to provide an in-memory snapshot of the required metadata. Most
/// systems either already have this information cached locally or can
/// batch access to the remote catalog to retrieve multiple schemas and tables
/// in a single network call.
///
/// Note that [`SchemaProvider::table`] **is** an `async` function in order to
/// simplify implementing simple [`SchemaProvider`]s. For many table formats it
/// is easy to list all available tables, but reading table details (e.g.
/// statistics) requires additional, non-trivial access.
///
/// The pattern that DataFusion itself uses to plan SQL queries is to walk over
/// the query to find all table references, perform the required remote catalog
/// lookups in parallel, store the results in a cached snapshot, and then plan
/// the query using that snapshot.
///
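/// A sketch of this snapshot approach (`SnapshotCatalog` is an illustrative
/// name, not a DataFusion type): the schemas are fetched from the remote
/// catalog and materialized *before* planning, so the trait methods below can
/// answer from memory:
///
/// ```
/// use std::any::Any;
/// use std::collections::HashMap;
/// use std::sync::Arc;
/// use datafusion_catalog::{CatalogProvider, SchemaProvider};
///
/// #[derive(Debug)]
/// struct SnapshotCatalog {
///     // Populated ahead of planning, e.g. by a single batched call to the
///     // remote catalog that returns all schemas (and tables) the query needs
///     schemas: HashMap<String, Arc<dyn SchemaProvider>>,
/// }
///
/// impl CatalogProvider for SnapshotCatalog {
///     fn as_any(&self) -> &dyn Any {
///         self
///     }
///
///     fn schema_names(&self) -> Vec<String> {
///         self.schemas.keys().cloned().collect()
///     }
///
///     fn schema(&self, name: &str) -> Option<Arc<dyn SchemaProvider>> {
///         // No network IO during planning: only the in-memory snapshot is consulted
///         self.schemas.get(name).cloned()
///     }
/// }
/// ```
///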
/// # Example Catalog Implementations
///
/// Here are some examples of how to implement custom catalogs:
///
/// * [`datafusion-cli`]: the [`DynamicFileCatalogProvider`], a catalog provider
///   that treats files and directories on a filesystem as tables.
///
/// * [`catalog.rs`]: a simple directory-based catalog.
///
/// * [delta-rs]: a [`UnityCatalogProvider`] implementation that can
///   read from Delta Lake tables.
///
/// [`datafusion-cli`]: https://datafusion.apache.org/user-guide/cli/index.html
/// [`DynamicFileCatalogProvider`]: https://github.com/apache/datafusion/blob/31b9b48b08592b7d293f46e75707aad7dadd7cbc/datafusion-cli/src/catalog.rs#L75
/// [`catalog.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/catalog.rs
/// [delta-rs]: https://github.com/delta-io/delta-rs
/// [`UnityCatalogProvider`]: https://github.com/delta-io/delta-rs/blob/951436ecec476ce65b5ed3b58b50fb0846ca7b91/crates/deltalake-core/src/data_catalog/unity/datafusion.rs#L111-L123
///
/// [`TableProvider`]: crate::TableProvider
pub trait CatalogProvider: Debug + Sync + Send {
    /// Returns the catalog provider as [`Any`]
    /// so that it can be downcast to a specific implementation.
    fn as_any(&self) -> &dyn Any;

    /// Retrieves the list of available schema names in this catalog.
    fn schema_names(&self) -> Vec<String>;

    /// Retrieves a specific schema from the catalog by name, provided it exists.
    fn schema(&self, name: &str) -> Option<Arc<dyn SchemaProvider>>;

    /// Adds a new schema to this catalog.
    ///
    /// If a schema of the same name existed before, it is replaced in
    /// the catalog and returned.
    ///
    /// By default returns a "Not Implemented" error
    fn register_schema(
        &self,
        name: &str,
        schema: Arc<dyn SchemaProvider>,
    ) -> Result<Option<Arc<dyn SchemaProvider>>> {
        // use variables to avoid unused variable warnings
        let _ = name;
        let _ = schema;
        not_impl_err!("Registering new schemas is not supported")
    }

    /// Removes a schema from this catalog. Implementations of this method should return
    /// errors if the schema exists but cannot be dropped. For example, in DataFusion's
    /// default in-memory catalog, `MemoryCatalogProvider`, a non-empty schema
    /// will only be successfully dropped when `cascade` is true.
    /// This is equivalent to how DROP SCHEMA works in PostgreSQL.
    ///
    /// Implementations of this method should return `None` if a schema with `name`
    /// does not exist.
    ///
    /// By default returns a "Not Implemented" error
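    ///
    /// For example, a catalog that stores its schemas in a `RwLock`-guarded map
    /// might implement `cascade` handling like the following sketch
    /// (`MutableCatalog` is a hypothetical type, not part of DataFusion):
    ///
    /// ```
    /// # use std::any::Any;
    /// # use std::collections::HashMap;
    /// # use std::sync::{Arc, RwLock};
    /// # use datafusion_catalog::{CatalogProvider, SchemaProvider};
    /// use datafusion_common::{exec_err, Result};
    ///
    /// #[derive(Debug)]
    /// struct MutableCatalog {
    ///     schemas: RwLock<HashMap<String, Arc<dyn SchemaProvider>>>,
    /// }
    ///
    /// impl CatalogProvider for MutableCatalog {
    /// # fn as_any(&self) -> &dyn Any { self }
    /// # fn schema_names(&self) -> Vec<String> {
    /// #     self.schemas.read().unwrap().keys().cloned().collect()
    /// # }
    /// # fn schema(&self, name: &str) -> Option<Arc<dyn SchemaProvider>> {
    /// #     self.schemas.read().unwrap().get(name).cloned()
    /// # }
    ///     // ... other trait methods elided ...
    ///     fn deregister_schema(
    ///         &self,
    ///         name: &str,
    ///         cascade: bool,
    ///     ) -> Result<Option<Arc<dyn SchemaProvider>>> {
    ///         let mut schemas = self.schemas.write().unwrap();
    ///         if let Some(schema) = schemas.get(name) {
    ///             // Refuse to drop a non-empty schema unless `cascade` is true,
    ///             // mirroring PostgreSQL's `DROP SCHEMA` semantics
    ///             if !cascade && !schema.table_names().is_empty() {
    ///                 return exec_err!("schema {name} is not empty");
    ///             }
    ///         }
    ///         // Return the removed schema, or `None` if it did not exist
    ///         Ok(schemas.remove(name))
    ///     }
    /// }
    /// ```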
    fn deregister_schema(
        &self,
        _name: &str,
        _cascade: bool,
    ) -> Result<Option<Arc<dyn SchemaProvider>>> {
        not_impl_err!("Deregistering schemas is not supported")
    }
}

/// Represents a list of named [`CatalogProvider`]s.
///
/// Please see the documentation on [`CatalogProvider`] for details of
/// implementing a custom catalog.
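///
/// # Example
///
/// A minimal sketch of an implementation backed by a `RwLock`-guarded
/// `HashMap`. Because `register_catalog` takes `&self`, interior mutability is
/// needed to add catalogs after construction (`SimpleCatalogList` is a
/// hypothetical name, not a DataFusion type):
///
/// ```
/// use std::any::Any;
/// use std::collections::HashMap;
/// use std::sync::{Arc, RwLock};
/// use datafusion_catalog::{CatalogProvider, CatalogProviderList};
///
/// #[derive(Debug, Default)]
/// struct SimpleCatalogList {
///     catalogs: RwLock<HashMap<String, Arc<dyn CatalogProvider>>>,
/// }
///
/// impl CatalogProviderList for SimpleCatalogList {
///     fn as_any(&self) -> &dyn Any {
///         self
///     }
///
///     fn register_catalog(
///         &self,
///         name: String,
///         catalog: Arc<dyn CatalogProvider>,
///     ) -> Option<Arc<dyn CatalogProvider>> {
///         // Returns the previously registered catalog with this name, if any
///         self.catalogs.write().unwrap().insert(name, catalog)
///     }
///
///     fn catalog_names(&self) -> Vec<String> {
///         self.catalogs.read().unwrap().keys().cloned().collect()
///     }
///
///     fn catalog(&self, name: &str) -> Option<Arc<dyn CatalogProvider>> {
///         self.catalogs.read().unwrap().get(name).cloned()
///     }
/// }
/// ```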
pub trait CatalogProviderList: Debug + Sync + Send {
    /// Returns the catalog list as [`Any`]
    /// so that it can be downcast to a specific implementation.
    fn as_any(&self) -> &dyn Any;

    /// Adds a new catalog to this catalog list.
    ///
    /// If a catalog of the same name existed before, it is replaced in the list and returned.
    fn register_catalog(
        &self,
        name: String,
        catalog: Arc<dyn CatalogProvider>,
    ) -> Option<Arc<dyn CatalogProvider>>;

    /// Retrieves the list of available catalog names.
    fn catalog_names(&self) -> Vec<String>;

    /// Retrieves a specific catalog by name, provided it exists.
    fn catalog(&self, name: &str) -> Option<Arc<dyn CatalogProvider>>;
}