// datafusion_catalog/catalog.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::any::Any;
use std::fmt::Debug;
use std::sync::Arc;

pub use crate::schema::SchemaProvider;
use datafusion_common::not_impl_err;
use datafusion_common::Result;
26/// Represents a catalog, comprising a number of named schemas.
27///
28/// # Catalog Overview
29///
30/// To plan and execute queries, DataFusion needs a "Catalog" that provides
31/// metadata such as which schemas and tables exist, their columns and data
32/// types, and how to access the data.
33///
34/// The Catalog API consists:
35/// * [`CatalogProviderList`]: a collection of `CatalogProvider`s
36/// * [`CatalogProvider`]: a collection of `SchemaProvider`s (sometimes called a "database" in other systems)
37/// * [`SchemaProvider`]: a collection of `TableProvider`s (often called a "schema" in other systems)
38/// * [`TableProvider`]: individual tables
39///
40/// # Implementing Catalogs
41///
42/// To implement a catalog, you implement at least one of the [`CatalogProviderList`],
43/// [`CatalogProvider`] and [`SchemaProvider`] traits and register them
44/// appropriately in the `SessionContext`.
45///
46/// DataFusion comes with a simple in-memory catalog implementation,
47/// `MemoryCatalogProvider`, that is used by default and has no persistence.
48/// DataFusion does not include more complex Catalog implementations because
49/// catalog management is a key design choice for most data systems, and thus
50/// it is unlikely that any general-purpose catalog implementation will work
51/// well across many use cases.
52///
53/// # Implementing "Remote" catalogs
54///
55/// See [`remote_catalog`] for an end to end example of how to implement a
56/// remote catalog.
57///
58/// Sometimes catalog information is stored remotely and requires a network call
59/// to retrieve. For example, the [Delta Lake] table format stores table
60/// metadata in files on S3 that must be first downloaded to discover what
61/// schemas and tables exist.
62///
63/// [Delta Lake]: https://delta.io/
64/// [`remote_catalog`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/remote_catalog.rs
65///
66/// The [`CatalogProvider`] can support this use case, but it takes some care.
67/// The planning APIs in DataFusion are not `async` and thus network IO can not
68/// be performed "lazily" / "on demand" during query planning. The rationale for
69/// this design is that using remote procedure calls for all catalog accesses
70/// required for query planning would likely result in multiple network calls
71/// per plan, resulting in very poor planning performance.
72///
73/// To implement [`CatalogProvider`] and [`SchemaProvider`] for remote catalogs,
74/// you need to provide an in memory snapshot of the required metadata. Most
75/// systems typically either already have this information cached locally or can
76/// batch access to the remote catalog to retrieve multiple schemas and tables
77/// in a single network call.
78///
79/// Note that [`SchemaProvider::table`] **is** an `async` function in order to
80/// simplify implementing simple [`SchemaProvider`]s. For many table formats it
81/// is easy to list all available tables but there is additional non trivial
82/// access required to read table details (e.g. statistics).
83///
84/// The pattern that DataFusion itself uses to plan SQL queries is to walk over
85/// the query to find all table references, performing required remote catalog
86/// lookups in parallel, storing the results in a cached snapshot, and then plans
87/// the query using that snapshot.
88///
89/// # Example Catalog Implementations
90///
91/// Here are some examples of how to implement custom catalogs:
92///
93/// * [`datafusion-cli`]: [`DynamicFileCatalogProvider`] catalog provider
94/// that treats files and directories on a filesystem as tables.
95///
96/// * The [`catalog.rs`]: a simple directory based catalog.
97///
98/// * [delta-rs]: [`UnityCatalogProvider`] implementation that can
99/// read from Delta Lake tables
100///
101/// [`datafusion-cli`]: https://datafusion.apache.org/user-guide/cli/index.html
102/// [`DynamicFileCatalogProvider`]: https://github.com/apache/datafusion/blob/31b9b48b08592b7d293f46e75707aad7dadd7cbc/datafusion-cli/src/catalog.rs#L75
103/// [`catalog.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/catalog.rs
104/// [delta-rs]: https://github.com/delta-io/delta-rs
105/// [`UnityCatalogProvider`]: https://github.com/delta-io/delta-rs/blob/951436ecec476ce65b5ed3b58b50fb0846ca7b91/crates/deltalake-core/src/data_catalog/unity/datafusion.rs#L111-L123
106///
107/// [`TableProvider`]: crate::TableProvider
108pub trait CatalogProvider: Debug + Sync + Send {
109 /// Returns the catalog provider as [`Any`]
110 /// so that it can be downcast to a specific implementation.
111 fn as_any(&self) -> &dyn Any;
112
113 /// Retrieves the list of available schema names in this catalog.
114 fn schema_names(&self) -> Vec<String>;
115
116 /// Retrieves a specific schema from the catalog by name, provided it exists.
117 fn schema(&self, name: &str) -> Option<Arc<dyn SchemaProvider>>;
118
119 /// Adds a new schema to this catalog.
120 ///
121 /// If a schema of the same name existed before, it is replaced in
122 /// the catalog and returned.
123 ///
124 /// By default returns a "Not Implemented" error
125 fn register_schema(
126 &self,
127 name: &str,
128 schema: Arc<dyn SchemaProvider>,
129 ) -> Result<Option<Arc<dyn SchemaProvider>>> {
130 // use variables to avoid unused variable warnings
131 let _ = name;
132 let _ = schema;
133 not_impl_err!("Registering new schemas is not supported")
134 }
135
136 /// Removes a schema from this catalog. Implementations of this method should return
137 /// errors if the schema exists but cannot be dropped. For example, in DataFusion's
138 /// default in-memory catalog, `MemoryCatalogProvider`, a non-empty schema
139 /// will only be successfully dropped when `cascade` is true.
140 /// This is equivalent to how DROP SCHEMA works in PostgreSQL.
141 ///
142 /// Implementations of this method should return None if schema with `name`
143 /// does not exist.
144 ///
145 /// By default returns a "Not Implemented" error
146 fn deregister_schema(
147 &self,
148 _name: &str,
149 _cascade: bool,
150 ) -> Result<Option<Arc<dyn SchemaProvider>>> {
151 not_impl_err!("Deregistering new schemas is not supported")
152 }
153}
155/// Represent a list of named [`CatalogProvider`]s.
156///
157/// Please see the documentation on [`CatalogProvider`] for details of
158/// implementing a custom catalog.
159pub trait CatalogProviderList: Debug + Sync + Send {
160 /// Returns the catalog list as [`Any`]
161 /// so that it can be downcast to a specific implementation.
162 fn as_any(&self) -> &dyn Any;
163
164 /// Adds a new catalog to this catalog list
165 /// If a catalog of the same name existed before, it is replaced in the list and returned.
166 fn register_catalog(
167 &self,
168 name: String,
169 catalog: Arc<dyn CatalogProvider>,
170 ) -> Option<Arc<dyn CatalogProvider>>;
171
172 /// Retrieves the list of available catalog names
173 fn catalog_names(&self) -> Vec<String>;
174
175 /// Retrieves a specific catalog by name, provided it exists.
176 fn catalog(&self, name: &str) -> Option<Arc<dyn CatalogProvider>>;
177}