From e86c3e3d9cb97d0cbe9a9a50efc06e0dae15a700 Mon Sep 17 00:00:00 2001 From: nuno-faria Date: Mon, 5 May 2025 22:29:31 +0100 Subject: [PATCH 1/5] feat: Support Parquet writer options --- python/datafusion/__init__.py | 6 +- python/datafusion/dataframe.py | 247 +++++++++++++++------- python/tests/test_dataframe.py | 373 ++++++++++++++++++++++++++++++--- src/dataframe.rs | 154 ++++++++++---- src/lib.rs | 2 + 5 files changed, 623 insertions(+), 159 deletions(-) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 15ceefbdb..273abbadb 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -31,7 +31,7 @@ from . import functions, object_store, substrait, unparser # The following imports are okay to remain as opaque to the user. -from ._internal import Config +from ._internal import Config, ParquetWriterOptions from .catalog import Catalog, Database, Table from .common import ( DFSchema, @@ -42,7 +42,7 @@ SessionContext, SQLOptions, ) -from .dataframe import DataFrame +from .dataframe import DataFrame, ParquetColumnOptions from .expr import ( Expr, WindowFrame, @@ -66,6 +66,8 @@ "ExecutionPlan", "Expr", "LogicalPlan", + "ParquetColumnOptions", + "ParquetWriterOptions", "RecordBatch", "RecordBatchStream", "RuntimeEnvBuilder", diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 26fe8f453..96f939e70 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -28,7 +28,6 @@ Iterable, Literal, Optional, - Union, overload, ) @@ -51,67 +50,58 @@ from datafusion._internal import DataFrame as DataFrameInternal from datafusion._internal import expr as expr_internal -from enum import Enum - +from datafusion._internal import ParquetColumnOptions as ParquetColumnOptionsInternal +from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal from datafusion.expr import Expr, SortExpr, sort_or_default -# excerpt from deltalake -# https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163 -class Compression(Enum): - """Enum representing the available compression types for Parquet files.""" - - UNCOMPRESSED = "uncompressed" - SNAPPY = "snappy" - GZIP = "gzip" - BROTLI = "brotli" - LZ4 = "lz4" - # lzo is not implemented yet - # https://github.com/apache/arrow-rs/issues/6970 - # LZO = "lzo" - ZSTD = "zstd" - LZ4_RAW = "lz4_raw" - - @classmethod - def from_str(cls: type[Compression], value: str) -> Compression: - """Convert a string to a Compression enum value. - - Args: - value: The string representation of the compression type. - - Returns: - The Compression enum lowercase value. - - Raises: - ValueError: If the string does not match any Compression enum value. - """ - try: - return cls(value.lower()) - except ValueError as err: - valid_values = str([item.value for item in Compression]) - error_msg = f""" - {value} is not a valid Compression. - Valid values are: {valid_values} - """ - raise ValueError(error_msg) from err - - def get_default_level(self) -> Optional[int]: - """Get the default compression level for the compression type. +class ParquetColumnOptions: + """Parquet options for individual columns. + + Contains the available options that can be applied for an individual Parquet column, + replacing the provided options in the `write_parquet`. + + Attributes: + encoding: Sets encoding for the column path. 
Valid values are: `plain`, + `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`, + `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and + `byte_stream_split`. These values are not case-sensitive. If `None`, uses + the default parquet options + dictionary_enabled: Sets if dictionary encoding is enabled for the column path. + If `None`, uses the default parquet options + compression: Sets default parquet compression codec for the column path. Valid + values are `uncompressed`, `snappy`, `gzip(level)`, `lzo`, `brotli(level)`, + `lz4`, `zstd(level)`, and `lz4_raw`. These values are not case-sensitive. If + `None`, uses the default parquet options. + statistics_enabled: Sets if statistics are enabled for the column Valid values + are: `none`, `chunk`, and `page` These values are not case sensitive. If + `None`, uses the default parquet options. + bloom_filter_enabled: Sets if bloom filter is enabled for the column path. If + `None`, uses the default parquet options. + bloom_filter_fpp: Sets bloom filter false positive probability for the column + path. If `None`, uses the default parquet options. + bloom_filter_ndv: Sets bloom filter number of distinct values. If `None`, uses + the default parquet options. + """ - Returns: - The default compression level for the compression type. - """ - # GZIP, BROTLI default values from deltalake repo - # https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163 - # ZSTD default value from delta-rs - # https://github.com/apache/datafusion-python/pull/981#discussion_r1904789223 - if self == Compression.GZIP: - return 6 - if self == Compression.BROTLI: - return 1 - if self == Compression.ZSTD: - return 4 - return None + def __init__( + self, + encoding: Optional[str] = None, + dictionary_enabled: Optional[bool] = None, + compression: Optional[str] = None, + statistics_enabled: Optional[str] = None, + bloom_filter_enabled: Optional[bool] = None, + bloom_filter_fpp: Optional[float] = None, + bloom_filter_ndv: Optional[int] = None, + ) -> None: + """Initialize the ParquetColumnOptions.""" + self.encoding = encoding + self.dictionary_enabled = dictionary_enabled + self.compression = compression + self.statistics_enabled = statistics_enabled + self.bloom_filter_enabled = bloom_filter_enabled + self.bloom_filter_fpp = bloom_filter_fpp + self.bloom_filter_ndv = bloom_filter_ndv class DataFrame: @@ -704,38 +694,135 @@ def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None def write_parquet( self, path: str | pathlib.Path, - compression: Union[str, Compression] = Compression.ZSTD, - compression_level: int | None = None, + data_pagesize_limit: int = 1024 * 1024, + write_batch_size: int = 1024, + writer_version: str = "1.0", + skip_arrow_metadata: bool = False, + compression: Optional[str] = "zstd(3)", + dictionary_enabled: Optional[bool] = True, + dictionary_page_size_limit: int = 1024 * 1024, + statistics_enabled: Optional[str] = "page", + max_row_group_size: int = 1024 * 1024, + created_by: str = "datafusion-python", + column_index_truncate_length: Optional[int] = 64, + statistics_truncate_length: Optional[int] = None, + data_page_row_count_limit: int = 20_000, + encoding: Optional[str] = None, + bloom_filter_on_write: bool = False, + bloom_filter_fpp: Optional[float] = None, + bloom_filter_ndv: Optional[int] = None, + allow_single_file_parallelism: bool = True, + maximum_parallel_row_group_writers: int = 1, + maximum_buffered_record_batches_per_stream: int = 2, + column_specific_options: 
Optional[dict[str, ParquetColumnOptions]] = None, ) -> None: """Execute the :py:class:`DataFrame` and write the results to a Parquet file. Args: path: Path of the Parquet file to write. - compression: Compression type to use. Default is "ZSTD". - Available compression types are: + data_pagesize_limit: Sets best effort maximum size of data page in bytes. + write_batch_size: Sets write_batch_size in bytes. + writer_version: Sets parquet writer version. Valid values are `1.0` and + `2.0`. + skip_arrow_metadata: Skip encoding the embedded arrow metadata in the + KV_meta. + compression: Compression type to use. Default is "zstd(3)". + Available compression types are - "uncompressed": No compression. - "snappy": Snappy compression. - - "gzip": Gzip compression. - - "brotli": Brotli compression. + - "gzip(n)": Gzip compression with level n. + - "brotli(n)": Brotli compression with level n. - "lz4": LZ4 compression. - "lz4_raw": LZ4_RAW compression. - - "zstd": Zstandard compression. - Note: LZO is not yet implemented in arrow-rs and is therefore excluded. - compression_level: Compression level to use. For ZSTD, the - recommended range is 1 to 22, with the default being 4. Higher levels - provide better compression but slower speed. - """ - # Convert string to Compression enum if necessary - if isinstance(compression, str): - compression = Compression.from_str(compression) - - if ( - compression in {Compression.GZIP, Compression.BROTLI, Compression.ZSTD} - and compression_level is None - ): - compression_level = compression.get_default_level() + - "zstd(n)": Zstandard compression with level n. + dictionary_enabled: Sets if dictionary encoding is enabled. If None, uses + the default parquet writer setting. + dictionary_page_size_limit: Sets best effort maximum dictionary page size, + in bytes. + statistics_enabled: Sets if statistics are enabled for any column Valid + values are `none`, `chunk`, and `page`. If None, uses the default + parquet writer setting. + max_row_group_size: Target maximum number of rows in each row group + (defaults to 1M rows). Writing larger row groups requires more memory to + write, but can get better compression and be faster to read. + created_by: Sets "created by" property. + column_index_truncate_length: Sets column index truncate length. + statistics_truncate_length: Sets statistics truncate length. If None, uses + the default parquet writer setting. + data_page_row_count_limit: Sets best effort maximum number of rows in a data + page. + encoding: Sets default encoding for any column. Valid values are `plain`, + `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`, + `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and + `byte_stream_split`. If None, uses the default parquet writer setting. + bloom_filter_on_write: Write bloom filters for all columns when creating + parquet files. + bloom_filter_fpp: Sets bloom filter false positive probability. If None, + uses the default parquet writer setting + bloom_filter_ndv: Sets bloom filter number of distinct values. If None, uses + the default parquet writer setting. + allow_single_file_parallelism: Controls whether DataFusion will attempt to + speed up writing parquet files by serializing them in parallel. Each + column in each row group in each output file are serialized in parallel + leveraging a maximum possible core count of n_files * n_row_groups * + n_columns. 
+ maximum_parallel_row_group_writers: By default parallel parquet writer is + tuned for minimum memory usage in a streaming execution plan. You may + see a performance benefit when writing large parquet files by increasing + `maximum_parallel_row_group_writers` and + `maximum_buffered_record_batches_per_stream` if your system has idle + cores and can tolerate additional memory usage. Boosting these values is + likely worthwhile when writing out already in-memory data, such as from + a cached data frame. + maximum_buffered_record_batches_per_stream: See + `maximum_parallel_row_group_writers`. + column_specific_options: Overrides options for specific columns. If a column + is not a part of this dictionary, it will use the parameters provided in + the `write_parquet`. + """ + options_internal = ParquetWriterOptionsInternal( + data_pagesize_limit, + write_batch_size, + writer_version, + skip_arrow_metadata, + compression, + dictionary_enabled, + dictionary_page_size_limit, + statistics_enabled, + max_row_group_size, + created_by, + column_index_truncate_length, + statistics_truncate_length, + data_page_row_count_limit, + encoding, + bloom_filter_on_write, + bloom_filter_fpp, + bloom_filter_ndv, + allow_single_file_parallelism, + maximum_parallel_row_group_writers, + maximum_buffered_record_batches_per_stream, + ) + + if column_specific_options is None: + column_specific_options = {} + + column_specific_options_internal = {} + for column, opts in column_specific_options.items(): + column_specific_options_internal[column] = ParquetColumnOptionsInternal( + bloom_filter_enabled=opts.bloom_filter_enabled, + encoding=opts.encoding, + dictionary_enabled=opts.dictionary_enabled, + compression=opts.compression, + statistics_enabled=opts.statistics_enabled, + bloom_filter_fpp=opts.bloom_filter_fpp, + bloom_filter_ndv=opts.bloom_filter_ndv, + ) - self.df.write_parquet(str(path), compression.value, compression_level) + self.df.write_parquet( + str(path), + options_internal, + column_specific_options_internal, + ) def write_json(self, path: str | pathlib.Path) -> None: """Execute the :py:class:`DataFrame` and write the results to a JSON file. 
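Note: the reworked `write_parquet` surface above is easiest to see with a short usage sketch. This is illustrative only — the `from_pydict` data and the option values are arbitrary, not recommendations from this patch:

```python
from datafusion import ParquetColumnOptions, SessionContext

ctx = SessionContext()
df = ctx.from_pydict({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# Global writer options, with a per-column override for "name".
df.write_parquet(
    "out/",
    compression="zstd(5)",          # codec and level in a single string
    max_row_group_size=100_000,
    column_specific_options={
        # "name" drops dictionary encoding and uses snappy instead of zstd
        "name": ParquetColumnOptions(dictionary_enabled=False, compression="snappy"),
    },
)
```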
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index e01308c86..e1e29c45c 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -23,6 +23,7 @@ import pytest from datafusion import ( DataFrame, + ParquetColumnOptions, SessionContext, WindowFrame, column, @@ -62,6 +63,21 @@ def df(): return ctx.from_arrow(batch) +@pytest.fixture +def large_df(): + ctx = SessionContext() + + rows = 100000 + data = { + "a": list(range(rows)), + "b": [f"s-{i}" for i in range(rows)], + "c": [float(i + 0.1) for i in range(rows)], + } + batch = pa.record_batch(data) + + return ctx.from_arrow(batch) + + @pytest.fixture def struct_df(): ctx = SessionContext() @@ -1533,16 +1549,26 @@ def test_write_parquet(df, tmp_path, path_to_str): assert result == expected +def test_write_parquet_default_compression(df, tmp_path): + """Test that the default compression is ZSTD.""" + df.write_parquet(tmp_path) + + for file in tmp_path.rglob("*.parquet"): + metadata = pq.ParquetFile(file).metadata.to_dict() + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + assert col["compression"].lower() == "zstd" + + @pytest.mark.parametrize( - ("compression", "compression_level"), - [("gzip", 6), ("brotli", 7), ("zstd", 15)], + "compression", + ["gzip(6)", "brotli(7)", "zstd(15)", "snappy", "uncompressed"], ) -def test_write_compressed_parquet(df, tmp_path, compression, compression_level): - path = tmp_path +def test_write_compressed_parquet(df, tmp_path, compression): + import re - df.write_parquet( - str(path), compression=compression, compression_level=compression_level - ) + path = tmp_path + df.write_parquet(str(path), compression=compression) # test that the actual compression scheme is the one written for _root, _dirs, files in os.walk(path): @@ -1550,8 +1576,10 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level): if file.endswith(".parquet"): metadata = pq.ParquetFile(tmp_path / file).metadata.to_dict() for row_group in metadata["row_groups"]: - for columns in row_group["columns"]: - assert columns["compression"].lower() == compression + for col in row_group["columns"]: + assert col["compression"].lower() == re.sub( + r"\(\d+\)", "", compression + ) result = pq.read_table(str(path)).to_pydict() expected = df.to_pydict() @@ -1560,40 +1588,323 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level): @pytest.mark.parametrize( - ("compression", "compression_level"), - [("gzip", 12), ("brotli", 15), ("zstd", 23), ("wrong", 12)], + "compression", + ["gzip(12)", "brotli(15)", "zstd(23)"], ) -def test_write_compressed_parquet_wrong_compression_level( - df, tmp_path, compression, compression_level -): +def test_write_compressed_parquet_wrong_compression_level(df, tmp_path, compression): path = tmp_path - with pytest.raises(ValueError): - df.write_parquet( - str(path), - compression=compression, - compression_level=compression_level, - ) + with pytest.raises(Exception, match=r"valid compression range .*? 
exceeded."): + df.write_parquet(str(path), compression=compression) -@pytest.mark.parametrize("compression", ["wrong"]) +@pytest.mark.parametrize("compression", ["wrong", "wrong(12)"]) def test_write_compressed_parquet_invalid_compression(df, tmp_path, compression): path = tmp_path - with pytest.raises(ValueError): + with pytest.raises(Exception, match="Unknown or unsupported parquet compression"): df.write_parquet(str(path), compression=compression) -# not testing lzo because it it not implemented yet -# https://github.com/apache/arrow-rs/issues/6970 -@pytest.mark.parametrize("compression", ["zstd", "brotli", "gzip"]) -def test_write_compressed_parquet_default_compression_level(df, tmp_path, compression): - # Test write_parquet with zstd, brotli, gzip default compression level, - # ie don't specify compression level - # should complete without error - path = tmp_path +@pytest.mark.parametrize( + ("writer_version", "format_version"), + [("1.0", "1.0"), ("2.0", "2.6"), (None, "1.0")], +) +def test_write_parquet_writer_version(df, tmp_path, writer_version, format_version): + """Test the Parquet writer version. Note that writer_version=2.0 results in + format_version=2.6""" + if writer_version is None: + df.write_parquet(tmp_path) + else: + df.write_parquet(tmp_path, writer_version=writer_version) - df.write_parquet(str(path), compression=compression) + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + assert metadata["format_version"] == format_version + + +@pytest.mark.parametrize("writer_version", ["1.2.3", "custom-version", "0"]) +def test_write_parquet_wrong_writer_version(df, tmp_path, writer_version): + """Test that invalid writer versions in Parquet throw an exception.""" + with pytest.raises( + Exception, match="Unknown or unsupported parquet writer version" + ): + df.write_parquet(tmp_path, writer_version=writer_version) + + +@pytest.mark.parametrize("dictionary_enabled", [True, False, None]) +def test_write_parquet_dictionary_enabled(df, tmp_path, dictionary_enabled): + """Test enabling/disabling the dictionaries in Parquet.""" + df.write_parquet(tmp_path, dictionary_enabled=dictionary_enabled) + # by default, the dictionary is enabled, so None results in True + result = dictionary_enabled if dictionary_enabled is not None else True + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + assert col["has_dictionary_page"] == result + + +@pytest.mark.parametrize( + ("statistics_enabled", "has_statistics"), + [("page", True), ("chunk", True), ("none", False), (None, True)], +) +def test_write_parquet_statistics_enabled( + df, tmp_path, statistics_enabled, has_statistics +): + """Test configuring the statistics in Parquet. 
In pyarrow we can only check for + column-level statistics, so "page" and "chunk" are tested in the same way.""" + df.write_parquet(tmp_path, statistics_enabled=statistics_enabled) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + if has_statistics: + assert col["statistics"] is not None + else: + assert col["statistics"] is None + + +@pytest.mark.parametrize("max_row_group_size", [1000, 5000, 10000, 100000]) +def test_write_parquet_max_row_group_size(large_df, tmp_path, max_row_group_size): + """Test configuring the max number of rows per group in Parquet. These test cases + guarantee that the number of rows for each row group is max_row_group_size, given + the total number of rows is a multiple of max_row_group_size.""" + large_df.write_parquet(tmp_path, max_row_group_size=max_row_group_size) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + for row_group in metadata["row_groups"]: + assert row_group["num_rows"] == max_row_group_size + + +@pytest.mark.parametrize("created_by", ["datafusion", "datafusion-python", "custom"]) +def test_write_parquet_created_by(df, tmp_path, created_by): + """Test configuring the created by metadata in Parquet.""" + df.write_parquet(tmp_path, created_by=created_by) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + assert metadata["created_by"] == created_by + + +@pytest.mark.parametrize("statistics_truncate_length", [5, 25, 50]) +def test_write_parquet_statistics_truncate_length( + df, tmp_path, statistics_truncate_length +): + """Test configuring the truncate limit in Parquet's row-group-level statistics.""" + ctx = SessionContext() + data = { + "a": [ + "a_the_quick_brown_fox_jumps_over_the_lazy_dog", + "m_the_quick_brown_fox_jumps_over_the_lazy_dog", + "z_the_quick_brown_fox_jumps_over_the_lazy_dog", + ], + "b": ["a_smaller", "m_smaller", "z_smaller"], + } + df = ctx.from_arrow(pa.record_batch(data)) + df.write_parquet(tmp_path, statistics_truncate_length=statistics_truncate_length) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + statistics = col["statistics"] + assert len(statistics["min"]) <= statistics_truncate_length + assert len(statistics["max"]) <= statistics_truncate_length + + +def test_write_parquet_default_encoding(tmp_path): + """Test that, by default, Parquet files are written with dictionary encoding. 
+ Note that dictionary encoding is not used for boolean values, so it is not tested + here.""" + ctx = SessionContext() + data = { + "a": [1, 2, 3], + "b": ["1", "2", "3"], + "c": [1.01, 2.02, 3.03], + } + df = ctx.from_arrow(pa.record_batch(data)) + df.write_parquet(tmp_path) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + assert col["encodings"] == ("PLAIN", "RLE", "RLE_DICTIONARY") + + +@pytest.mark.parametrize( + ("encoding", "data_types", "result"), + [ + ("plain", ["int", "float", "str", "bool"], ("PLAIN", "RLE")), + ("rle", ["bool"], ("RLE",)), + ("delta_binary_packed", ["int"], ("RLE", "DELTA_BINARY_PACKED")), + ("delta_length_byte_array", ["str"], ("RLE", "DELTA_LENGTH_BYTE_ARRAY")), + ("delta_byte_array", ["str"], ("RLE", "DELTA_BYTE_ARRAY")), + ("byte_stream_split", ["int", "float"], ("RLE", "BYTE_STREAM_SPLIT")), + ], +) +def test_write_parquet_encoding(tmp_path, encoding, data_types, result): + """Test different encodings in Parquet in their respective support column types.""" + ctx = SessionContext() + + data = {} + for data_type in data_types: + match data_type: + case "int": + data["int"] = [1, 2, 3] + case "float": + data["float"] = [1.01, 2.02, 3.03] + case "str": + data["str"] = ["a", "b", "c"] + case "bool": + data["bool"] = [True, False, True] + + df = ctx.from_arrow(pa.record_batch(data)) + df.write_parquet(tmp_path, encoding=encoding, dictionary_enabled=False) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + assert col["encodings"] == result + + +@pytest.mark.parametrize("encoding", ["bit_packed"]) +def test_write_parquet_unsupported_encoding(df, tmp_path, encoding): + """Test that unsupported Parquet encodings do not work.""" + # BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519 + with pytest.raises(BaseException, match="Encoding .*? is not supported"): + df.write_parquet(tmp_path, encoding=encoding) + + +@pytest.mark.parametrize("encoding", ["non_existent", "unknown", "plain123"]) +def test_write_parquet_invalid_encoding(df, tmp_path, encoding): + """Test that invalid Parquet encodings do not work.""" + with pytest.raises(Exception, match="Unknown or unsupported parquet encoding"): + df.write_parquet(tmp_path, encoding=encoding) + + +@pytest.mark.parametrize("encoding", ["plain_dictionary", "rle_dictionary"]) +def test_write_parquet_dictionary_encoding_fallback(df, tmp_path, encoding): + """Test that the dictionary encoding cannot be used as fallback in Parquet.""" + # BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519 + with pytest.raises( + BaseException, match="Dictionary encoding can not be used as fallback encoding" + ): + df.write_parquet(tmp_path, encoding=encoding) + + +def test_write_parquet_bloom_filter(df, tmp_path): + """Test Parquet files with and without (default) bloom filters. 
Since pyarrow does + not expose any information about bloom filters, the easiest way to confirm that they + are actually written is to compare the file size.""" + path_no_bloom_filter = tmp_path / "1" + path_bloom_filter = tmp_path / "2" + + df.write_parquet(path_no_bloom_filter) + df.write_parquet(path_bloom_filter, bloom_filter_on_write=True) + + size_no_bloom_filter = 0 + for file in path_no_bloom_filter.rglob("*.parquet"): + size_no_bloom_filter += os.path.getsize(file) + + size_bloom_filter = 0 + for file in path_bloom_filter.rglob("*.parquet"): + size_bloom_filter += os.path.getsize(file) + + assert size_no_bloom_filter < size_bloom_filter + + +def test_write_parquet_column_options(df, tmp_path): + """Test writing Parquet files with different options for each column, which replace + the global configs (when provided).""" + data = { + "a": [1, 2, 3], + "b": ["a", "b", "c"], + "c": [False, True, False], + "d": [1.01, 2.02, 3.03], + "e": [4, 5, 6], + } + + column_specific_options = { + "a": ParquetColumnOptions(statistics_enabled="none"), + "b": ParquetColumnOptions(encoding="plain", dictionary_enabled=False), + "c": ParquetColumnOptions( + compression="snappy", encoding="rle", dictionary_enabled=False + ), + "d": ParquetColumnOptions( + compression="zstd(6)", + encoding="byte_stream_split", + dictionary_enabled=False, + statistics_enabled="none", + ), + # column "e" will use the global configs + } + + results = { + "a": { + "statistics": False, + "compression": "brotli", + "encodings": ("PLAIN", "RLE", "RLE_DICTIONARY"), + }, + "b": { + "statistics": True, + "compression": "brotli", + "encodings": ("PLAIN", "RLE"), + }, + "c": { + "statistics": True, + "compression": "snappy", + "encodings": ("RLE",), + }, + "d": { + "statistics": False, + "compression": "zstd", + "encodings": ("RLE", "BYTE_STREAM_SPLIT"), + }, + "e": { + "statistics": True, + "compression": "brotli", + "encodings": ("PLAIN", "RLE", "RLE_DICTIONARY"), + }, + } + + ctx = SessionContext() + df = ctx.from_arrow(pa.record_batch(data)) + df.write_parquet( + tmp_path, + compression="brotli(8)", + column_specific_options=column_specific_options, + ) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + column_name = col["path_in_schema"] + result = results[column_name] + assert (col["statistics"] is not None) == result["statistics"] + assert col["compression"].lower() == result["compression"].lower() + assert col["encodings"] == result["encodings"] def test_dataframe_export(df) -> None: diff --git a/src/dataframe.rs b/src/dataframe.rs index 211e31bd1..ffb3f36cf 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use std::collections::HashMap; use std::ffi::CString; use std::sync::Arc; @@ -27,12 +28,11 @@ use datafusion::arrow::datatypes::Schema; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::arrow::util::pretty; use datafusion::common::UnnestOptions; -use datafusion::config::{CsvOptions, TableParquetOptions}; +use datafusion::config::{CsvOptions, ParquetColumnOptions, ParquetOptions, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::datasource::TableProvider; use datafusion::error::DataFusionError; use datafusion::execution::SendableRecordBatchStream; -use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; use datafusion::prelude::*; use futures::{StreamExt, TryStreamExt}; use pyo3::exceptions::PyValueError; @@ -165,10 +165,105 @@ fn build_formatter_config_from_python(formatter: &Bound<'_, PyAny>) -> PyResult< // Return the validated config, converting String error to PyErr config .validate() - .map_err(|e| pyo3::exceptions::PyValueError::new_err(e))?; + .map_err(pyo3::exceptions::PyValueError::new_err)?; Ok(config) } +/// Python mapping of `ParquetOptions` (includes just the writer-related options). +#[pyclass(name = "ParquetWriterOptions", module = "datafusion", subclass)] +#[derive(Clone, Default)] +pub struct PyParquetWriterOptions { + options: ParquetOptions, +} + +#[pymethods] +impl PyParquetWriterOptions { + #[new] + #[allow(clippy::too_many_arguments)] + pub fn new( + data_pagesize_limit: usize, + write_batch_size: usize, + writer_version: String, + skip_arrow_metadata: bool, + compression: Option, + dictionary_enabled: Option, + dictionary_page_size_limit: usize, + statistics_enabled: Option, + max_row_group_size: usize, + created_by: String, + column_index_truncate_length: Option, + statistics_truncate_length: Option, + data_page_row_count_limit: usize, + encoding: Option, + bloom_filter_on_write: bool, + bloom_filter_fpp: Option, + bloom_filter_ndv: Option, + allow_single_file_parallelism: bool, + maximum_parallel_row_group_writers: usize, + maximum_buffered_record_batches_per_stream: usize, + ) -> Self { + Self { + options: ParquetOptions { + data_pagesize_limit, + write_batch_size, + writer_version, + skip_arrow_metadata, + compression, + dictionary_enabled, + dictionary_page_size_limit, + statistics_enabled, + max_row_group_size, + created_by, + column_index_truncate_length, + statistics_truncate_length, + data_page_row_count_limit, + encoding, + bloom_filter_on_write, + bloom_filter_fpp, + bloom_filter_ndv, + allow_single_file_parallelism, + maximum_parallel_row_group_writers, + maximum_buffered_record_batches_per_stream, + ..Default::default() + }, + } + } +} + +/// Python mapping of `ParquetColumnOptions`. +#[pyclass(name = "ParquetColumnOptions", module = "datafusion", subclass)] +#[derive(Clone, Default)] +pub struct PyParquetColumnOptions { + options: ParquetColumnOptions, +} + +#[pymethods] +impl PyParquetColumnOptions { + #[new] + pub fn new( + bloom_filter_enabled: Option, + encoding: Option, + dictionary_enabled: Option, + compression: Option, + statistics_enabled: Option, + bloom_filter_fpp: Option, + bloom_filter_ndv: Option, + ) -> Self { + Self { + options: ParquetColumnOptions { + bloom_filter_enabled, + encoding, + dictionary_enabled, + compression, + statistics_enabled, + bloom_filter_fpp, + bloom_filter_ndv, + ..Default::default() + }, + } + } +} + /// A PyDataFrame is a representation of a logical plan and an API to compose statements. 
/// Use it to build a plan and `.collect()` to execute the plan and collect the result. /// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. @@ -613,61 +708,28 @@ impl PyDataFrame { } /// Write a `DataFrame` to a Parquet file. - #[pyo3(signature = ( - path, - compression="zstd", - compression_level=None - ))] fn write_parquet( &self, path: &str, - compression: &str, - compression_level: Option, + options: PyParquetWriterOptions, + column_specific_options: HashMap, py: Python, ) -> PyDataFusionResult<()> { - fn verify_compression_level(cl: Option) -> Result { - cl.ok_or(PyValueError::new_err("compression_level is not defined")) - } - - let _validated = match compression.to_lowercase().as_str() { - "snappy" => Compression::SNAPPY, - "gzip" => Compression::GZIP( - GzipLevel::try_new(compression_level.unwrap_or(6)) - .map_err(|e| PyValueError::new_err(format!("{e}")))?, - ), - "brotli" => Compression::BROTLI( - BrotliLevel::try_new(verify_compression_level(compression_level)?) - .map_err(|e| PyValueError::new_err(format!("{e}")))?, - ), - "zstd" => Compression::ZSTD( - ZstdLevel::try_new(verify_compression_level(compression_level)? as i32) - .map_err(|e| PyValueError::new_err(format!("{e}")))?, - ), - "lzo" => Compression::LZO, - "lz4" => Compression::LZ4, - "lz4_raw" => Compression::LZ4_RAW, - "uncompressed" => Compression::UNCOMPRESSED, - _ => { - return Err(PyDataFusionError::Common(format!( - "Unrecognized compression type {compression}" - ))); - } + let table_options = TableParquetOptions { + global: options.options, + column_specific_options: column_specific_options + .into_iter() + .map(|(k, v)| (k, v.options)) + .collect(), + ..Default::default() }; - let mut compression_string = compression.to_string(); - if let Some(level) = compression_level { - compression_string.push_str(&format!("({level})")); - } - - let mut options = TableParquetOptions::default(); - options.global.compression = Some(compression_string); - wait_for_future( py, self.df.as_ref().clone().write_parquet( path, DataFrameWriteOptions::new(), - Option::from(options), + Option::from(table_options), ), )?; Ok(()) diff --git a/src/lib.rs b/src/lib.rs index 6eeda0878..990231c66 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -85,6 +85,8 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; From 5d5b1baf3f7bd6681f4f3346c09d560e791db14f Mon Sep 17 00:00:00 2001 From: nuno-faria Date: Sat, 14 Jun 2025 20:03:33 +0100 Subject: [PATCH 2/5] Create dedicated write_parquet_options function --- python/datafusion/__init__.py | 4 +- python/datafusion/dataframe.py | 338 +++++++++++++++++++++++---------- python/tests/test_dataframe.py | 159 +++++++++++----- src/dataframe.rs | 62 ++++++ 4 files changed, 411 insertions(+), 152 deletions(-) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 273abbadb..fd7cd000a 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -31,7 +31,7 @@ from . import functions, object_store, substrait, unparser # The following imports are okay to remain as opaque to the user. 
-from ._internal import Config, ParquetWriterOptions +from ._internal import Config from .catalog import Catalog, Database, Table from .common import ( DFSchema, @@ -42,7 +42,7 @@ SessionContext, SQLOptions, ) -from .dataframe import DataFrame, ParquetColumnOptions +from .dataframe import DataFrame, ParquetColumnOptions, ParquetWriterOptions from .expr import ( Expr, WindowFrame, diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 96f939e70..3c8c09b38 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -28,6 +28,7 @@ Iterable, Literal, Optional, + Union, overload, ) @@ -50,16 +51,194 @@ from datafusion._internal import DataFrame as DataFrameInternal from datafusion._internal import expr as expr_internal +from enum import Enum + from datafusion._internal import ParquetColumnOptions as ParquetColumnOptionsInternal from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal from datafusion.expr import Expr, SortExpr, sort_or_default +# excerpt from deltalake +# https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163 +class Compression(Enum): + """Enum representing the available compression types for Parquet files.""" + + UNCOMPRESSED = "uncompressed" + SNAPPY = "snappy" + GZIP = "gzip" + BROTLI = "brotli" + LZ4 = "lz4" + # lzo is not implemented yet + # https://github.com/apache/arrow-rs/issues/6970 + # LZO = "lzo" + ZSTD = "zstd" + LZ4_RAW = "lz4_raw" + + @classmethod + def from_str(cls: type[Compression], value: str) -> Compression: + """Convert a string to a Compression enum value. + + Args: + value: The string representation of the compression type. + + Returns: + The Compression enum lowercase value. + + Raises: + ValueError: If the string does not match any Compression enum value. + """ + try: + return cls(value.lower()) + except ValueError as err: + valid_values = str([item.value for item in Compression]) + error_msg = f""" + {value} is not a valid Compression. + Valid values are: {valid_values} + """ + raise ValueError(error_msg) from err + + def get_default_level(self) -> Optional[int]: + """Get the default compression level for the compression type. + + Returns: + The default compression level for the compression type. + """ + # GZIP, BROTLI default values from deltalake repo + # https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163 + # ZSTD default value from delta-rs + # https://github.com/apache/datafusion-python/pull/981#discussion_r1904789223 + if self == Compression.GZIP: + return 6 + if self == Compression.BROTLI: + return 1 + if self == Compression.ZSTD: + return 4 + return None + + +class ParquetWriterOptions: + """Advanced parquet writer options. + + Allows settings the writer options that apply to the entire file. Some options can + also be set on a column by column basis, with the field `column_specific_options` + (see `ParquetColumnOptions`). + + Attributes: + data_pagesize_limit: Sets best effort maximum size of data page in bytes. + write_batch_size: Sets write_batch_size in bytes. + writer_version: Sets parquet writer version. Valid values are `1.0` and + `2.0`. + skip_arrow_metadata: Skip encoding the embedded arrow metadata in the + KV_meta. + compression: Compression type to use. Default is "zstd(3)". + Available compression types are + - "uncompressed": No compression. + - "snappy": Snappy compression. + - "gzip(n)": Gzip compression with level n. + - "brotli(n)": Brotli compression with level n. + - "lz4": LZ4 compression. 
+ - "lz4_raw": LZ4_RAW compression. + - "zstd(n)": Zstandard compression with level n. + dictionary_enabled: Sets if dictionary encoding is enabled. If None, uses + the default parquet writer setting. + dictionary_page_size_limit: Sets best effort maximum dictionary page size, + in bytes. + statistics_enabled: Sets if statistics are enabled for any column Valid + values are `none`, `chunk`, and `page`. If None, uses the default + parquet writer setting. + max_row_group_size: Target maximum number of rows in each row group + (defaults to 1M rows). Writing larger row groups requires more memory to + write, but can get better compression and be faster to read. + created_by: Sets "created by" property. + column_index_truncate_length: Sets column index truncate length. + statistics_truncate_length: Sets statistics truncate length. If None, uses + the default parquet writer setting. + data_page_row_count_limit: Sets best effort maximum number of rows in a data + page. + encoding: Sets default encoding for any column. Valid values are `plain`, + `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`, + `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and + `byte_stream_split`. If None, uses the default parquet writer setting. + bloom_filter_on_write: Write bloom filters for all columns when creating + parquet files. + bloom_filter_fpp: Sets bloom filter false positive probability. If None, + uses the default parquet writer setting + bloom_filter_ndv: Sets bloom filter number of distinct values. If None, uses + the default parquet writer setting. + allow_single_file_parallelism: Controls whether DataFusion will attempt to + speed up writing parquet files by serializing them in parallel. Each + column in each row group in each output file are serialized in parallel + leveraging a maximum possible core count of n_files * n_row_groups * + n_columns. + maximum_parallel_row_group_writers: By default parallel parquet writer is + tuned for minimum memory usage in a streaming execution plan. You may + see a performance benefit when writing large parquet files by increasing + `maximum_parallel_row_group_writers` and + `maximum_buffered_record_batches_per_stream` if your system has idle + cores and can tolerate additional memory usage. Boosting these values is + likely worthwhile when writing out already in-memory data, such as from + a cached data frame. + maximum_buffered_record_batches_per_stream: See + `maximum_parallel_row_group_writers`. + column_specific_options: Overrides options for specific columns. If a column + is not a part of this dictionary, it will use the parameters provided here. 
+ """ + + def __init__( + self, + data_pagesize_limit: int = 1024 * 1024, + write_batch_size: int = 1024, + writer_version: str = "1.0", + skip_arrow_metadata: bool = False, + compression: Optional[str] = "zstd(3)", + dictionary_enabled: Optional[bool] = True, + dictionary_page_size_limit: int = 1024 * 1024, + statistics_enabled: Optional[str] = "page", + max_row_group_size: int = 1024 * 1024, + created_by: str = "datafusion-python", + column_index_truncate_length: Optional[int] = 64, + statistics_truncate_length: Optional[int] = None, + data_page_row_count_limit: int = 20_000, + encoding: Optional[str] = None, + bloom_filter_on_write: bool = False, + bloom_filter_fpp: Optional[float] = None, + bloom_filter_ndv: Optional[int] = None, + allow_single_file_parallelism: bool = True, + maximum_parallel_row_group_writers: int = 1, + maximum_buffered_record_batches_per_stream: int = 2, + column_specific_options: Optional[dict[str, ParquetColumnOptions]] = None, + ) -> None: + """Initialize the ParquetWriterOptions.""" + self.data_pagesize_limit = data_pagesize_limit + self.write_batch_size = write_batch_size + self.writer_version = writer_version + self.skip_arrow_metadata = skip_arrow_metadata + self.compression = compression + self.dictionary_enabled = dictionary_enabled + self.dictionary_page_size_limit = dictionary_page_size_limit + self.statistics_enabled = statistics_enabled + self.max_row_group_size = max_row_group_size + self.created_by = created_by + self.column_index_truncate_length = column_index_truncate_length + self.statistics_truncate_length = statistics_truncate_length + self.data_page_row_count_limit = data_page_row_count_limit + self.encoding = encoding + self.bloom_filter_on_write = bloom_filter_on_write + self.bloom_filter_fpp = bloom_filter_fpp + self.bloom_filter_ndv = bloom_filter_ndv + self.allow_single_file_parallelism = allow_single_file_parallelism + self.maximum_parallel_row_group_writers = maximum_parallel_row_group_writers + self.maximum_buffered_record_batches_per_stream = ( + maximum_buffered_record_batches_per_stream + ) + self.column_specific_options = column_specific_options + + class ParquetColumnOptions: """Parquet options for individual columns. Contains the available options that can be applied for an individual Parquet column, - replacing the provided options in the `write_parquet`. + replacing the global options in `ParquetWriterOptions`. Attributes: encoding: Sets encoding for the column path. 
Valid values are: `plain`, @@ -694,120 +873,75 @@ def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None def write_parquet( self, path: str | pathlib.Path, - data_pagesize_limit: int = 1024 * 1024, - write_batch_size: int = 1024, - writer_version: str = "1.0", - skip_arrow_metadata: bool = False, - compression: Optional[str] = "zstd(3)", - dictionary_enabled: Optional[bool] = True, - dictionary_page_size_limit: int = 1024 * 1024, - statistics_enabled: Optional[str] = "page", - max_row_group_size: int = 1024 * 1024, - created_by: str = "datafusion-python", - column_index_truncate_length: Optional[int] = 64, - statistics_truncate_length: Optional[int] = None, - data_page_row_count_limit: int = 20_000, - encoding: Optional[str] = None, - bloom_filter_on_write: bool = False, - bloom_filter_fpp: Optional[float] = None, - bloom_filter_ndv: Optional[int] = None, - allow_single_file_parallelism: bool = True, - maximum_parallel_row_group_writers: int = 1, - maximum_buffered_record_batches_per_stream: int = 2, - column_specific_options: Optional[dict[str, ParquetColumnOptions]] = None, + compression: Union[str, Compression] = Compression.ZSTD, + compression_level: int | None = None, ) -> None: """Execute the :py:class:`DataFrame` and write the results to a Parquet file. Args: path: Path of the Parquet file to write. - data_pagesize_limit: Sets best effort maximum size of data page in bytes. - write_batch_size: Sets write_batch_size in bytes. - writer_version: Sets parquet writer version. Valid values are `1.0` and - `2.0`. - skip_arrow_metadata: Skip encoding the embedded arrow metadata in the - KV_meta. - compression: Compression type to use. Default is "zstd(3)". - Available compression types are + compression: Compression type to use. Default is "ZSTD". + Available compression types are: - "uncompressed": No compression. - "snappy": Snappy compression. - - "gzip(n)": Gzip compression with level n. - - "brotli(n)": Brotli compression with level n. + - "gzip": Gzip compression. + - "brotli": Brotli compression. - "lz4": LZ4 compression. - "lz4_raw": LZ4_RAW compression. - - "zstd(n)": Zstandard compression with level n. - dictionary_enabled: Sets if dictionary encoding is enabled. If None, uses - the default parquet writer setting. - dictionary_page_size_limit: Sets best effort maximum dictionary page size, - in bytes. - statistics_enabled: Sets if statistics are enabled for any column Valid - values are `none`, `chunk`, and `page`. If None, uses the default - parquet writer setting. - max_row_group_size: Target maximum number of rows in each row group - (defaults to 1M rows). Writing larger row groups requires more memory to - write, but can get better compression and be faster to read. - created_by: Sets "created by" property. - column_index_truncate_length: Sets column index truncate length. - statistics_truncate_length: Sets statistics truncate length. If None, uses - the default parquet writer setting. - data_page_row_count_limit: Sets best effort maximum number of rows in a data - page. - encoding: Sets default encoding for any column. Valid values are `plain`, - `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`, - `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and - `byte_stream_split`. If None, uses the default parquet writer setting. - bloom_filter_on_write: Write bloom filters for all columns when creating - parquet files. - bloom_filter_fpp: Sets bloom filter false positive probability. 
If None, - uses the default parquet writer setting - bloom_filter_ndv: Sets bloom filter number of distinct values. If None, uses - the default parquet writer setting. - allow_single_file_parallelism: Controls whether DataFusion will attempt to - speed up writing parquet files by serializing them in parallel. Each - column in each row group in each output file are serialized in parallel - leveraging a maximum possible core count of n_files * n_row_groups * - n_columns. - maximum_parallel_row_group_writers: By default parallel parquet writer is - tuned for minimum memory usage in a streaming execution plan. You may - see a performance benefit when writing large parquet files by increasing - `maximum_parallel_row_group_writers` and - `maximum_buffered_record_batches_per_stream` if your system has idle - cores and can tolerate additional memory usage. Boosting these values is - likely worthwhile when writing out already in-memory data, such as from - a cached data frame. - maximum_buffered_record_batches_per_stream: See - `maximum_parallel_row_group_writers`. - column_specific_options: Overrides options for specific columns. If a column - is not a part of this dictionary, it will use the parameters provided in - the `write_parquet`. + - "zstd": Zstandard compression. + Note: LZO is not yet implemented in arrow-rs and is therefore excluded. + compression_level: Compression level to use. For ZSTD, the + recommended range is 1 to 22, with the default being 4. Higher levels + provide better compression but slower speed. + """ + # Convert string to Compression enum if necessary + if isinstance(compression, str): + compression = Compression.from_str(compression) + + if ( + compression in {Compression.GZIP, Compression.BROTLI, Compression.ZSTD} + and compression_level is None + ): + compression_level = compression.get_default_level() + + self.df.write_parquet(str(path), compression.value, compression_level) + + def write_parquet_options( + self, path: str | pathlib.Path, options: ParquetWriterOptions + ) -> None: + """Execute the :py:class:`DataFrame` and write the results to a Parquet file. + + Allows advanced writer options to be set with `ParquetWriterOptions`. + + Args: + path: Path of the Parquet file to write. + options: Sets the writer parquet options (see `ParquetWriterOptions`). 
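+
+        Example:
+            A minimal illustrative call (the path and option values are
+            arbitrary)::
+
+                df.write_parquet_options(
+                    "out/",
+                    ParquetWriterOptions(compression="snappy", created_by="my-app"),
+                )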
""" options_internal = ParquetWriterOptionsInternal( - data_pagesize_limit, - write_batch_size, - writer_version, - skip_arrow_metadata, - compression, - dictionary_enabled, - dictionary_page_size_limit, - statistics_enabled, - max_row_group_size, - created_by, - column_index_truncate_length, - statistics_truncate_length, - data_page_row_count_limit, - encoding, - bloom_filter_on_write, - bloom_filter_fpp, - bloom_filter_ndv, - allow_single_file_parallelism, - maximum_parallel_row_group_writers, - maximum_buffered_record_batches_per_stream, + options.data_pagesize_limit, + options.write_batch_size, + options.writer_version, + options.skip_arrow_metadata, + options.compression, + options.dictionary_enabled, + options.dictionary_page_size_limit, + options.statistics_enabled, + options.max_row_group_size, + options.created_by, + options.column_index_truncate_length, + options.statistics_truncate_length, + options.data_page_row_count_limit, + options.encoding, + options.bloom_filter_on_write, + options.bloom_filter_fpp, + options.bloom_filter_ndv, + options.allow_single_file_parallelism, + options.maximum_parallel_row_group_writers, + options.maximum_buffered_record_batches_per_stream, ) - if column_specific_options is None: - column_specific_options = {} - column_specific_options_internal = {} - for column, opts in column_specific_options.items(): + for column, opts in (options.column_specific_options or {}).items(): column_specific_options_internal[column] = ParquetColumnOptionsInternal( bloom_filter_enabled=opts.bloom_filter_enabled, encoding=opts.encoding, @@ -818,7 +952,7 @@ def write_parquet( bloom_filter_ndv=opts.bloom_filter_ndv, ) - self.df.write_parquet( + self.df.write_parquet_options( str(path), options_internal, column_specific_options_internal, diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index e1e29c45c..8ad62f79f 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -23,6 +23,7 @@ import pytest from datafusion import ( DataFrame, + ParquetWriterOptions, ParquetColumnOptions, SessionContext, WindowFrame, @@ -1549,7 +1550,70 @@ def test_write_parquet(df, tmp_path, path_to_str): assert result == expected -def test_write_parquet_default_compression(df, tmp_path): +@pytest.mark.parametrize( + ("compression", "compression_level"), + [("gzip", 6), ("brotli", 7), ("zstd", 15)], +) +def test_write_compressed_parquet(df, tmp_path, compression, compression_level): + path = tmp_path + + df.write_parquet( + str(path), compression=compression, compression_level=compression_level + ) + + # test that the actual compression scheme is the one written + for _root, _dirs, files in os.walk(path): + for file in files: + if file.endswith(".parquet"): + metadata = pq.ParquetFile(tmp_path / file).metadata.to_dict() + for row_group in metadata["row_groups"]: + for columns in row_group["columns"]: + assert columns["compression"].lower() == compression + + result = pq.read_table(str(path)).to_pydict() + expected = df.to_pydict() + + assert result == expected + + +@pytest.mark.parametrize( + ("compression", "compression_level"), + [("gzip", 12), ("brotli", 15), ("zstd", 23), ("wrong", 12)], +) +def test_write_compressed_parquet_wrong_compression_level( + df, tmp_path, compression, compression_level +): + path = tmp_path + + with pytest.raises(ValueError): + df.write_parquet( + str(path), + compression=compression, + compression_level=compression_level, + ) + + +@pytest.mark.parametrize("compression", ["wrong"]) +def 
test_write_compressed_parquet_invalid_compression(df, tmp_path, compression): + path = tmp_path + + with pytest.raises(ValueError): + df.write_parquet(str(path), compression=compression) + + +# not testing lzo because it it not implemented yet +# https://github.com/apache/arrow-rs/issues/6970 +@pytest.mark.parametrize("compression", ["zstd", "brotli", "gzip"]) +def test_write_compressed_parquet_default_compression_level(df, tmp_path, compression): + # Test write_parquet with zstd, brotli, gzip default compression level, + # ie don't specify compression level + # should complete without error + path = tmp_path + + df.write_parquet(str(path), compression=compression) + + +def test_write_parquet_options_default_compression(df, tmp_path): """Test that the default compression is ZSTD.""" df.write_parquet(tmp_path) @@ -1564,11 +1628,11 @@ def test_write_parquet_default_compression(df, tmp_path): "compression", ["gzip(6)", "brotli(7)", "zstd(15)", "snappy", "uncompressed"], ) -def test_write_compressed_parquet(df, tmp_path, compression): +def test_write_parquet_options_compression(df, tmp_path, compression): import re path = tmp_path - df.write_parquet(str(path), compression=compression) + df.write_parquet_options(str(path), ParquetWriterOptions(compression=compression)) # test that the actual compression scheme is the one written for _root, _dirs, files in os.walk(path): @@ -1591,32 +1655,32 @@ def test_write_compressed_parquet(df, tmp_path, compression): "compression", ["gzip(12)", "brotli(15)", "zstd(23)"], ) -def test_write_compressed_parquet_wrong_compression_level(df, tmp_path, compression): +def test_write_parquet_options_wrong_compression_level(df, tmp_path, compression): path = tmp_path with pytest.raises(Exception, match=r"valid compression range .*? exceeded."): - df.write_parquet(str(path), compression=compression) + df.write_parquet_options(str(path), ParquetWriterOptions(compression=compression)) @pytest.mark.parametrize("compression", ["wrong", "wrong(12)"]) -def test_write_compressed_parquet_invalid_compression(df, tmp_path, compression): +def test_write_parquet_options_invalid_compression(df, tmp_path, compression): path = tmp_path with pytest.raises(Exception, match="Unknown or unsupported parquet compression"): - df.write_parquet(str(path), compression=compression) + df.write_parquet_options(str(path), ParquetWriterOptions(compression=compression)) @pytest.mark.parametrize( ("writer_version", "format_version"), [("1.0", "1.0"), ("2.0", "2.6"), (None, "1.0")], ) -def test_write_parquet_writer_version(df, tmp_path, writer_version, format_version): +def test_write_parquet_options_writer_version(df, tmp_path, writer_version, format_version): """Test the Parquet writer version. 
Note that writer_version=2.0 results in format_version=2.6""" if writer_version is None: - df.write_parquet(tmp_path) + df.write_parquet_options(tmp_path, ParquetWriterOptions()) else: - df.write_parquet(tmp_path, writer_version=writer_version) + df.write_parquet_options(tmp_path, ParquetWriterOptions(writer_version=writer_version)) for file in tmp_path.rglob("*.parquet"): parquet = pq.ParquetFile(file) @@ -1625,18 +1689,18 @@ def test_write_parquet_writer_version(df, tmp_path, writer_version, format_versi @pytest.mark.parametrize("writer_version", ["1.2.3", "custom-version", "0"]) -def test_write_parquet_wrong_writer_version(df, tmp_path, writer_version): +def test_write_parquet_options_wrong_writer_version(df, tmp_path, writer_version): """Test that invalid writer versions in Parquet throw an exception.""" with pytest.raises( Exception, match="Unknown or unsupported parquet writer version" ): - df.write_parquet(tmp_path, writer_version=writer_version) + df.write_parquet_options(tmp_path, ParquetWriterOptions(writer_version=writer_version)) @pytest.mark.parametrize("dictionary_enabled", [True, False, None]) -def test_write_parquet_dictionary_enabled(df, tmp_path, dictionary_enabled): +def test_write_parquet_options_dictionary_enabled(df, tmp_path, dictionary_enabled): """Test enabling/disabling the dictionaries in Parquet.""" - df.write_parquet(tmp_path, dictionary_enabled=dictionary_enabled) + df.write_parquet_options(tmp_path, ParquetWriterOptions(dictionary_enabled=dictionary_enabled)) # by default, the dictionary is enabled, so None results in True result = dictionary_enabled if dictionary_enabled is not None else True @@ -1653,12 +1717,12 @@ def test_write_parquet_dictionary_enabled(df, tmp_path, dictionary_enabled): ("statistics_enabled", "has_statistics"), [("page", True), ("chunk", True), ("none", False), (None, True)], ) -def test_write_parquet_statistics_enabled( +def test_write_parquet_options_statistics_enabled( df, tmp_path, statistics_enabled, has_statistics ): """Test configuring the statistics in Parquet. In pyarrow we can only check for column-level statistics, so "page" and "chunk" are tested in the same way.""" - df.write_parquet(tmp_path, statistics_enabled=statistics_enabled) + df.write_parquet_options(tmp_path, ParquetWriterOptions(statistics_enabled=statistics_enabled)) for file in tmp_path.rglob("*.parquet"): parquet = pq.ParquetFile(file) @@ -1673,11 +1737,11 @@ def test_write_parquet_statistics_enabled( @pytest.mark.parametrize("max_row_group_size", [1000, 5000, 10000, 100000]) -def test_write_parquet_max_row_group_size(large_df, tmp_path, max_row_group_size): +def test_write_parquet_options_max_row_group_size(large_df, tmp_path, max_row_group_size): """Test configuring the max number of rows per group in Parquet. 
These test cases guarantee that the number of rows for each row group is max_row_group_size, given the total number of rows is a multiple of max_row_group_size.""" - large_df.write_parquet(tmp_path, max_row_group_size=max_row_group_size) + large_df.write_parquet_options(tmp_path, ParquetWriterOptions(max_row_group_size=max_row_group_size)) for file in tmp_path.rglob("*.parquet"): parquet = pq.ParquetFile(file) @@ -1687,9 +1751,9 @@ def test_write_parquet_max_row_group_size(large_df, tmp_path, max_row_group_size @pytest.mark.parametrize("created_by", ["datafusion", "datafusion-python", "custom"]) -def test_write_parquet_created_by(df, tmp_path, created_by): +def test_write_parquet_options_created_by(df, tmp_path, created_by): """Test configuring the created by metadata in Parquet.""" - df.write_parquet(tmp_path, created_by=created_by) + df.write_parquet_options(tmp_path, ParquetWriterOptions(created_by=created_by)) for file in tmp_path.rglob("*.parquet"): parquet = pq.ParquetFile(file) @@ -1698,7 +1762,7 @@ def test_write_parquet_created_by(df, tmp_path, created_by): @pytest.mark.parametrize("statistics_truncate_length", [5, 25, 50]) -def test_write_parquet_statistics_truncate_length( +def test_write_parquet_options_statistics_truncate_length( df, tmp_path, statistics_truncate_length ): """Test configuring the truncate limit in Parquet's row-group-level statistics.""" @@ -1712,7 +1776,7 @@ def test_write_parquet_statistics_truncate_length( "b": ["a_smaller", "m_smaller", "z_smaller"], } df = ctx.from_arrow(pa.record_batch(data)) - df.write_parquet(tmp_path, statistics_truncate_length=statistics_truncate_length) + df.write_parquet_options(tmp_path, ParquetWriterOptions(statistics_truncate_length=statistics_truncate_length)) for file in tmp_path.rglob("*.parquet"): parquet = pq.ParquetFile(file) @@ -1725,7 +1789,7 @@ def test_write_parquet_statistics_truncate_length( assert len(statistics["max"]) <= statistics_truncate_length -def test_write_parquet_default_encoding(tmp_path): +def test_write_parquet_options_default_encoding(tmp_path): """Test that, by default, Parquet files are written with dictionary encoding. 
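    The encodings actually written can be read back with pyarrow, e.g.
    pq.ParquetFile(file).metadata.row_group(0).column(0).encodings.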
Note that dictionary encoding is not used for boolean values, so it is not tested here."""
 
@@ -1736,7 +1800,7 @@ def test_write_parquet_default_encoding(tmp_path):
         "c": [1.01, 2.02, 3.03],
     }
     df = ctx.from_arrow(pa.record_batch(data))
-    df.write_parquet(tmp_path)
+    df.write_parquet_options(tmp_path, ParquetWriterOptions())
 
     for file in tmp_path.rglob("*.parquet"):
         parquet = pq.ParquetFile(file)
@@ -1758,24 +1822,23 @@ def test_write_parquet_default_encoding(tmp_path):
         ("byte_stream_split", ["int", "float"], ("RLE", "BYTE_STREAM_SPLIT")),
     ],
 )
-def test_write_parquet_encoding(tmp_path, encoding, data_types, result):
+def test_write_parquet_options_encoding(tmp_path, encoding, data_types, result):
     """Test different encodings in Parquet in their respective supported column types."""
     ctx = SessionContext()
 
     data = {}
     for data_type in data_types:
-        match data_type:
-            case "int":
-                data["int"] = [1, 2, 3]
-            case "float":
-                data["float"] = [1.01, 2.02, 3.03]
-            case "str":
-                data["str"] = ["a", "b", "c"]
-            case "bool":
-                data["bool"] = [True, False, True]
+        if data_type == "int":
+            data["int"] = [1, 2, 3]
+        elif data_type == "float":
+            data["float"] = [1.01, 2.02, 3.03]
+        elif data_type == "str":
+            data["str"] = ["a", "b", "c"]
+        elif data_type == "bool":
+            data["bool"] = [True, False, True]
 
     df = ctx.from_arrow(pa.record_batch(data))
-    df.write_parquet(tmp_path, encoding=encoding, dictionary_enabled=False)
+    df.write_parquet_options(tmp_path, ParquetWriterOptions(encoding=encoding, dictionary_enabled=False))
 
     for file in tmp_path.rglob("*.parquet"):
         parquet = pq.ParquetFile(file)
@@ -1787,39 +1850,39 @@ def test_write_parquet_encoding(tmp_path, encoding, data_types, result):
 
 @pytest.mark.parametrize("encoding", ["bit_packed"])
-def test_write_parquet_unsupported_encoding(df, tmp_path, encoding):
+def test_write_parquet_options_unsupported_encoding(df, tmp_path, encoding):
     """Test that unsupported Parquet encodings do not work."""
     # BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519
     with pytest.raises(BaseException, match="Encoding .*? is not supported"):
-        df.write_parquet(tmp_path, encoding=encoding)
+        df.write_parquet_options(tmp_path, ParquetWriterOptions(encoding=encoding))
 
 
 @pytest.mark.parametrize("encoding", ["non_existent", "unknown", "plain123"])
-def test_write_parquet_invalid_encoding(df, tmp_path, encoding):
+def test_write_parquet_options_invalid_encoding(df, tmp_path, encoding):
     """Test that invalid Parquet encodings do not work."""
     with pytest.raises(Exception, match="Unknown or unsupported parquet encoding"):
-        df.write_parquet(tmp_path, encoding=encoding)
+        df.write_parquet_options(tmp_path, ParquetWriterOptions(encoding=encoding))
 
 
 @pytest.mark.parametrize("encoding", ["plain_dictionary", "rle_dictionary"])
-def test_write_parquet_dictionary_encoding_fallback(df, tmp_path, encoding):
+def test_write_parquet_options_dictionary_encoding_fallback(df, tmp_path, encoding):
     """Test that the dictionary encoding cannot be used as fallback in Parquet."""
     # BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519
     with pytest.raises(
         BaseException, match="Dictionary encoding can not be used as fallback encoding"
     ):
-        df.write_parquet(tmp_path, encoding=encoding)
+        df.write_parquet_options(tmp_path, ParquetWriterOptions(encoding=encoding))
 
 
-def test_write_parquet_bloom_filter(df, tmp_path):
+def test_write_parquet_options_bloom_filter(df, tmp_path):
     """Test Parquet files with and without (default) bloom filters.
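    Bloom filters are requested here via
    ParquetWriterOptions(bloom_filter_on_write=True); the bloom_filter_fpp and
    bloom_filter_ndv options could tune them further (not exercised in this test).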
Since pyarrow does not expose any information about bloom filters, the easiest way to confirm that they are actually written is to compare the file size."""
     path_no_bloom_filter = tmp_path / "1"
     path_bloom_filter = tmp_path / "2"
 
-    df.write_parquet(path_no_bloom_filter)
-    df.write_parquet(path_bloom_filter, bloom_filter_on_write=True)
+    df.write_parquet_options(path_no_bloom_filter, ParquetWriterOptions())
+    df.write_parquet_options(path_bloom_filter, ParquetWriterOptions(bloom_filter_on_write=True))
 
     size_no_bloom_filter = 0
     for file in path_no_bloom_filter.rglob("*.parquet"):
@@ -1832,7 +1895,7 @@ def test_write_parquet_bloom_filter(df, tmp_path):
 
     assert size_no_bloom_filter < size_bloom_filter
 
 
-def test_write_parquet_column_options(df, tmp_path):
+def test_write_parquet_options_column_options(df, tmp_path):
     """Test writing Parquet files with different options for each column, which replace
     the global configs (when provided)."""
     data = {
@@ -1888,10 +1951,10 @@ def test_write_parquet_column_options(df, tmp_path):
     ctx = SessionContext()
     df = ctx.from_arrow(pa.record_batch(data))
 
-    df.write_parquet(
+    df.write_parquet_options(
         tmp_path,
-        compression="brotli(8)",
-        column_specific_options=column_specific_options,
+        ParquetWriterOptions(compression="brotli(8)",
+        column_specific_options=column_specific_options),
     )
 
     for file in tmp_path.rglob("*.parquet"):
diff --git a/src/dataframe.rs b/src/dataframe.rs
index ffb3f36cf..eca039c44 100644
--- a/src/dataframe.rs
+++ b/src/dataframe.rs
@@ -33,6 +33,7 @@ use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
 use datafusion::datasource::TableProvider;
 use datafusion::error::DataFusionError;
 use datafusion::execution::SendableRecordBatchStream;
+use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel};
 use datafusion::prelude::*;
 use futures::{StreamExt, TryStreamExt};
 use pyo3::exceptions::PyValueError;
@@ -708,7 +709,68 @@ impl PyDataFrame {
     }
 
     /// Write a `DataFrame` to a Parquet file.
+    #[pyo3(signature = (
+        path,
+        compression="zstd",
+        compression_level=None
+        ))]
     fn write_parquet(
+        &self,
+        path: &str,
+        compression: &str,
+        compression_level: Option<u32>,
+        py: Python,
+    ) -> PyDataFusionResult<()> {
+        fn verify_compression_level(cl: Option<u32>) -> Result<u32, PyErr> {
+            cl.ok_or(PyValueError::new_err("compression_level is not defined"))
+        }
+
+        let _validated = match compression.to_lowercase().as_str() {
+            "snappy" => Compression::SNAPPY,
+            "gzip" => Compression::GZIP(
+                GzipLevel::try_new(compression_level.unwrap_or(6))
+                    .map_err(|e| PyValueError::new_err(format!("{e}")))?,
+            ),
+            "brotli" => Compression::BROTLI(
+                BrotliLevel::try_new(verify_compression_level(compression_level)?)
+                    .map_err(|e| PyValueError::new_err(format!("{e}")))?,
+            ),
+            "zstd" => Compression::ZSTD(
+                ZstdLevel::try_new(verify_compression_level(compression_level)?
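/* ZstdLevel::try_new takes an i32 level (zstd also defines negative levels), unlike the u32-based gzip and brotli levels above, hence the cast that follows */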
as i32) + .map_err(|e| PyValueError::new_err(format!("{e}")))?, + ), + "lzo" => Compression::LZO, + "lz4" => Compression::LZ4, + "lz4_raw" => Compression::LZ4_RAW, + "uncompressed" => Compression::UNCOMPRESSED, + _ => { + return Err(PyDataFusionError::Common(format!( + "Unrecognized compression type {compression}" + ))); + } + }; + + let mut compression_string = compression.to_string(); + if let Some(level) = compression_level { + compression_string.push_str(&format!("({level})")); + } + + let mut options = TableParquetOptions::default(); + options.global.compression = Some(compression_string); + + wait_for_future( + py, + self.df.as_ref().clone().write_parquet( + path, + DataFrameWriteOptions::new(), + Option::from(options), + ), + )?; + Ok(()) + } + + /// Write a `DataFrame` to a Parquet file, using advanced options. + fn write_parquet_options( &self, path: &str, options: PyParquetWriterOptions, From b738b19a18c2485cd0ad7de3c205cf6b98788396 Mon Sep 17 00:00:00 2001 From: nuno-faria Date: Fri, 20 Jun 2025 08:58:58 +0100 Subject: [PATCH 3/5] Rename write_parquet_options to write_parquet_with_options --- python/datafusion/dataframe.py | 4 +- python/tests/test_dataframe.py | 74 +++++++++++++++++----------------- src/dataframe.rs | 2 +- 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 3c8c09b38..014331541 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -906,7 +906,7 @@ def write_parquet( self.df.write_parquet(str(path), compression.value, compression_level) - def write_parquet_options( + def write_parquet_with_options( self, path: str | pathlib.Path, options: ParquetWriterOptions ) -> None: """Execute the :py:class:`DataFrame` and write the results to a Parquet file. 
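    An illustrative call (hypothetical output path and column name "a"; any
    DataFrame produced by a SessionContext works)::

        from datafusion import ParquetColumnOptions, ParquetWriterOptions

        df.write_parquet_with_options(
            "out_tuned",
            ParquetWriterOptions(
                compression="zstd(3)",
                writer_version="2.0",
                max_row_group_size=100_000,
                bloom_filter_on_write=True,
                # per-column overrides replace the global settings where provided
                column_specific_options={
                    "a": ParquetColumnOptions(compression="snappy")
                },
            ),
        )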
@@ -952,7 +952,7 @@ def write_parquet_options( bloom_filter_ndv=opts.bloom_filter_ndv, ) - self.df.write_parquet_options( + self.df.write_parquet_with_options( str(path), options_internal, column_specific_options_internal, diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 8ad62f79f..daa4331df 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1613,7 +1613,7 @@ def test_write_compressed_parquet_default_compression_level(df, tmp_path, compre df.write_parquet(str(path), compression=compression) -def test_write_parquet_options_default_compression(df, tmp_path): +def test_write_parquet_with_options_default_compression(df, tmp_path): """Test that the default compression is ZSTD.""" df.write_parquet(tmp_path) @@ -1628,11 +1628,11 @@ def test_write_parquet_options_default_compression(df, tmp_path): "compression", ["gzip(6)", "brotli(7)", "zstd(15)", "snappy", "uncompressed"], ) -def test_write_parquet_options_compression(df, tmp_path, compression): +def test_write_parquet_with_options_compression(df, tmp_path, compression): import re path = tmp_path - df.write_parquet_options(str(path), ParquetWriterOptions(compression=compression)) + df.write_parquet_with_options(str(path), ParquetWriterOptions(compression=compression)) # test that the actual compression scheme is the one written for _root, _dirs, files in os.walk(path): @@ -1655,32 +1655,32 @@ def test_write_parquet_options_compression(df, tmp_path, compression): "compression", ["gzip(12)", "brotli(15)", "zstd(23)"], ) -def test_write_parquet_options_wrong_compression_level(df, tmp_path, compression): +def test_write_parquet_with_options_wrong_compression_level(df, tmp_path, compression): path = tmp_path with pytest.raises(Exception, match=r"valid compression range .*? exceeded."): - df.write_parquet_options(str(path), ParquetWriterOptions(compression=compression)) + df.write_parquet_with_options(str(path), ParquetWriterOptions(compression=compression)) @pytest.mark.parametrize("compression", ["wrong", "wrong(12)"]) -def test_write_parquet_options_invalid_compression(df, tmp_path, compression): +def test_write_parquet_with_options_invalid_compression(df, tmp_path, compression): path = tmp_path with pytest.raises(Exception, match="Unknown or unsupported parquet compression"): - df.write_parquet_options(str(path), ParquetWriterOptions(compression=compression)) + df.write_parquet_with_options(str(path), ParquetWriterOptions(compression=compression)) @pytest.mark.parametrize( ("writer_version", "format_version"), [("1.0", "1.0"), ("2.0", "2.6"), (None, "1.0")], ) -def test_write_parquet_options_writer_version(df, tmp_path, writer_version, format_version): +def test_write_parquet_with_options_writer_version(df, tmp_path, writer_version, format_version): """Test the Parquet writer version. 
Note that writer_version=2.0 results in format_version=2.6""" if writer_version is None: - df.write_parquet_options(tmp_path, ParquetWriterOptions()) + df.write_parquet_with_options(tmp_path, ParquetWriterOptions()) else: - df.write_parquet_options(tmp_path, ParquetWriterOptions(writer_version=writer_version)) + df.write_parquet_with_options(tmp_path, ParquetWriterOptions(writer_version=writer_version)) for file in tmp_path.rglob("*.parquet"): parquet = pq.ParquetFile(file) @@ -1689,18 +1689,18 @@ def test_write_parquet_options_writer_version(df, tmp_path, writer_version, form @pytest.mark.parametrize("writer_version", ["1.2.3", "custom-version", "0"]) -def test_write_parquet_options_wrong_writer_version(df, tmp_path, writer_version): +def test_write_parquet_with_options_wrong_writer_version(df, tmp_path, writer_version): """Test that invalid writer versions in Parquet throw an exception.""" with pytest.raises( Exception, match="Unknown or unsupported parquet writer version" ): - df.write_parquet_options(tmp_path, ParquetWriterOptions(writer_version=writer_version)) + df.write_parquet_with_options(tmp_path, ParquetWriterOptions(writer_version=writer_version)) @pytest.mark.parametrize("dictionary_enabled", [True, False, None]) -def test_write_parquet_options_dictionary_enabled(df, tmp_path, dictionary_enabled): +def test_write_parquet_with_options_dictionary_enabled(df, tmp_path, dictionary_enabled): """Test enabling/disabling the dictionaries in Parquet.""" - df.write_parquet_options(tmp_path, ParquetWriterOptions(dictionary_enabled=dictionary_enabled)) + df.write_parquet_with_options(tmp_path, ParquetWriterOptions(dictionary_enabled=dictionary_enabled)) # by default, the dictionary is enabled, so None results in True result = dictionary_enabled if dictionary_enabled is not None else True @@ -1717,12 +1717,12 @@ def test_write_parquet_options_dictionary_enabled(df, tmp_path, dictionary_enabl ("statistics_enabled", "has_statistics"), [("page", True), ("chunk", True), ("none", False), (None, True)], ) -def test_write_parquet_options_statistics_enabled( +def test_write_parquet_with_options_statistics_enabled( df, tmp_path, statistics_enabled, has_statistics ): """Test configuring the statistics in Parquet. In pyarrow we can only check for column-level statistics, so "page" and "chunk" are tested in the same way.""" - df.write_parquet_options(tmp_path, ParquetWriterOptions(statistics_enabled=statistics_enabled)) + df.write_parquet_with_options(tmp_path, ParquetWriterOptions(statistics_enabled=statistics_enabled)) for file in tmp_path.rglob("*.parquet"): parquet = pq.ParquetFile(file) @@ -1737,11 +1737,11 @@ def test_write_parquet_options_statistics_enabled( @pytest.mark.parametrize("max_row_group_size", [1000, 5000, 10000, 100000]) -def test_write_parquet_options_max_row_group_size(large_df, tmp_path, max_row_group_size): +def test_write_parquet_with_options_max_row_group_size(large_df, tmp_path, max_row_group_size): """Test configuring the max number of rows per group in Parquet. 
These test cases guarantee that the number of rows for each row group is max_row_group_size, given the total number of rows is a multiple of max_row_group_size.""" - large_df.write_parquet_options(tmp_path, ParquetWriterOptions(max_row_group_size=max_row_group_size)) + large_df.write_parquet_with_options(tmp_path, ParquetWriterOptions(max_row_group_size=max_row_group_size)) for file in tmp_path.rglob("*.parquet"): parquet = pq.ParquetFile(file) @@ -1751,9 +1751,9 @@ def test_write_parquet_options_max_row_group_size(large_df, tmp_path, max_row_gr @pytest.mark.parametrize("created_by", ["datafusion", "datafusion-python", "custom"]) -def test_write_parquet_options_created_by(df, tmp_path, created_by): +def test_write_parquet_with_options_created_by(df, tmp_path, created_by): """Test configuring the created by metadata in Parquet.""" - df.write_parquet_options(tmp_path, ParquetWriterOptions(created_by=created_by)) + df.write_parquet_with_options(tmp_path, ParquetWriterOptions(created_by=created_by)) for file in tmp_path.rglob("*.parquet"): parquet = pq.ParquetFile(file) @@ -1762,7 +1762,7 @@ def test_write_parquet_options_created_by(df, tmp_path, created_by): @pytest.mark.parametrize("statistics_truncate_length", [5, 25, 50]) -def test_write_parquet_options_statistics_truncate_length( +def test_write_parquet_with_options_statistics_truncate_length( df, tmp_path, statistics_truncate_length ): """Test configuring the truncate limit in Parquet's row-group-level statistics.""" @@ -1776,7 +1776,7 @@ def test_write_parquet_options_statistics_truncate_length( "b": ["a_smaller", "m_smaller", "z_smaller"], } df = ctx.from_arrow(pa.record_batch(data)) - df.write_parquet_options(tmp_path, ParquetWriterOptions(statistics_truncate_length=statistics_truncate_length)) + df.write_parquet_with_options(tmp_path, ParquetWriterOptions(statistics_truncate_length=statistics_truncate_length)) for file in tmp_path.rglob("*.parquet"): parquet = pq.ParquetFile(file) @@ -1789,7 +1789,7 @@ def test_write_parquet_options_statistics_truncate_length( assert len(statistics["max"]) <= statistics_truncate_length -def test_write_parquet_options_default_encoding(tmp_path): +def test_write_parquet_with_options_default_encoding(tmp_path): """Test that, by default, Parquet files are written with dictionary encoding. 
Note that dictionary encoding is not used for boolean values, so it is not tested here."""
 
@@ -1800,7 +1800,7 @@ def test_write_parquet_options_default_encoding(tmp_path):
         "c": [1.01, 2.02, 3.03],
     }
     df = ctx.from_arrow(pa.record_batch(data))
-    df.write_parquet_options(tmp_path, ParquetWriterOptions())
+    df.write_parquet_with_options(tmp_path, ParquetWriterOptions())
 
     for file in tmp_path.rglob("*.parquet"):
         parquet = pq.ParquetFile(file)
@@ -1822,7 +1822,7 @@
         ("byte_stream_split", ["int", "float"], ("RLE", "BYTE_STREAM_SPLIT")),
     ],
 )
-def test_write_parquet_options_encoding(tmp_path, encoding, data_types, result):
+def test_write_parquet_with_options_encoding(tmp_path, encoding, data_types, result):
     """Test different encodings in Parquet in their respective supported column types."""
     ctx = SessionContext()
 
@@ -1838,7 +1838,7 @@ def test_write_parquet_options_encoding(tmp_path, encoding, data_types, result):
         data["bool"] = [True, False, True]
 
     df = ctx.from_arrow(pa.record_batch(data))
-    df.write_parquet_options(tmp_path, ParquetWriterOptions(encoding=encoding, dictionary_enabled=False))
+    df.write_parquet_with_options(tmp_path, ParquetWriterOptions(encoding=encoding, dictionary_enabled=False))
 
     for file in tmp_path.rglob("*.parquet"):
         parquet = pq.ParquetFile(file)
@@ -1850,39 +1850,39 @@ def test_write_parquet_options_encoding(tmp_path, encoding, data_types, result):
 
 @pytest.mark.parametrize("encoding", ["bit_packed"])
-def test_write_parquet_options_unsupported_encoding(df, tmp_path, encoding):
+def test_write_parquet_with_options_unsupported_encoding(df, tmp_path, encoding):
     """Test that unsupported Parquet encodings do not work."""
     # BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519
     with pytest.raises(BaseException, match="Encoding .*? is not supported"):
-        df.write_parquet_options(tmp_path, ParquetWriterOptions(encoding=encoding))
+        df.write_parquet_with_options(tmp_path, ParquetWriterOptions(encoding=encoding))
 
 
 @pytest.mark.parametrize("encoding", ["non_existent", "unknown", "plain123"])
-def test_write_parquet_options_invalid_encoding(df, tmp_path, encoding):
+def test_write_parquet_with_options_invalid_encoding(df, tmp_path, encoding):
     """Test that invalid Parquet encodings do not work."""
     with pytest.raises(Exception, match="Unknown or unsupported parquet encoding"):
-        df.write_parquet_options(tmp_path, ParquetWriterOptions(encoding=encoding))
+        df.write_parquet_with_options(tmp_path, ParquetWriterOptions(encoding=encoding))
 
 
 @pytest.mark.parametrize("encoding", ["plain_dictionary", "rle_dictionary"])
-def test_write_parquet_options_dictionary_encoding_fallback(df, tmp_path, encoding):
+def test_write_parquet_with_options_dictionary_encoding_fallback(df, tmp_path, encoding):
     """Test that the dictionary encoding cannot be used as fallback in Parquet."""
     # BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519
     with pytest.raises(
         BaseException, match="Dictionary encoding can not be used as fallback encoding"
     ):
-        df.write_parquet_options(tmp_path, ParquetWriterOptions(encoding=encoding))
+        df.write_parquet_with_options(tmp_path, ParquetWriterOptions(encoding=encoding))
 
 
-def test_write_parquet_options_bloom_filter(df, tmp_path):
+def test_write_parquet_with_options_bloom_filter(df, tmp_path):
     """Test Parquet files with and without (default) bloom filters.
Since pyarrow does not expose any information about bloom filters, the easiest way to confirm that they are actually written is to compare the file size.""" path_no_bloom_filter = tmp_path / "1" path_bloom_filter = tmp_path / "2" - df.write_parquet_options(path_no_bloom_filter, ParquetWriterOptions()) - df.write_parquet_options(path_bloom_filter, ParquetWriterOptions(bloom_filter_on_write=True)) + df.write_parquet_with_options(path_no_bloom_filter, ParquetWriterOptions()) + df.write_parquet_with_options(path_bloom_filter, ParquetWriterOptions(bloom_filter_on_write=True)) size_no_bloom_filter = 0 for file in path_no_bloom_filter.rglob("*.parquet"): @@ -1895,7 +1895,7 @@ def test_write_parquet_options_bloom_filter(df, tmp_path): assert size_no_bloom_filter < size_bloom_filter -def test_write_parquet_options_column_options(df, tmp_path): +def test_write_parquet_with_options_column_options(df, tmp_path): """Test writing Parquet files with different options for each column, which replace the global configs (when provided).""" data = { @@ -1951,7 +1951,7 @@ def test_write_parquet_options_column_options(df, tmp_path): ctx = SessionContext() df = ctx.from_arrow(pa.record_batch(data)) - df.write_parquet_options( + df.write_parquet_with_options( tmp_path, ParquetWriterOptions(compression="brotli(8)", column_specific_options=column_specific_options), diff --git a/src/dataframe.rs b/src/dataframe.rs index eca039c44..afe25ea7c 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -770,7 +770,7 @@ impl PyDataFrame { } /// Write a `DataFrame` to a Parquet file, using advanced options. - fn write_parquet_options( + fn write_parquet_with_options( &self, path: &str, options: PyParquetWriterOptions, From 567955f8b10eb841f3d7bd21aec8de699862c51d Mon Sep 17 00:00:00 2001 From: nuno-faria Date: Fri, 20 Jun 2025 09:14:53 +0100 Subject: [PATCH 4/5] Merge remote-tracking branch 'origin/main' into write_parquet_options --- .github/workflows/build.yml | 8 +- .github/workflows/docs.yaml | 2 +- .github/workflows/test.yaml | 13 +- Cargo.lock | 1064 +++++++------ Cargo.toml | 16 +- dev/changelog/47.0.0.md | 64 + dev/release/README.md | 20 +- docs/source/api/dataframe.rst | 387 +++++ docs/source/api/index.rst | 27 + docs/source/conf.py | 4 + docs/source/index.rst | 2 + .../common-operations/functions.rst | 21 + .../common-operations/udf-and-udfa.rst | 44 +- docs/source/user-guide/dataframe.rst | 3 +- .../.cargo/config.toml | 0 .../Cargo.lock | 1333 ++++++++++------- .../Cargo.toml | 12 +- .../build.rs | 0 .../pyproject.toml | 2 +- .../python/tests/_test_table_function.py | 134 ++ .../python/tests/_test_table_provider.py | 4 +- examples/datafusion-ffi-example/src/lib.rs | 30 + .../src/table_function.rs | 56 + .../src/table_provider.rs} | 48 +- examples/python-udwf.py | 2 +- pyproject.toml | 4 + python/datafusion/__init__.py | 33 +- python/datafusion/catalog.py | 12 + python/datafusion/context.py | 76 +- python/datafusion/dataframe.py | 30 +- python/datafusion/expr.py | 18 + python/datafusion/io.py | 8 +- python/datafusion/udf.py | 756 +--------- python/datafusion/user_defined.py | 845 +++++++++++ python/tests/test_dataframe.py | 387 +++++ python/tests/test_expr.py | 523 ++++++- python/tests/test_functions.py | 61 + python/tests/test_imports.py | 2 +- python/tests/test_sql.py | 26 +- python/tests/test_udwf.py | 2 +- python/tests/test_wrapper_coverage.py | 7 +- src/catalog.rs | 2 +- src/config.rs | 21 +- src/context.rs | 195 ++- src/dataframe.rs | 90 +- src/errors.rs | 4 +- src/expr.rs | 48 +- src/expr/literal.rs | 16 +- 
src/expr/window.rs | 29 +- src/functions.rs | 6 +- src/lib.rs | 2 + src/pyarrow_filter_expression.rs | 4 +- src/record_batch.rs | 2 +- src/substrait.rs | 11 +- src/udtf.rs | 127 ++ src/udwf.rs | 8 +- src/utils.rs | 60 +- 57 files changed, 4659 insertions(+), 2052 deletions(-) create mode 100644 dev/changelog/47.0.0.md create mode 100644 docs/source/api/dataframe.rst create mode 100644 docs/source/api/index.rst rename examples/{ffi-table-provider => datafusion-ffi-example}/.cargo/config.toml (100%) rename examples/{ffi-table-provider => datafusion-ffi-example}/Cargo.lock (71%) rename examples/{ffi-table-provider => datafusion-ffi-example}/Cargo.toml (83%) rename examples/{ffi-table-provider => datafusion-ffi-example}/build.rs (100%) rename examples/{ffi-table-provider => datafusion-ffi-example}/pyproject.toml (97%) create mode 100644 examples/datafusion-ffi-example/python/tests/_test_table_function.py rename examples/{ffi-table-provider => datafusion-ffi-example}/python/tests/_test_table_provider.py (94%) create mode 100644 examples/datafusion-ffi-example/src/lib.rs create mode 100644 examples/datafusion-ffi-example/src/table_function.rs rename examples/{ffi-table-provider/src/lib.rs => datafusion-ffi-example/src/table_provider.rs} (71%) create mode 100644 python/datafusion/user_defined.py create mode 100644 src/udtf.rs diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index acabad3ca..61896e43d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -33,7 +33,7 @@ jobs: with: python-version: "3.12" - - uses: astral-sh/setup-uv@v5 + - uses: astral-sh/setup-uv@v6 with: enable-cache: true @@ -52,7 +52,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: astral-sh/setup-uv@v5 + - uses: astral-sh/setup-uv@v6 with: enable-cache: true @@ -94,7 +94,7 @@ jobs: version: "27.4" repo-token: ${{ secrets.GITHUB_TOKEN }} - - uses: astral-sh/setup-uv@v5 + - uses: astral-sh/setup-uv@v6 with: enable-cache: true @@ -150,7 +150,7 @@ jobs: version: "27.4" repo-token: ${{ secrets.GITHUB_TOKEN }} - - uses: astral-sh/setup-uv@v5 + - uses: astral-sh/setup-uv@v6 with: enable-cache: true diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 9037e0a5c..9341488a0 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -58,7 +58,7 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Install dependencies and build - uses: astral-sh/setup-uv@v5 + uses: astral-sh/setup-uv@v6 with: enable-cache: true diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index da3582766..4ae081406 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -76,10 +76,17 @@ jobs: run: cargo clippy --all-targets --all-features -- -D clippy::all -D warnings -A clippy::redundant_closure - name: Install dependencies and build - uses: astral-sh/setup-uv@v5 + uses: astral-sh/setup-uv@v6 with: enable-cache: true + - name: Check documentation + if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} + run: | + uv sync --dev --group docs --no-install-package datafusion + uv run --no-project maturin develop --uv + uv run --no-project docs/build.sh + - name: Run tests env: RUST_BACKTRACE: 1 @@ -91,9 +98,9 @@ jobs: - name: FFI unit tests run: | - cd examples/ffi-table-provider + cd examples/datafusion-ffi-example uv run --no-project maturin develop --uv - uv run --no-project pytest python/tests/_test_table_provider.py + uv run --no-project pytest python/tests/_test*.py - name: Cache 
the generated dataset id: cache-tpch-dataset diff --git a/Cargo.lock b/Cargo.lock index b32d19d4d..112167cb4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -73,13 +73,13 @@ checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" [[package]] name = "ahash" -version = "0.8.11" +version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", "const-random", - "getrandom 0.2.15", + "getrandom 0.3.3", "once_cell", "version_check", "zerocopy", @@ -132,9 +132,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.95" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" [[package]] name = "apache-avro" @@ -150,7 +150,7 @@ dependencies = [ "log", "num-bigint", "quad-rand", - "rand", + "rand 0.8.5", "regex-lite", "serde", "serde_bytes", @@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3095aaf545942ff5abd46654534f15b03a90fba78299d661e045e5d587222f0d" +checksum = "b1bb018b6960c87fd9d025009820406f74e83281185a8bdcb44880d2aa5c9a87" dependencies = [ "arrow-arith", "arrow-array", @@ -201,9 +201,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00752064ff47cee746e816ddb8450520c3a52cbad1e256f6fa861a35f86c45e7" +checksum = "44de76b51473aa888ecd6ad93ceb262fb8d40d1f1154a4df2f069b3590aa7575" dependencies = [ "arrow-array", "arrow-buffer", @@ -215,9 +215,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cebfe926794fbc1f49ddd0cdaf898956ca9f6e79541efce62dabccfd81380472" +checksum = "29ed77e22744475a9a53d00026cf8e166fe73cf42d89c4c4ae63607ee1cfcc3f" dependencies = [ "ahash", "arrow-buffer", @@ -226,15 +226,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.15.2", + "hashbrown 0.15.3", "num", ] [[package]] name = "arrow-buffer" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0303c7ec4cf1a2c60310fc4d6bbc3350cd051a17bf9e9c0a8e47b4db79277824" +checksum = "b0391c96eb58bf7389171d1e103112d3fc3e5625ca6b372d606f2688f1ea4cce" dependencies = [ "bytes", "half", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335f769c5a218ea823d3760a743feba1ef7857cba114c01399a891c2fff34285" +checksum = "f39e1d774ece9292697fcbe06b5584401b26bd34be1bec25c33edae65c2420ff" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "510db7dfbb4d5761826516cc611d97b3a68835d0ece95b034a052601109c0b1b" +checksum = "9055c972a07bf12c2a827debfd34f88d3b93da1941d36e1d9fee85eebe38a12a" dependencies = [ "arrow-array", 
"arrow-cast", @@ -280,9 +280,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8affacf3351a24039ea24adab06f316ded523b6f8c3dbe28fbac5f18743451b" +checksum = "cf75ac27a08c7f48b88e5c923f267e980f27070147ab74615ad85b5c5f90473d" dependencies = [ "arrow-buffer", "arrow-schema", @@ -292,9 +292,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69880a9e6934d9cba2b8630dd08a3463a91db8693b16b499d54026b6137af284" +checksum = "a222f0d93772bd058d1268f4c28ea421a603d66f7979479048c429292fac7b2e" dependencies = [ "arrow-array", "arrow-buffer", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8dafd17a05449e31e0114d740530e0ada7379d7cb9c338fd65b09a8130960b0" +checksum = "9085342bbca0f75e8cb70513c0807cc7351f1fbf5cb98192a67d5e3044acb033" dependencies = [ "arrow-array", "arrow-buffer", @@ -328,9 +328,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "895644523af4e17502d42c3cb6b27cb820f0cb77954c22d75c23a85247c849e1" +checksum = "ab2f1065a5cad7b9efa9e22ce5747ce826aa3855766755d4904535123ef431e7" dependencies = [ "arrow-array", "arrow-buffer", @@ -341,9 +341,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9be8a2a4e5e7d9c822b2b8095ecd77010576d824f654d347817640acfc97d229" +checksum = "3703a0e3e92d23c3f756df73d2dc9476873f873a76ae63ef9d3de17fda83b2d8" dependencies = [ "arrow-array", "arrow-buffer", @@ -354,18 +354,20 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7450c76ab7c5a6805be3440dc2e2096010da58f7cab301fdc996a4ee3ee74e49" +checksum = "73a47aa0c771b5381de2b7f16998d351a6f4eb839f1e13d48353e17e873d969b" dependencies = [ "bitflags", + "serde", + "serde_json", ] [[package]] name = "arrow-select" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa5f5a93c75f46ef48e4001535e7b6c922eeb0aa20b73cf58d09e13d057490d8" +checksum = "24b7b85575702b23b85272b01bc1c25a01c9b9852305e5d0078c79ba25d995d4" dependencies = [ "ahash", "arrow-array", @@ -377,9 +379,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7005d858d84b56428ba2a98a107fe88c0132c61793cf6b8232a1f9bfc0452b" +checksum = "9260fddf1cdf2799ace2b4c2fc0356a9789fa7551e0953e35435536fecefebbd" dependencies = [ "arrow-array", "arrow-buffer", @@ -438,7 +440,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -449,7 +451,7 @@ checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -475,9 +477,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "backtrace" -version = "0.3.74" 
+version = "0.3.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" +checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" dependencies = [ "addr2line", "cfg-if", @@ -485,7 +487,7 @@ dependencies = [ "miniz_oxide", "object", "rustc-demangle", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -516,9 +518,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.8.0" +version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" +checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" [[package]] name = "blake2" @@ -531,9 +533,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.8.1" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "389a099b34312839e16420d499a9cad9650541715937ffbdd40d36f49e77eeb3" +checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0" dependencies = [ "arrayref", "arrayvec", @@ -553,9 +555,9 @@ dependencies = [ [[package]] name = "brotli" -version = "7.0.0" +version = "8.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" +checksum = "9991eea70ea4f293524138648e41ee89b0b2b12ddef3b255effa43c8056e0e0d" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -564,9 +566,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "4.0.2" +version = "5.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74fa05ad7d803d413eb8380983b092cbbaf9a85f151b871360e7b00cd7060b37" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -586,9 +588,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.10.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" [[package]] name = "bzip2" @@ -621,9 +623,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.14" +version = "1.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" +checksum = "5f4ac86a9e5bc1e2b3449ab9d7d3a6a405e3d1bb28d7b9be8614f55846ae3766" dependencies = [ "jobserver", "libc", @@ -644,9 +646,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.40" +version = "0.4.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c" +checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d" dependencies = [ "android-tzdata", "iana-time-zone", @@ -657,9 +659,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.10.1" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c6ac4f2c0bf0f44e9161aec9675e1050aa4a530663c4a9e37e108fa948bca9f" +checksum = "efdce149c370f133a071ca8ef6ea340b7b88748ab0810097a9e2976eaa34b4f3" dependencies = [ "chrono", "chrono-tz-build", @@ -668,9 +670,9 @@ dependencies = [ 
[[package]] name = "chrono-tz-build" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94fea34d77a245229e7746bd2beb786cd2a896f306ff491fb8cecb3074b10a7" +checksum = "8f10f8c9340e31fc120ff885fcdb54a0b48e474bbd77cab557f0c30a3e569402" dependencies = [ "parse-zoneinfo", "phf_codegen", @@ -710,7 +712,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom 0.2.15", + "getrandom 0.2.16", "once_cell", "tiny-keccak", ] @@ -787,9 +789,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.14" +version = "0.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" dependencies = [ "crossbeam-utils", ] @@ -859,9 +861,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe060b978f74ab446be722adb8a274e052e005bf6dfd171caadc3abaad10080" +checksum = "cc6cb8c2c81eada072059983657d6c9caf3fddefc43b4a65551d243253254a96" dependencies = [ "arrow", "arrow-ipc", @@ -887,7 +889,6 @@ dependencies = [ "datafusion-functions-nested", "datafusion-functions-table", "datafusion-functions-window", - "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -902,7 +903,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand", + "rand 0.9.1", "regex", "sqlparser", "tempfile", @@ -915,9 +916,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61fe34f401bd03724a1f96d12108144f8cd495a3cdda2bf5e091822fb80b7e66" +checksum = "b7be8d1b627843af62e447396db08fe1372d882c0eb8d0ea655fd1fbc33120ee" dependencies = [ "arrow", "async-trait", @@ -941,9 +942,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4411b8e3bce5e0fc7521e44f201def2e2d5d1b5f176fb56e8cdc9942c890f00" +checksum = "38ab16c5ae43f65ee525fc493ceffbc41f40dee38b01f643dfcfc12959e92038" dependencies = [ "arrow", "async-trait", @@ -964,9 +965,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0734015d81c8375eb5d4869b7f7ecccc2ee8d6cb81948ef737cd0e7b743bd69c" +checksum = "d3d56b2ac9f476b93ca82e4ef5fb00769c8a3f248d12b4965af7e27635fa7e12" dependencies = [ "ahash", "apache-avro", @@ -989,9 +990,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5167bb1d2ccbb87c6bc36c295274d7a0519b14afcfdaf401d53cbcaa4ef4968b" +checksum = "16015071202d6133bc84d72756176467e3e46029f3ce9ad2cb788f9b1ff139b2" dependencies = [ "futures", "log", @@ -1000,9 +1001,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e602dcdf2f50c2abf297cc2203c73531e6f48b29516af7695d338cf2a778b1" +checksum 
= "b77523c95c89d2a7eb99df14ed31390e04ab29b43ff793e562bdc1716b07e17b" dependencies = [ "arrow", "async-compression", @@ -1025,7 +1026,7 @@ dependencies = [ "log", "object_store", "parquet", - "rand", + "rand 0.9.1", "tempfile", "tokio", "tokio-util", @@ -1036,9 +1037,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4ea5111aab9d3f2a8bff570343cccb03ce4c203875ef5a566b7d6f1eb72559e" +checksum = "1371cb4ef13c2e3a15685d37a07398cf13e3b0a85e705024b769fc4c511f5fef" dependencies = [ "apache-avro", "arrow", @@ -1061,9 +1062,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bb2253952dc32296ed5b84077cb2e0257fea4be6373e1c376426e17ead4ef6" +checksum = "40d25c5e2c0ebe8434beeea997b8e88d55b3ccc0d19344293f2373f65bc524fc" dependencies = [ "arrow", "async-trait", @@ -1086,9 +1087,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8c7f47a5d2fe03bfa521ec9bafdb8a5c82de8377f60967c3663f00c8790352" +checksum = "3dc6959e1155741ab35369e1dc7673ba30fc45ed568fad34c01b7cb1daeb4d4c" dependencies = [ "arrow", "async-trait", @@ -1111,9 +1112,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27d15868ea39ed2dc266728b554f6304acd473de2142281ecfa1294bb7415923" +checksum = "b7a6afdfe358d70f4237f60eaef26ae5a1ce7cb2c469d02d5fc6c7fd5d84e58b" dependencies = [ "arrow", "async-trait", @@ -1136,21 +1137,21 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand", + "rand 0.9.1", "tokio", ] [[package]] name = "datafusion-doc" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a91f8c2c5788ef32f48ff56c68e5b545527b744822a284373ac79bba1ba47292" +checksum = "9bcd8a3e3e3d02ea642541be23d44376b5d5c37c2938cce39b3873cdf7186eea" [[package]] name = "datafusion-execution" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06f004d100f49a3658c9da6fb0c3a9b760062d96cd4ad82ccc3b7b69a9fb2f84" +checksum = "670da1d45d045eee4c2319b8c7ea57b26cf48ab77b630aaa50b779e406da476a" dependencies = [ "arrow", "dashmap", @@ -1160,16 +1161,16 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand", + "rand 0.9.1", "tempfile", "url", ] [[package]] name = "datafusion-expr" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a4e4ce3802609be38eeb607ee72f6fe86c3091460de9dbfae9e18db423b3964" +checksum = "b3a577f64bdb7e2cc4043cd97f8901d8c504711fde2dbcb0887645b00d7c660b" dependencies = [ "arrow", "chrono", @@ -1188,9 +1189,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "422ac9cf3b22bbbae8cdf8ceb33039107fde1b5492693168f13bd566b1bcc839" +checksum = "51b7916806ace3e9f41884f230f7f38ebf0e955dfbd88266da1826f29a0b9a6a" dependencies = [ "arrow", "datafusion-common", @@ -1201,9 +1202,9 @@ dependencies = [ [[package]] name = "datafusion-ffi" -version = "47.0.0" +version = "48.0.0" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cf3fe9ab492c56daeb7beed526690d33622d388b8870472e0b7b7f55490338c" +checksum = "980cca31de37f5dadf7ea18e4ffc2b6833611f45bed5ef9de0831d2abb50f1ef" dependencies = [ "abi_stable", "arrow", @@ -1211,7 +1212,9 @@ dependencies = [ "async-ffi", "async-trait", "datafusion", + "datafusion-functions-aggregate-common", "datafusion-proto", + "datafusion-proto-common", "futures", "log", "prost", @@ -1221,9 +1224,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ddf0a0a2db5d2918349c978d42d80926c6aa2459cd8a3c533a84ec4bb63479e" +checksum = "7fb31c9dc73d3e0c365063f91139dc273308f8a8e124adda9898db8085d68357" dependencies = [ "arrow", "arrow-buffer", @@ -1241,7 +1244,7 @@ dependencies = [ "itertools 0.14.0", "log", "md-5", - "rand", + "rand 0.9.1", "regex", "sha2", "unicode-segmentation", @@ -1250,9 +1253,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "408a05dafdc70d05a38a29005b8b15e21b0238734dab1e98483fcb58038c5aba" +checksum = "ebb72c6940697eaaba9bd1f746a697a07819de952b817e3fb841fb75331ad5d4" dependencies = [ "ahash", "arrow", @@ -1271,9 +1274,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "756d21da2dd6c9bef97af1504970ff56cbf35d03fbd4ffd62827f02f4d2279d4" +checksum = "d7fdc54656659e5ecd49bf341061f4156ab230052611f4f3609612a0da259696" dependencies = [ "ahash", "arrow", @@ -1284,9 +1287,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d8d50f6334b378930d992d801a10ac5b3e93b846b39e4a05085742572844537" +checksum = "fad94598e3374938ca43bca6b675febe557e7a14eb627d617db427d70d65118b" dependencies = [ "arrow", "arrow-ord", @@ -1305,9 +1308,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9a97220736c8fff1446e936be90d57216c06f28969f9ffd3b72ac93c958c8a" +checksum = "de2fc6c2946da5cab8364fb28b5cac3115f0f3a87960b235ed031c3f7e2e639b" dependencies = [ "arrow", "async-trait", @@ -1321,10 +1324,11 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefc2d77646e1aadd1d6a9c40088937aedec04e68c5f0465939912e1291f8193" +checksum = "3e5746548a8544870a119f556543adcd88fe0ba6b93723fe78ad0439e0fbb8b4" dependencies = [ + "arrow", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -1338,9 +1342,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd4aff082c42fa6da99ce0698c85addd5252928c908eb087ca3cfa64ff16b313" +checksum = "dcbe9404382cda257c434f22e13577bee7047031dfdb6216dd5e841b9465e6fe" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1348,20 +1352,20 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "47.0.0" +version = "48.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "df6f88d7ee27daf8b108ba910f9015176b36fbc72902b1ca5c2a5f1d1717e1a1" +checksum = "8dce50e3b637dab0d25d04d2fe79dfdca2b257eabd76790bffd22c7f90d700c8" dependencies = [ "datafusion-expr", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] name = "datafusion-optimizer" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084d9f979c4b155346d3c34b18f4256e6904ded508e9554d90fed416415c3515" +checksum = "03cfaacf06445dc3bbc1e901242d2a44f2cae99a744f49f3fefddcee46240058" dependencies = [ "arrow", "chrono", @@ -1378,9 +1382,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c536062b0076f4e30084065d805f389f9fe38af0ca75bcbac86bc5e9fbab65" +checksum = "1908034a89d7b2630898e06863583ae4c00a0dd310c1589ca284195ee3f7f8a6" dependencies = [ "ahash", "arrow", @@ -1395,14 +1399,14 @@ dependencies = [ "itertools 0.14.0", "log", "paste", - "petgraph", + "petgraph 0.8.2", ] [[package]] name = "datafusion-physical-expr-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8a92b53b3193fac1916a1c5b8e3f4347c526f6822e56b71faa5fb372327a863" +checksum = "47b7a12dd59ea07614b67dbb01d85254fbd93df45bcffa63495e11d3bdf847df" dependencies = [ "ahash", "arrow", @@ -1414,9 +1418,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fa0a5ac94c7cf3da97bedabd69d6bbca12aef84b9b37e6e9e8c25286511b5e2" +checksum = "4371cc4ad33978cc2a8be93bd54a232d3f2857b50401a14631c0705f3f910aae" dependencies = [ "arrow", "datafusion-common", @@ -1433,9 +1437,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "690c615db468c2e5fe5085b232d8b1c088299a6c63d87fd960a354a71f7acb55" +checksum = "dc47bc33025757a5c11f2cd094c5b6b5ed87f46fa33c023e6fdfa25fcbfade23" dependencies = [ "ahash", "arrow", @@ -1463,9 +1467,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a1afb2bdb05de7ff65be6883ebfd4ec027bd9f1f21c46aa3afd01927160a83" +checksum = "d8f5d9acd7d96e3bf2a7bb04818373cab6e51de0356e3694b94905fee7b4e8b6" dependencies = [ "arrow", "chrono", @@ -1479,9 +1483,9 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35b7a5876ebd6b564fb9a1fd2c3a2a9686b787071a256b47e4708f0916f9e46f" +checksum = "09ecb5ec152c4353b60f7a5635489834391f7a291d2b39a4820cd469e318b78e" dependencies = [ "arrow", "datafusion-common", @@ -1490,7 +1494,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "46.0.0" +version = "47.0.0" dependencies = [ "arrow", "async-trait", @@ -1513,9 +1517,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad229a134c7406c057ece00c8743c0c34b97f4e72f78b475fe17b66c5e14fa4f" +checksum = "d7485da32283985d6b45bd7d13a65169dcbe8c869e25d01b2cfbc425254b4b49" dependencies = [ 
"arrow", "async-trait", @@ -1537,9 +1541,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64f6ab28b72b664c21a27b22a2ff815fd390ed224c26e89a93b5a8154a4e8607" +checksum = "a466b15632befddfeac68c125f0260f569ff315c6831538cbb40db754134e0df" dependencies = [ "arrow", "bigdecimal", @@ -1554,9 +1558,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061efc0937f0ce3abb37ed0d56cfa01dd0e654b90e408656d05e846c8b7599fe" +checksum = "f2f3973b1a4f6e9ee7fd99a22d58e1c06e6723a28dc911a60df575974c8339aa" dependencies = [ "async-recursion", "async-trait", @@ -1590,20 +1594,20 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] name = "dyn-clone" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "feeef44e73baff3a26d371801df019877a9866a8c493d315ab00177843314f35" +checksum = "1c7a8fb8a9fbf66c1f703fe16184d10ca0ee9d23be5b4436400408ba54a95005" [[package]] name = "either" -version = "1.13.0" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "equivalent" @@ -1613,9 +1617,9 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.3.10" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" +checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" dependencies = [ "libc", "windows-sys 0.59.0", @@ -1662,9 +1666,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "foldhash" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" [[package]] name = "form_urlencoded" @@ -1731,7 +1735,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -1785,9 +1789,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" dependencies = [ "cfg-if", "js-sys", @@ -1798,14 +1802,16 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.3.1" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" dependencies = [ "cfg-if", + "js-sys", "libc", - "wasi 0.13.3+wasi-0.2.2", - "windows-targets", + "r-efi", + "wasi 0.14.2+wasi-0.2.4", + "wasm-bindgen", ] [[package]] @@ -1822,9 +1828,9 @@ checksum = 
"a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "h2" -version = "0.4.7" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" +checksum = "a9421a676d1b147b16b82c9225157dc629087ef8ec4d5e2960f9437a90dac0a5" dependencies = [ "atomic-waker", "bytes", @@ -1862,9 +1868,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.2" +version = "0.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" dependencies = [ "allocator-api2", "equivalent", @@ -1885,9 +1891,9 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "http" -version = "1.2.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" +checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" dependencies = [ "bytes", "fnv", @@ -1906,12 +1912,12 @@ dependencies = [ [[package]] name = "http-body-util" -version = "0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", - "futures-util", + "futures-core", "http", "http-body", "pin-project-lite", @@ -1919,15 +1925,15 @@ dependencies = [ [[package]] name = "httparse" -version = "1.10.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" [[package]] name = "humantime" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f" [[package]] name = "hyper" @@ -1969,9 +1975,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" +checksum = "497bbc33a26fdd4af9ed9c70d63f61cf56a938375fbb32df34db9b1cd6d643f2" dependencies = [ "bytes", "futures-channel", @@ -1979,6 +1985,7 @@ dependencies = [ "http", "http-body", "hyper", + "libc", "pin-project-lite", "socket2", "tokio", @@ -1988,14 +1995,15 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.61" +version = "0.1.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" +checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", + "log", "wasm-bindgen", "windows-core", ] @@ -2011,21 +2019,22 @@ dependencies = [ [[package]] name = "icu_collections" -version = "1.5.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +checksum = 
"200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" dependencies = [ "displaydoc", + "potential_utf", "yoke", "zerofrom", "zerovec", ] [[package]] -name = "icu_locid" -version = "1.5.0" +name = "icu_locale_core" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" dependencies = [ "displaydoc", "litemap", @@ -2034,31 +2043,11 @@ dependencies = [ "zerovec", ] -[[package]] -name = "icu_locid_transform" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" -dependencies = [ - "displaydoc", - "icu_locid", - "icu_locid_transform_data", - "icu_provider", - "tinystr", - "zerovec", -] - -[[package]] -name = "icu_locid_transform_data" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" - [[package]] name = "icu_normalizer" -version = "1.5.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" dependencies = [ "displaydoc", "icu_collections", @@ -2066,67 +2055,54 @@ dependencies = [ "icu_properties", "icu_provider", "smallvec", - "utf16_iter", - "utf8_iter", - "write16", "zerovec", ] [[package]] name = "icu_normalizer_data" -version = "1.5.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" [[package]] name = "icu_properties" -version = "1.5.1" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +checksum = "2549ca8c7241c82f59c80ba2a6f415d931c5b58d24fb8412caa1a1f02c49139a" dependencies = [ "displaydoc", "icu_collections", - "icu_locid_transform", + "icu_locale_core", "icu_properties_data", "icu_provider", - "tinystr", + "potential_utf", + "zerotrie", "zerovec", ] [[package]] name = "icu_properties_data" -version = "1.5.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" +checksum = "8197e866e47b68f8f7d95249e172903bec06004b18b2937f1095d40a0c57de04" [[package]] name = "icu_provider" -version = "1.5.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" dependencies = [ "displaydoc", - "icu_locid", - "icu_provider_macros", + "icu_locale_core", "stable_deref_trait", "tinystr", "writeable", "yoke", "zerofrom", + "zerotrie", "zerovec", ] -[[package]] -name = "icu_provider_macros" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.100", -] - [[package]] name = "idna" version = "1.0.3" @@ -2140,9 +2116,9 @@ dependencies = [ [[package]] name 
= "idna_adapter" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" dependencies = [ "icu_normalizer", "icu_properties", @@ -2155,14 +2131,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", - "hashbrown 0.15.2", + "hashbrown 0.15.3", ] [[package]] name = "indoc" -version = "2.0.5" +version = "2.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" +checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" [[package]] name = "integer-encoding" @@ -2196,16 +2172,17 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.14" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jobserver" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" dependencies = [ + "getrandom 0.3.3", "libc", ] @@ -2291,9 +2268,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.171" +version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libflate" @@ -2331,15 +2308,15 @@ dependencies = [ [[package]] name = "libm" -version = "0.2.11" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" [[package]] name = "libmimalloc-sys" -version = "0.1.39" +version = "0.1.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44" +checksum = "ec9d6fac27761dabcd4ee73571cdb06b7022dc99089acbe5435691edffaac0f4" dependencies = [ "cc", "libc", @@ -2356,15 +2333,15 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.15" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" +checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" [[package]] name = "litemap" -version = "0.7.4" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" [[package]] name = "lock_api" @@ -2378,9 +2355,15 @@ dependencies = [ [[package]] name = "log" -version = "0.4.25" +version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" +checksum = 
"13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" [[package]] name = "lz4_flex" @@ -2429,9 +2412,9 @@ dependencies = [ [[package]] name = "mimalloc" -version = "0.1.43" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633" +checksum = "995942f432bbb4822a7e9c3faa87a695185b0d09273ba85f097b54f4e458f2af" dependencies = [ "libmimalloc-sys", ] @@ -2464,9 +2447,9 @@ dependencies = [ [[package]] name = "multimap" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" [[package]] name = "num" @@ -2554,9 +2537,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9ce831b09395f933addbc56d894d889e4b226eba304d4e7adbab591e26daf1e" +checksum = "d94ac16b433c0ccf75326388c893d2835ab7457ea35ab8ba5d745c053ef5fa16" dependencies = [ "async-trait", "base64 0.22.1", @@ -2574,25 +2557,27 @@ dependencies = [ "parking_lot", "percent-encoding", "quick-xml", - "rand", + "rand 0.9.1", "reqwest", "ring", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", - "thiserror 2.0.11", + "thiserror 2.0.12", "tokio", "tracing", "url", "walkdir", + "wasm-bindgen-futures", + "web-time", ] [[package]] name = "once_cell" -version = "1.20.3" +version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "openssl-probe" @@ -2629,14 +2614,14 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] name = "parquet" -version = "55.0.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd31a8290ac5b19f09ad77ee7a1e6a541f1be7674ad410547d5f1eef6eef4a9c" +checksum = "be7b2d778f6b841d37083ebdf32e33a524acde1266b5884a8ca29bf00dfa1231" dependencies = [ "ahash", "arrow-array", @@ -2653,7 +2638,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.15.2", + "hashbrown 0.15.3", "lz4_flex", "num", "num-bigint", @@ -2736,6 +2721,18 @@ dependencies = [ "indexmap", ] +[[package]] +name = "petgraph" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54acf3a685220b533e437e264e4d932cfbdc4cc7ec0cd232ed73c08d03b8a7ca" +dependencies = [ + "fixedbitset", + "hashbrown 0.15.3", + "indexmap", + "serde", +] + [[package]] name = "phf" version = "0.11.3" @@ -2762,7 +2759,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ "phf_shared", - "rand", + "rand 0.8.5", ] [[package]] @@ -2788,21 +2785,30 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "portable-atomic" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" +checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" + +[[package]] +name = "potential_utf" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +dependencies = [ + "zerovec", +] [[package]] name = "ppv-lite86" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ "zerocopy", ] @@ -2814,14 +2820,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6" dependencies = [ "proc-macro2", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] name = "proc-macro2" -version = "1.0.93" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" dependencies = [ "unicode-ident", ] @@ -2843,16 +2849,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck", - "itertools 0.13.0", + "itertools 0.14.0", "log", "multimap", "once_cell", - "petgraph", + "petgraph 0.7.1", "prettyplease", "prost", "prost-types", "regex", - "syn 2.0.100", + "syn 2.0.101", "tempfile", ] @@ -2863,10 +2869,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -2889,18 +2895,18 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.25" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f58e5423e24c18cc840e1c98370b3993c6649cd1678b4d24318bcf0a083cbe88" +checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f" dependencies = [ "cc", ] [[package]] name = "pyo3" -version = "0.24.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17da310086b068fbdcefbba30aeb3721d5bb9af8db4987d6735b2183ca567229" +checksum = "e5203598f366b11a02b13aa20cab591229ff0a89fd121a308a5df751d5fc9219" dependencies = [ "cfg-if", "indoc", @@ -2929,9 +2935,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.24.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e27165889bd793000a098bb966adc4300c312497ea25cf7a690a9f0ac5aa5fc1" +checksum = "99636d423fa2ca130fa5acde3059308006d46f98caac629418e53f7ebb1e9999" dependencies = [ "once_cell", "target-lexicon", @@ -2939,9 +2945,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.24.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"05280526e1dbf6b420062f3ef228b78c0c54ba94e157f5cb724a609d0f2faabc" +checksum = "78f9cf92ba9c409279bc3305b5409d90db2d2c22392d443a87df3a1adad59e33" dependencies = [ "libc", "pyo3-build-config", @@ -2949,27 +2955,27 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.24.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c3ce5686aa4d3f63359a5100c62a127c9f15e8398e5fdeb5deef1fed5cd5f44" +checksum = "0b999cb1a6ce21f9a6b147dcf1be9ffedf02e0043aec74dc390f3007047cecd9" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] name = "pyo3-macros-backend" -version = "0.24.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4cf6faa0cbfb0ed08e89beb8103ae9724eb4750e3a78084ba4017cbe94f3855" +checksum = "822ece1c7e1012745607d5cf0bcb2874769f0f7cb34c4cde03b9358eb9ef911a" dependencies = [ "heck", "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -2980,9 +2986,9 @@ checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" -version = "0.37.2" +version = "0.37.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" dependencies = [ "memchr", "serde", @@ -2990,37 +2996,40 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.6" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef" +checksum = "626214629cda6781b6dc1d316ba307189c85ba657213ce642d9c77670f8202c8" dependencies = [ "bytes", + "cfg_aliases", "pin-project-lite", "quinn-proto", "quinn-udp", "rustc-hash", "rustls", "socket2", - "thiserror 2.0.11", + "thiserror 2.0.12", "tokio", "tracing", + "web-time", ] [[package]] name = "quinn-proto" -version = "0.11.9" +version = "0.11.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" +checksum = "49df843a9161c85bb8aae55f101bc0bac8bcafd637a620d9122fd7e0b2f7422e" dependencies = [ "bytes", - "getrandom 0.2.15", - "rand", + "getrandom 0.3.3", + "lru-slab", + "rand 0.9.1", "ring", "rustc-hash", "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.11", + "thiserror 2.0.12", "tinyvec", "tracing", "web-time", @@ -3028,9 +3037,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.10" +version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e46f3055866785f6b92bc6164b76be02ca8f2eb4b002c0354b28cf4c119e5944" +checksum = "ee4e529991f949c5e25755532370b8af5d114acae52326361d68d47af64aa842" dependencies = [ "cfg_aliases", "libc", @@ -3049,6 +3058,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + [[package]] name = "rand" version = "0.8.5" @@ -3056,8 +3071,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.3", ] [[package]] @@ -3067,7 +3092,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.3", ] [[package]] @@ -3076,7 +3111,16 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.15", + "getrandom 0.2.16", +] + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom 0.3.3", ] [[package]] @@ -3096,14 +3140,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] name = "redox_syscall" -version = "0.5.8" +version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" +checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af" dependencies = [ "bitflags", ] @@ -3149,7 +3193,7 @@ version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ef7fa9ed0256d64a688a3747d0fef7a88851c18a5e1d57f115f38ec2e09366" dependencies = [ - "hashbrown 0.15.2", + "hashbrown 0.15.3", "memchr", ] @@ -3164,9 +3208,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.12" +version = "0.12.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" +checksum = "d19c46a6fdd48bc4dab94b6103fccc55d34c67cc0ad04653aad4ea2a07cd7bbb" dependencies = [ "base64 0.22.1", "bytes", @@ -3210,13 +3254,13 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.9" +version = "0.17.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e75ec5e92c4d8aede845126adc388046234541629e76029599ed35a003c7ed24" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", "cfg-if", - "getrandom 0.2.15", + "getrandom 0.2.16", "libc", "untrusted", "windows-sys 0.52.0", @@ -3251,9 +3295,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.44" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" dependencies = [ "bitflags", "errno", @@ -3264,9 +3308,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.23" +version = "0.23.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" +checksum = 
"730944ca083c1c233a75c09f199e973ca499344a2b7ba9e755c457e86fb4a321" dependencies = [ "once_cell", "ring", @@ -3299,18 +3343,19 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" +checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" dependencies = [ "web-time", + "zeroize", ] [[package]] name = "rustls-webpki" -version = "0.102.8" +version = "0.103.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" +checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" dependencies = [ "ring", "rustls-pki-types", @@ -3319,15 +3364,15 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" +checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" [[package]] name = "ryu" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" [[package]] name = "same-file" @@ -3368,7 +3413,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -3411,9 +3456,9 @@ dependencies = [ [[package]] name = "seq-macro" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" [[package]] name = "serde" @@ -3426,9 +3471,9 @@ dependencies = [ [[package]] name = "serde_bytes" -version = "0.11.15" +version = "0.11.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "387cc504cb06bb40a96c8e04e951fe01854cf6bc921053c954e4a606d9675c6a" +checksum = "8437fd221bde2d4ca316d61b90e337e9e702b3820b87d63caa9ba6c02bd06d96" dependencies = [ "serde", ] @@ -3441,7 +3486,7 @@ checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -3452,7 +3497,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -3476,7 +3521,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -3506,9 +3551,9 @@ dependencies = [ [[package]] name = "sha2" -version = "0.10.8" +version = "0.10.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", "cpufeatures", @@ -3544,9 +3589,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.14.0" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" +checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" 
[[package]] name = "snap" @@ -3556,9 +3601,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "socket2" -version = "0.5.8" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" +checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef" dependencies = [ "libc", "windows-sys 0.52.0", @@ -3583,7 +3628,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -3594,9 +3639,9 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "stacker" -version = "0.1.18" +version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d08feb8f695b465baed819b03c128dc23f57a694510ab1f06c77f763975685e" +checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b" dependencies = [ "cc", "cfg-if", @@ -3627,14 +3672,14 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] name = "substrait" -version = "0.55.1" +version = "0.56.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "048fe52a3664881ccdfdc9bdb0f4e8805f3444ee64abf299d365c54f6a2ffabb" +checksum = "13de2e20128f2a018dab1cfa30be83ae069219a65968c6f89df66ad124de2397" dependencies = [ "heck", "pbjson", @@ -3651,7 +3696,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.100", + "syn 2.0.101", "typify", "walkdir", ] @@ -3675,9 +3720,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.100" +version = "2.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" dependencies = [ "proc-macro2", "quote", @@ -3695,13 +3740,13 @@ dependencies = [ [[package]] name = "synstructure" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -3712,13 +3757,12 @@ checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" [[package]] name = "tempfile" -version = "3.16.0" +version = "3.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91" +checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" dependencies = [ - "cfg-if", "fastrand", - "getrandom 0.3.1", + "getrandom 0.3.3", "once_cell", "rustix", "windows-sys 0.59.0", @@ -3735,11 +3779,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.11" +version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" dependencies = [ - "thiserror-impl 2.0.11", + "thiserror-impl 2.0.12", ] [[package]] @@ -3750,18 +3794,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", 
+ "syn 2.0.101", ] [[package]] name = "thiserror-impl" -version = "2.0.11" +version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -3786,9 +3830,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.7.6" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" dependencies = [ "displaydoc", "zerovec", @@ -3796,9 +3840,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.8.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "022db8904dfa342efe721985167e9fcd16c29b226db4397ed752a761cfce81e8" +checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71" dependencies = [ "tinyvec_macros", ] @@ -3811,9 +3855,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.44.2" +version = "1.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48" +checksum = "2513ca694ef9ede0fb23fe71a4ee4107cb102b9dc1930f6d0fd77aae068ae165" dependencies = [ "backtrace", "bytes", @@ -3833,14 +3877,14 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] name = "tokio-rustls" -version = "0.26.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" +checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" dependencies = [ "rustls", "tokio", @@ -3848,9 +3892,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.14" +version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b9590b93e6fcc1739458317cccd391ad3955e2bde8913edf6f95f9e65a8f034" +checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" dependencies = [ "bytes", "futures-core", @@ -3905,7 +3949,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -3977,20 +4021,20 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] name = "typenum" -version = "1.17.0" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "typify" -version = "0.3.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e03ba3643450cfd95a1aca2e1938fef63c1c1994489337998aff4ad771f21ef8" +checksum = "6c6c647a34e851cf0260ccc14687f17cdcb8302ff1a8a687a24b97ca0f82406f" dependencies = [ "typify-impl", "typify-macro", @@ -3998,9 +4042,9 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.3.0" 
+version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bce48219a2f3154aaa2c56cbf027728b24a3c8fe0a47ed6399781de2b3f3eeaf" +checksum = "741b7f1e2e1338c0bee5ad5a7d3a9bbd4e24c33765c08b7691810e68d879365d" dependencies = [ "heck", "log", @@ -4011,16 +4055,16 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.100", - "thiserror 2.0.11", + "syn 2.0.101", + "thiserror 2.0.12", "unicode-ident", ] [[package]] name = "typify-macro" -version = "0.3.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b5780d745920ed73c5b7447496a9b5c42ed2681a9b70859377aec423ecf02b" +checksum = "7560adf816a1e8dad7c63d8845ef6e31e673e39eab310d225636779230cbedeb" dependencies = [ "proc-macro2", "quote", @@ -4029,15 +4073,15 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.100", + "syn 2.0.101", "typify-impl", ] [[package]] name = "unicode-ident" -version = "1.0.16" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "unicode-segmentation" @@ -4053,9 +4097,9 @@ checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] name = "unindent" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" [[package]] name = "unsafe-libyaml" @@ -4080,12 +4124,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "utf16_iter" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" - [[package]] name = "utf8_iter" version = "1.0.4" @@ -4094,11 +4132,11 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.16.0" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" +checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" dependencies = [ - "getrandom 0.3.1", + "getrandom 0.3.3", "js-sys", "serde", "wasm-bindgen", @@ -4137,9 +4175,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasi" -version = "0.13.3+wasi-0.2.2" +version = "0.14.2+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" dependencies = [ "wit-bindgen-rt", ] @@ -4166,7 +4204,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", "wasm-bindgen-shared", ] @@ -4201,7 +4239,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4281,11 +4319,37 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-core" -version = "0.52.0" +version = "0.61.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"46ec44dc15085cea82cf9c78f85a9114c463a369786585ad2882d1ff0b0acf40" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings 0.4.1", +] + +[[package]] +name = "windows-implement" +version = "0.60.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.101", +] + +[[package]] +name = "windows-interface" +version = "0.59.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" dependencies = [ - "windows-targets", + "proc-macro2", + "quote", + "syn 2.0.101", ] [[package]] @@ -4296,32 +4360,40 @@ checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" [[package]] name = "windows-registry" -version = "0.2.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +checksum = "4286ad90ddb45071efd1a66dfa43eb02dd0dfbae1545ad6cc3c51cf34d7e8ba3" dependencies = [ "windows-result", - "windows-strings", - "windows-targets", + "windows-strings 0.3.1", + "windows-targets 0.53.0", ] [[package]] name = "windows-result" -version = "0.2.0" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +checksum = "4b895b5356fc36103d0f64dd1e94dfa7ac5633f1c9dd6e80fe9ec4adef69e09d" dependencies = [ - "windows-targets", + "windows-link", ] [[package]] name = "windows-strings" -version = "0.1.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +checksum = "87fa48cc5d406560701792be122a10132491cff9d0aeb23583cc2dcafc847319" dependencies = [ - "windows-result", - "windows-targets", + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a7ab927b2637c19b3dbe0965e75d8f2d30bdd697a1516191cad2ec4df8fb28a" +dependencies = [ + "windows-link", ] [[package]] @@ -4330,7 +4402,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -4339,7 +4411,7 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -4348,14 +4420,30 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + 
"windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -4364,68 +4452,110 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "wit-bindgen-rt" -version = "0.33.0" +version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ "bitflags", ] -[[package]] -name = "write16" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" - [[package]] name = "writeable" -version = "0.5.5" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" [[package]] name = "xz2" @@ -4438,9 +4568,9 @@ dependencies = [ [[package]] name = "yoke" -version = "0.7.5" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" dependencies = [ "serde", "stable_deref_trait", @@ -4450,55 +4580,54 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.7.5" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", "synstructure", ] [[package]] name = "zerocopy" -version = "0.7.35" +version = "0.8.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" dependencies = [ - "byteorder", "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.35" +version = "0.8.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] name = "zerofrom" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", "synstructure", ] @@ -4508,11 +4637,22 @@ version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" 
+[[package]] +name = "zerotrie" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + [[package]] name = "zerovec" -version = "0.10.4" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" dependencies = [ "yoke", "zerofrom", @@ -4521,13 +4661,13 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.10.3" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.100", + "syn 2.0.101", ] [[package]] @@ -4538,27 +4678,27 @@ checksum = "868b928d7949e09af2f6086dfc1e01936064cc7a819253bce650d4e2a2d63ba8" [[package]] name = "zstd" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "7.2.1" +version = "7.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" dependencies = [ "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.13+zstd.1.5.6" +version = "2.0.15+zstd.1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" +checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" dependencies = [ "cc", "pkg-config", diff --git a/Cargo.toml b/Cargo.toml index 2c4188bb0..4135e64e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "46.0.0" +version = "47.0.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] @@ -34,20 +34,20 @@ protoc = [ "datafusion-substrait/protoc" ] substrait = ["dep:datafusion-substrait"] [dependencies] -tokio = { version = "1.44", features = ["macros", "rt", "rt-multi-thread", "sync"] } +tokio = { version = "1.45", features = ["macros", "rt", "rt-multi-thread", "sync"] } pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"] } pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]} -arrow = { version = "55.0.0", features = ["pyarrow"] } -datafusion = { version = "47.0.0", features = ["avro", "unicode_expressions"] } -datafusion-substrait = { version = "47.0.0", optional = true } -datafusion-proto = { version = "47.0.0" } -datafusion-ffi = { version = "47.0.0" } +arrow = { version = "55.1.0", features = ["pyarrow"] } +datafusion = { version = "48.0.0", features = ["avro", "unicode_expressions"] } +datafusion-substrait = { version = "48.0.0", optional = true } +datafusion-proto = { version = "48.0.0" } +datafusion-ffi = { version = "48.0.0" } prost = "0.13.1" # keep in line with `datafusion-substrait` uuid = { version = 
"1.16", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } async-trait = "0.1.88" futures = "0.3" -object_store = { version = "0.12.0", features = ["aws", "gcp", "azure", "http"] } +object_store = { version = "0.12.1", features = ["aws", "gcp", "azure", "http"] } url = "2" [build-dependencies] diff --git a/dev/changelog/47.0.0.md b/dev/changelog/47.0.0.md new file mode 100644 index 000000000..a7ed90313 --- /dev/null +++ b/dev/changelog/47.0.0.md @@ -0,0 +1,64 @@ + + +# Apache DataFusion Python 47.0.0 Changelog + +This release consists of 23 commits from 5 contributors. See credits at the end of this changelog for more information. + +**Implemented enhancements:** + +- feat: support unparser [#1088](https://github.com/apache/datafusion-python/pull/1088) (chenkovsky) +- feat: update datafusion dependency 47 [#1107](https://github.com/apache/datafusion-python/pull/1107) (timsaucer) +- feat: alias with metadata [#1111](https://github.com/apache/datafusion-python/pull/1111) (chenkovsky) +- feat: add missing PyLogicalPlan to_variant [#1085](https://github.com/apache/datafusion-python/pull/1085) (chenkovsky) +- feat: add user defined table function support [#1113](https://github.com/apache/datafusion-python/pull/1113) (timsaucer) + +**Fixed bugs:** + +- fix: recursive import [#1117](https://github.com/apache/datafusion-python/pull/1117) (chenkovsky) + +**Other:** + +- Update changelog and version number [#1089](https://github.com/apache/datafusion-python/pull/1089) (timsaucer) +- Documentation updates: mention correct dataset on basics page [#1081](https://github.com/apache/datafusion-python/pull/1081) (floscha) +- Add Configurable HTML Table Formatter for DataFusion DataFrames in Python [#1100](https://github.com/apache/datafusion-python/pull/1100) (kosiew) +- Add DataFrame usage guide with HTML rendering customization options [#1108](https://github.com/apache/datafusion-python/pull/1108) (kosiew) +- 1075/enhancement/Make col class with __getattr__ [#1076](https://github.com/apache/datafusion-python/pull/1076) (deanm0000) +- 1064/enhancement/add functions to Expr class [#1074](https://github.com/apache/datafusion-python/pull/1074) (deanm0000) +- ci: require approving review [#1122](https://github.com/apache/datafusion-python/pull/1122) (timsaucer) +- Partial fix for 1078: Enhance DataFrame Formatter Configuration with Memory and Display Controls [#1119](https://github.com/apache/datafusion-python/pull/1119) (kosiew) +- Add fill_null method to DataFrame API for handling missing values [#1019](https://github.com/apache/datafusion-python/pull/1019) (kosiew) +- minor: reduce error size [#1126](https://github.com/apache/datafusion-python/pull/1126) (timsaucer) +- Move the udf module to user_defined [#1112](https://github.com/apache/datafusion-python/pull/1112) (timsaucer) +- add unit tests for expression functions [#1121](https://github.com/apache/datafusion-python/pull/1121) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 12 Tim Saucer + 4 Chen Chongchen + 4 kosiew + 2 deanm0000 + 1 Florian Schäfer +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
+ diff --git a/dev/release/README.md b/dev/release/README.md index f0b333999..692473930 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -56,6 +56,8 @@ Before creating a new release: - a PR should be created and merged to update the major version number of the project - A new release branch should be created, such as `branch-0.8` +## Preparing a Release Candidate + ### Change Log We maintain a `CHANGELOG.md` so our users know what has been changed between releases. @@ -76,21 +78,17 @@ Categorizing pull requests Generating changelog content ``` -This process is not fully automated, so there are some additional manual steps: - -- Add the ASF header to the generated file -- Add a link to this changelog from the top-level `/datafusion/CHANGELOG.md` -- Add the following content (copy from the previous version's changelog and update as appropriate: +### Update the version number -``` -## [24.0.0](https://github.com/apache/datafusion-python/tree/24.0.0) (2023-05-06) +The only place you should need to update the version is in the root `Cargo.toml`. -[Full Changelog](https://github.com/apache/datafusion-python/compare/23.0.0...24.0.0) -``` +### Tag the Repository -### Preparing a Release Candidate +Commit the changes to the changelog and version. -### Tag the Repository +Assuming you have set up a remote to the `apache` repository rather than your personal fork, +you need to push a tag to start the CI process for release candidates. The following assumes +the upstream repository is called `apache`. ```bash git tag 0.8.0-rc1 diff --git a/docs/source/api/dataframe.rst b/docs/source/api/dataframe.rst new file mode 100644 index 000000000..a9e9e47c8 --- /dev/null +++ b/docs/source/api/dataframe.rst @@ -0,0 +1,387 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +================= +DataFrame API +================= + +Overview +-------- + +The ``DataFrame`` class is the core abstraction in DataFusion that represents tabular data and operations +on that data. DataFrames provide a flexible API for transforming data through various operations such as +filtering, projection, aggregation, joining, and more. + +A DataFrame represents a logical plan that is lazily evaluated. The actual execution occurs only when +terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called. + +Creating DataFrames +------------------- + +DataFrames can be created in several ways: + +* From SQL queries via a ``SessionContext``: + + .. code-block:: python + + from datafusion import SessionContext + + ctx = SessionContext() + df = ctx.sql("SELECT * FROM your_table") + +* From registered tables: + + .. code-block:: python + + df = ctx.table("your_table") + +* From various data sources: + + .. 
code-block:: python + + # From CSV files (see :ref:`io_csv` for detailed options) + df = ctx.read_csv("path/to/data.csv") + + # From Parquet files (see :ref:`io_parquet` for detailed options) + df = ctx.read_parquet("path/to/data.parquet") + + # From JSON files (see :ref:`io_json` for detailed options) + df = ctx.read_json("path/to/data.json") + + # From Avro files (see :ref:`io_avro` for detailed options) + df = ctx.read_avro("path/to/data.avro") + + # From Pandas DataFrame + import pandas as pd + pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = ctx.from_pandas(pandas_df) + + # From Arrow data + import pyarrow as pa + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"] + ) + df = ctx.from_arrow(batch) + + For detailed information about reading from different data sources, see the :doc:`I/O Guide <../user-guide/io/index>`. + For custom data sources, see :ref:`io_custom_table_provider`. + +Common DataFrame Operations +--------------------------- + +DataFusion's DataFrame API offers a wide range of operations: + +.. code-block:: python + + from datafusion import column, literal + + # Select specific columns + df = df.select("col1", "col2") + + # Select with expressions + df = df.select(column("a") + column("b"), column("a") - column("b")) + + # Filter rows + df = df.filter(column("age") > literal(25)) + + # Add computed columns + df = df.with_column("full_name", column("first_name") + literal(" ") + column("last_name")) + + # Multiple column additions + df = df.with_columns( + (column("a") + column("b")).alias("sum"), + (column("a") * column("b")).alias("product") + ) + + # Sort data + df = df.sort(column("age").sort(ascending=False)) + + # Join DataFrames + df = df1.join(df2, on="user_id", how="inner") + + # Aggregate data + from datafusion import functions as f + df = df.aggregate( + [], # Group by columns (empty for global aggregation) + [f.sum(column("amount")).alias("total_amount")] + ) + + # Limit rows + df = df.limit(100) + + # Drop columns + df = df.drop("temporary_column") + +Terminal Operations +------------------- + +To materialize the results of your DataFrame operations: + +.. code-block:: python + + # Collect all data as PyArrow RecordBatches + result_batches = df.collect() + + # Convert to various formats + pandas_df = df.to_pandas() # Pandas DataFrame + polars_df = df.to_polars() # Polars DataFrame + arrow_table = df.to_arrow_table() # PyArrow Table + py_dict = df.to_pydict() # Python dictionary + py_list = df.to_pylist() # Python list of dictionaries + + # Display results + df.show() # Print tabular format to console + + # Count rows + count = df.count() + +HTML Rendering in Jupyter +------------------------- + +When working in Jupyter notebooks or other environments that support rich HTML display, +DataFusion DataFrames automatically render as nicely formatted HTML tables. This functionality +is provided by the ``_repr_html_`` method, which is automatically called by Jupyter. + +Basic HTML Rendering +~~~~~~~~~~~~~~~~~~~~ + +In a Jupyter environment, simply displaying a DataFrame object will trigger HTML rendering: + +.. code-block:: python + + # Will display as HTML table in Jupyter + df + + # Explicit display also uses HTML rendering + display(df) + +HTML Rendering Customization +---------------------------- + +DataFusion provides extensive customization options for HTML table rendering through the +``datafusion.html_formatter`` module. 
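+Because display goes through ``_repr_html_``, you can also capture the
+(customized) HTML directly. The following is a minimal sketch, assuming the
+standard IPython display protocol in which ``_repr_html_`` returns the HTML
+as a string; the output file name is only illustrative:
+
+.. code-block:: python
+
+    # Grab the HTML that Jupyter would render for this DataFrame
+    html = df._repr_html_()
+
+    # For example, write it out to inspect the generated markup
+    with open("preview.html", "w") as f:
+        f.write(html)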
+
+Configuring the HTML Formatter
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can customize how DataFrames are rendered by configuring the formatter:
+
+.. code-block:: python
+
+    from datafusion.html_formatter import configure_formatter
+
+    configure_formatter(
+        max_cell_length=30,               # Maximum length of cell content before truncation
+        max_width=800,                    # Maximum width of table in pixels
+        max_height=400,                   # Maximum height of table in pixels
+        max_memory_bytes=2 * 1024 * 1024, # Maximum memory used for rendering (2MB)
+        min_rows_display=10,              # Minimum rows to display
+        repr_rows=20,                     # Number of rows to display in representation
+        enable_cell_expansion=True,       # Allow cells to be expandable on click
+        custom_css=None,                  # Custom CSS to apply
+        show_truncation_message=True,     # Show message when data is truncated
+        style_provider=None,              # Custom style provider class
+        use_shared_styles=True            # Share styles across tables to reduce duplication
+    )
+
+Custom Style Providers
+~~~~~~~~~~~~~~~~~~~~~~
+
+For advanced styling needs, you can create a custom style provider class:
+
+.. code-block:: python
+
+    from datafusion.html_formatter import configure_formatter
+
+    class CustomStyleProvider:
+        def get_cell_style(self) -> str:
+            return "background-color: #f5f5f5; color: #333; padding: 8px; border: 1px solid #ddd;"
+
+        def get_header_style(self) -> str:
+            return "background-color: #4285f4; color: white; font-weight: bold; padding: 10px;"
+
+    # Apply custom styling
+    configure_formatter(style_provider=CustomStyleProvider())
+
+Custom Type Formatters
+~~~~~~~~~~~~~~~~~~~~~~
+
+You can register custom formatters for specific data types:
+
+.. code-block:: python
+
+    import datetime
+
+    from datafusion.html_formatter import get_formatter
+
+    formatter = get_formatter()
+
+    # Format integers with color based on value
+    def format_int(value):
+        return f'<span style="color: {"red" if value > 100 else "blue"}">{value}</span>'
+
+    formatter.register_formatter(int, format_int)
+
+    # Format date values (the wrapping markup shown here is illustrative)
+    def format_date(value):
+        return f'<span class="date-value">{value.isoformat()}</span>'
+
+    formatter.register_formatter(datetime.date, format_date)
+
+Custom Cell Builders
+~~~~~~~~~~~~~~~~~~~~
+
+For complete control over cell rendering:
+
+.. code-block:: python
+
+    formatter = get_formatter()
+
+    def custom_cell_builder(value, row, col, table_id):
+        try:
+            num_value = float(value)
+            if num_value > 0:  # Positive values get green
+                return f'<td style="color: green">{value}</td>'
+            if num_value < 0:  # Negative values get red
+                return f'<td style="color: red">{value}</td>'
+        except (ValueError, TypeError):
+            pass
+
+        # Default styling for non-numeric or zero values
+        return f'<td>{value}</td>'
+
+    formatter.set_custom_cell_builder(custom_cell_builder)
+
+Custom Header Builders
+~~~~~~~~~~~~~~~~~~~~~~
+
+Similarly, you can customize the rendering of table headers:
+
+.. code-block:: python
+
+    def custom_header_builder(field):
+        tooltip = f"Type: {field.type}"
+        return f'<th title="{tooltip}">{field.name}</th>'
+
+    formatter.set_custom_header_builder(custom_header_builder)
+
+Managing Formatter State
+------------------------
+
+The HTML formatter maintains global state that can be managed:
+
+.. code-block:: python
+
+    from datafusion.html_formatter import reset_formatter, reset_styles_loaded_state, get_formatter
+
+    # Reset the formatter to default settings
+    reset_formatter()
+
+    # Reset only the styles loaded state (useful when styles were loaded but need reloading)
+    reset_styles_loaded_state()
+
+    # Get the current formatter instance to make changes
+    formatter = get_formatter()
+
+Advanced Example: Dashboard-Style Formatting
+--------------------------------------------
+
+This example shows how to create dashboard-like styling for your DataFrames:
+
+.. code-block:: python
+
+    from datafusion.html_formatter import configure_formatter, get_formatter
+
+    # Define custom CSS
+    custom_css = """
+    .datafusion-table {
+        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+        border-collapse: collapse;
+        width: 100%;
+        box-shadow: 0 2px 3px rgba(0,0,0,0.1);
+    }
+    .datafusion-table th {
+        position: sticky;
+        top: 0;
+        z-index: 10;
+    }
+    .datafusion-table tr:hover td {
+        background-color: #f1f7fa !important;
+    }
+    .datafusion-table .numeric-positive {
+        color: #0a7c00;
+    }
+    .datafusion-table .numeric-negative {
+        color: #d13438;
+    }
+    """
+
+    class DashboardStyleProvider:
+        def get_cell_style(self) -> str:
+            return "padding: 8px 12px; border-bottom: 1px solid #e0e0e0;"
+
+        def get_header_style(self) -> str:
+            return ("background-color: #0078d4; color: white; font-weight: 600; "
+                    "padding: 12px; text-align: left; border-bottom: 2px solid #005a9e;")
+
+    # Apply configuration
+    configure_formatter(
+        max_height=500,
+        enable_cell_expansion=True,
+        custom_css=custom_css,
+        style_provider=DashboardStyleProvider(),
+        max_cell_length=50
+    )
+
+    # Add custom formatters for numbers
+    formatter = get_formatter()
+
+    def format_number(value):
+        try:
+            num = float(value)
+            cls = "numeric-positive" if num > 0 else "numeric-negative" if num < 0 else ""
+            return f'<span class="{cls}">{value:,}</span>' if cls else f'{value:,}'
+        except (ValueError, TypeError):
+            return str(value)
+
+    formatter.register_formatter(int, format_number)
+    formatter.register_formatter(float, format_number)
+
+Best Practices
+--------------
+
+1. **Memory Management**: For large datasets, use ``max_memory_bytes`` to limit memory usage.
+
+2. **Responsive Design**: Set reasonable ``max_width`` and ``max_height`` values to ensure tables display well on different screens.
+
+3. **Style Optimization**: Use ``use_shared_styles=True`` to avoid duplicate style definitions when displaying multiple tables.
+
+4. **Reset When Needed**: Call ``reset_formatter()`` when you want to start fresh with default settings.
+
+5. **Cell Expansion**: Use ``enable_cell_expansion=True`` when cells might contain longer content that users may want to see in full.
+
+Additional Resources
+--------------------
+
+* :doc:`../user-guide/dataframe` - Complete guide to using DataFrames
+* :doc:`../user-guide/io/index` - I/O Guide for reading data from various sources
+* :doc:`../user-guide/data-sources` - Comprehensive data sources guide
+* :ref:`io_csv` - CSV file reading
+* :ref:`io_parquet` - Parquet file reading
+* :ref:`io_json` - JSON file reading
+* :ref:`io_avro` - Avro file reading
+* :ref:`io_custom_table_provider` - Custom table providers
+* `API Reference `_ - Full API reference
diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
new file mode 100644
index 000000000..7f58227ca
--- /dev/null
+++ b/docs/source/api/index.rst
@@ -0,0 +1,27 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. 
or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============= +API Reference +============= + +This section provides detailed API documentation for the DataFusion Python library. + +.. toctree:: + :maxdepth: 2 + + dataframe diff --git a/docs/source/conf.py b/docs/source/conf.py index 0be03d81d..28db17d35 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -71,6 +71,7 @@ autoapi_member_order = "groupwise" suppress_warnings = ["autoapi.python_import_resolution"] autoapi_python_class_content = "both" +autoapi_keep_files = False # set to True for debugging generated files def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa: ARG001 @@ -79,6 +80,9 @@ def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa ("class", "datafusion.DataFrame"), ("class", "datafusion.SessionContext"), ("module", "datafusion.common"), + # Duplicate modules (skip module-level docs to avoid duplication) + ("module", "datafusion.col"), + ("module", "datafusion.udf"), # Deprecated ("class", "datafusion.substrait.serde"), ("class", "datafusion.substrait.plan"), diff --git a/docs/source/index.rst b/docs/source/index.rst index c18793822..ff1e47280 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -93,3 +93,5 @@ Example :hidden: :maxdepth: 1 :caption: API + + api/index diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst index 12097be8f..ccb47a4e7 100644 --- a/docs/source/user-guide/common-operations/functions.rst +++ b/docs/source/user-guide/common-operations/functions.rst @@ -129,3 +129,24 @@ The function :py:func:`~datafusion.functions.in_list` allows to check a column f .limit(20) .to_pandas() ) + + +Handling Missing Values +======================= + +DataFusion provides methods to handle missing values in DataFrames: + +fill_null +--------- + +The ``fill_null()`` method replaces NULL values in specified columns with a provided value: + +.. code-block:: python + + # Fill all NULL values with 0 where possible + df = df.fill_null(0) + + # Fill NULL values only in specific string columns + df = df.fill_null("missing", subset=["name", "category"]) + +The fill value will be cast to match each column's type. If casting fails for a column, that column remains unchanged. diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.rst b/docs/source/user-guide/common-operations/udf-and-udfa.rst index ffd7a05cb..0830fa81c 100644 --- a/docs/source/user-guide/common-operations/udf-and-udfa.rst +++ b/docs/source/user-guide/common-operations/udf-and-udfa.rst @@ -26,7 +26,7 @@ Scalar Functions When writing a user-defined function that can operate on a row by row basis, these are called Scalar Functions. You can define your own scalar function by calling -:py:func:`~datafusion.udf.ScalarUDF.udf` . 
+:py:func:`~datafusion.user_defined.ScalarUDF.udf` .
 
 The basic definition of a scalar UDF is a python function that takes one or more
 `pyarrow `_ arrays and returns a single array as
@@ -93,9 +93,9 @@ converting to Python objects to do the evaluation.
 Aggregate Functions
 -------------------
 
-The :py:func:`~datafusion.udf.AggregateUDF.udaf` function allows you to define User-Defined
+The :py:func:`~datafusion.user_defined.AggregateUDF.udaf` function allows you to define User-Defined
 Aggregate Functions (UDAFs). To use this you must implement an
-:py:class:`~datafusion.udf.Accumulator` that determines how the aggregation is performed.
+:py:class:`~datafusion.user_defined.Accumulator` that determines how the aggregation is performed.
 
 When defining a UDAF there are four methods you need to implement. The ``update`` function takes the
 array(s) of input and updates the internal state of the accumulator. You should define this function
@@ -153,8 +153,8 @@ Window Functions
 ----------------
 
 To implement a User-Defined Window Function (UDWF) you must call the
-:py:func:`~datafusion.udf.WindowUDF.udwf` function using a class that implements the abstract
-class :py:class:`~datafusion.udf.WindowEvaluator`.
+:py:func:`~datafusion.user_defined.WindowUDF.udwf` function using a class that implements the abstract
+class :py:class:`~datafusion.user_defined.WindowEvaluator`.
 
 There are three methods of evaluation of UDWFs.
 
@@ -207,7 +207,7 @@ determine which evaluate functions are called.
 
     import pyarrow as pa
     from datafusion import udwf, col, SessionContext
-    from datafusion.udf import WindowEvaluator
+    from datafusion.user_defined import WindowEvaluator
 
     class ExponentialSmooth(WindowEvaluator):
         def __init__(self, alpha: float) -> None:
@@ -242,3 +242,35 @@ determine which evaluate functions are called.
     })
 
     df.select("a", exp_smooth(col("a")).alias("smooth_a")).show()
+
+Table Functions
+---------------
+
+User-Defined Table Functions are slightly different from the other functions
+described here. These functions take any number of ``Expr`` arguments, but only
+literal expressions are supported. Table functions must return a Table
+Provider as described in the :ref:`io_custom_table_provider` page.
+
+Once you have a table function, you can register it with the session context
+by using :py:func:`datafusion.context.SessionContext.register_udtf`.
+
+There are examples of both Rust-backed and Python-based table functions in the
+examples folder of the repository. If you have a Rust-backed table function
+that you wish to make available via PyO3, you need to expose it as a ``PyCapsule``:
+
+.. code-block:: rust
+
+    #[pymethods]
+    impl MyTableFunction {
+        fn __datafusion_table_function__<'py>(
+            &self,
+            py: Python<'py>,
+        ) -> PyResult<Bound<'py, PyCapsule>> {
+            let name = cr"datafusion_table_function".into();
+
+            let func = self.clone();
+            let provider = FFI_TableFunction::new(Arc::new(func), None);
+
+            PyCapsule::new(py, provider, Some(name))
+        }
+    }
diff --git a/docs/source/user-guide/dataframe.rst b/docs/source/user-guide/dataframe.rst
index 11e3d7e72..23c65b5f6 100644
--- a/docs/source/user-guide/dataframe.rst
+++ b/docs/source/user-guide/dataframe.rst
@@ -122,7 +122,8 @@ Performance Optimization with Shared Styles
 The ``use_shared_styles`` parameter (enabled by default) optimizes performance when displaying
 multiple DataFrames in notebook environments:
- .. code-block:: python
+.. 
code-block:: python + from datafusion.html_formatter import StyleProvider, configure_formatter # Default: Use shared styles (recommended for notebooks) configure_formatter(use_shared_styles=True) diff --git a/examples/ffi-table-provider/.cargo/config.toml b/examples/datafusion-ffi-example/.cargo/config.toml similarity index 100% rename from examples/ffi-table-provider/.cargo/config.toml rename to examples/datafusion-ffi-example/.cargo/config.toml diff --git a/examples/ffi-table-provider/Cargo.lock b/examples/datafusion-ffi-example/Cargo.lock similarity index 71% rename from examples/ffi-table-provider/Cargo.lock rename to examples/datafusion-ffi-example/Cargo.lock index 8d0edd515..075ebd5a1 100644 --- a/examples/ffi-table-provider/Cargo.lock +++ b/examples/datafusion-ffi-example/Cargo.lock @@ -67,13 +67,13 @@ checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" [[package]] name = "ahash" -version = "0.8.11" +version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", "const-random", - "getrandom", + "getrandom 0.3.3", "once_cell", "version_check", "zerocopy", @@ -105,9 +105,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" [[package]] name = "android-tzdata" @@ -126,9 +126,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.93" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" [[package]] name = "arrayref" @@ -144,9 +144,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" +checksum = "b1bb018b6960c87fd9d025009820406f74e83281185a8bdcb44880d2aa5c9a87" dependencies = [ "arrow-arith", "arrow-array", @@ -165,9 +165,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" +checksum = "44de76b51473aa888ecd6ad93ceb262fb8d40d1f1154a4df2f069b3590aa7575" dependencies = [ "arrow-array", "arrow-buffer", @@ -179,9 +179,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223" +checksum = "29ed77e22744475a9a53d00026cf8e166fe73cf42d89c4c4ae63607ee1cfcc3f" dependencies = [ "ahash", "arrow-buffer", @@ -190,15 +190,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.15.1", + "hashbrown 0.15.3", "num", ] [[package]] name = "arrow-buffer" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89" +checksum = "b0391c96eb58bf7389171d1e103112d3fc3e5625ca6b372d606f2688f1ea4cce" dependencies = [ "bytes", "half", @@ -207,9 +207,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" +checksum = "f39e1d774ece9292697fcbe06b5584401b26bd34be1bec25c33edae65c2420ff" dependencies = [ "arrow-array", "arrow-buffer", @@ -228,9 +228,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" +checksum = "9055c972a07bf12c2a827debfd34f88d3b93da1941d36e1d9fee85eebe38a12a" dependencies = [ "arrow-array", "arrow-cast", @@ -244,9 +244,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754" +checksum = "cf75ac27a08c7f48b88e5c923f267e980f27070147ab74615ad85b5c5f90473d" dependencies = [ "arrow-buffer", "arrow-schema", @@ -256,9 +256,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e" +checksum = "a222f0d93772bd058d1268f4c28ea421a603d66f7979479048c429292fac7b2e" dependencies = [ "arrow-array", "arrow-buffer", @@ -270,9 +270,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" +checksum = "9085342bbca0f75e8cb70513c0807cc7351f1fbf5cb98192a67d5e3044acb033" dependencies = [ "arrow-array", "arrow-buffer", @@ -283,16 +283,18 @@ dependencies = [ "half", "indexmap", "lexical-core", + "memchr", "num", "serde", "serde_json", + "simdutf8", ] [[package]] name = "arrow-ord" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" +checksum = "ab2f1065a5cad7b9efa9e22ce5747ce826aa3855766755d4904535123ef431e7" dependencies = [ "arrow-array", "arrow-buffer", @@ -303,9 +305,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" +checksum = "3703a0e3e92d23c3f756df73d2dc9476873f873a76ae63ef9d3de17fda83b2d8" dependencies = [ "arrow-array", "arrow-buffer", @@ -316,18 +318,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6" +checksum = "73a47aa0c771b5381de2b7f16998d351a6f4eb839f1e13d48353e17e873d969b" dependencies = [ - "bitflags 2.6.0", + "bitflags", ] [[package]] name = "arrow-select" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" +checksum = 
"24b7b85575702b23b85272b01bc1c25a01c9b9852305e5d0078c79ba25d995d4" dependencies = [ "ahash", "arrow-array", @@ -339,9 +341,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" +checksum = "9260fddf1cdf2799ace2b4c2fc0356a9789fa7551e0953e35435536fecefebbd" dependencies = [ "arrow-array", "arrow-buffer", @@ -368,11 +370,11 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.17" +version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cb8f1d480b0ea3783ab015936d2a55c87e219676f0c0b7dec61494043f21857" +checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" dependencies = [ - "bzip2 0.4.4", + "bzip2", "flate2", "futures-core", "memchr", @@ -394,13 +396,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.83" +version = "0.1.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" +checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] @@ -420,9 +422,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "backtrace" -version = "0.3.74" +version = "0.3.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" +checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" dependencies = [ "addr2line", "cfg-if", @@ -441,9 +443,9 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bigdecimal" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f31f3af01c5c65a07985c804d3366560e6fa7883d640a122819b14ec327482c" +checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" dependencies = [ "autocfg", "libm", @@ -454,15 +456,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.6.0" +version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" [[package]] name = "blake2" @@ -475,9 +471,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.4" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" +checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0" dependencies = [ "arrayref", "arrayvec", @@ -497,9 +493,9 @@ dependencies = [ [[package]] name = "brotli" -version = "7.0.0" +version = "8.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" +checksum = "9991eea70ea4f293524138648e41ee89b0b2b12ddef3b255effa43c8056e0e0d" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -508,9 +504,9 @@ dependencies = [ [[package]] name = 
"brotli-decompressor" -version = "4.0.1" +version = "5.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -518,9 +514,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.16.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "byteorder" @@ -530,46 +526,34 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.8.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" - -[[package]] -name = "bzip2" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" -dependencies = [ - "bzip2-sys", - "libc", -] +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" [[package]] name = "bzip2" -version = "0.5.0" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bafdbf26611df8c14810e268ddceda071c297570a5fb360ceddf617fe417ef58" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" dependencies = [ "bzip2-sys", - "libc", ] [[package]] name = "bzip2-sys" -version = "0.1.11+1.0.8" +version = "0.1.13+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" dependencies = [ "cc", - "libc", "pkg-config", ] [[package]] name = "cc" -version = "1.1.37" +version = "1.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40545c26d092346d8a8dab71ee48e7685a7a9cba76e634790c215b41a4a7b4cf" +checksum = "5f4ac86a9e5bc1e2b3449ab9d7d3a6a405e3d1bb28d7b9be8614f55846ae3766" dependencies = [ "jobserver", "libc", @@ -584,21 +568,21 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.38" +version = "0.4.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d" dependencies = [ "android-tzdata", "iana-time-zone", "num-traits", - "windows-targets", + "windows-link", ] [[package]] name = "chrono-tz" -version = "0.10.0" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" +checksum = "efdce149c370f133a071ca8ef6ea340b7b88748ab0810097a9e2976eaa34b4f3" dependencies = [ "chrono", "chrono-tz-build", @@ -607,9 +591,9 @@ dependencies = [ [[package]] name = "chrono-tz-build" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94fea34d77a245229e7746bd2beb786cd2a896f306ff491fb8cecb3074b10a7" +checksum = "8f10f8c9340e31fc120ff885fcdb54a0b48e474bbd77cab557f0c30a3e569402" dependencies = [ "parse-zoneinfo", 
"phf_codegen", @@ -617,12 +601,11 @@ dependencies = [ [[package]] name = "comfy-table" -version = "7.1.1" +version = "7.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" +checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" dependencies = [ - "strum", - "strum_macros", + "unicode-segmentation", "unicode-width", ] @@ -641,16 +624,16 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom", + "getrandom 0.2.16", "once_cell", "tiny-keccak", ] [[package]] name = "const_panic" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "013b6c2c3a14d678f38cd23994b02da3a1a1b6a5d1eedddfe63a5a5f11b13a81" +checksum = "2459fc9262a1aa204eb4b5764ad4f189caec88aea9634389c0a25f8be7f6265e" [[package]] name = "constant_time_eq" @@ -681,9 +664,9 @@ checksum = "69f3b219d28b6e3b4ac87bc1fc522e0803ab22e055da177bff0068c4150c61a6" [[package]] name = "cpufeatures" -version = "0.2.14" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ "libc", ] @@ -699,24 +682,24 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.13" +version = "0.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-utils" -version = "0.8.20" +version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" [[package]] name = "crypto-common" @@ -742,9 +725,9 @@ dependencies = [ [[package]] name = "csv-core" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" dependencies = [ "memchr", ] @@ -765,39 +748,44 @@ dependencies = [ [[package]] name = "datafusion" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a" +checksum = "ffe060b978f74ab446be722adb8a274e052e005bf6dfd171caadc3abaad10080" dependencies = [ "arrow", - "arrow-array", "arrow-ipc", "arrow-schema", - "async-compression", "async-trait", "bytes", - "bzip2 0.5.0", + "bzip2", "chrono", "datafusion-catalog", + "datafusion-catalog-listing", "datafusion-common", "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", 
"datafusion-execution", "datafusion-expr", + "datafusion-expr-common", "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-nested", "datafusion-functions-table", "datafusion-functions-window", + "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-optimizer", "datafusion-physical-plan", + "datafusion-session", "datafusion-sql", "flate2", "futures", - "glob", - "itertools 0.14.0", + "itertools", "log", "object_store", "parking_lot", @@ -807,7 +795,6 @@ dependencies = [ "sqlparser", "tempfile", "tokio", - "tokio-util", "url", "uuid", "xz2", @@ -816,37 +803,62 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f27987bc22b810939e8dfecc55571e9d50355d6ea8ec1c47af8383a76a6d0e1" +checksum = "61fe34f401bd03724a1f96d12108144f8cd495a3cdda2bf5e091822fb80b7e66" dependencies = [ "arrow", "async-trait", "dashmap", "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", + "datafusion-physical-expr", "datafusion-physical-plan", + "datafusion-session", "datafusion-sql", "futures", - "itertools 0.14.0", + "itertools", "log", + "object_store", "parking_lot", - "sqlparser", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4411b8e3bce5e0fc7521e44f201def2e2d5d1b5f176fb56e8cdc9942c890f00" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "log", + "object_store", + "tokio", ] [[package]] name = "datafusion-common" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602" +checksum = "0734015d81c8375eb5d4869b7f7ecccc2ee8d6cb81948ef737cd0e7b743bd69c" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ipc", - "arrow-schema", "base64", "half", "hashbrown 0.14.5", @@ -864,25 +876,143 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "45.0.0" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5167bb1d2ccbb87c6bc36c295274d7a0519b14afcfdaf401d53cbcaa4ef4968b" +dependencies = [ + "futures", + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e602dcdf2f50c2abf297cc2203c73531e6f48b29516af7695d338cf2a778b1" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "flate2", + "futures", + "glob", + "itertools", + "log", + "object_store", + "parquet", + "rand", + "tempfile", + "tokio", + "tokio-util", + "url", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "e3bb2253952dc32296ed5b84077cb2e0257fea4be6373e1c376426e17ead4ef6" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8c7f47a5d2fe03bfa521ec9bafdb8a5c82de8377f60967c3663f00c8790352" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "serde_json", + "tokio", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d4603c8e8a4baf77660ab7074cc66fc15cc8a18f2ce9dfadb755fc6ee294e48" +checksum = "27d15868ea39ed2dc266728b554f6304acd473de2142281ecfa1294bb7415923" dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools", "log", + "object_store", + "parking_lot", + "parquet", + "rand", "tokio", ] [[package]] name = "datafusion-doc" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5bf4bc68623a5cf231eed601ed6eb41f46a37c4d15d11a0bff24cbc8396cd66" +checksum = "a91f8c2c5788ef32f48ff56c68e5b545527b744822a284373ac79bba1ba47292" [[package]] name = "datafusion-execution" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88b491c012cdf8e051053426013429a76f74ee3c2db68496c79c323ca1084d27" +checksum = "06f004d100f49a3658c9da6fb0c3a9b760062d96cd4ad82ccc3b7b69a9fb2f84" dependencies = [ "arrow", "dashmap", @@ -899,9 +1029,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a181408d4fc5dc22f9252781a8f39f2d0e5d1b33ec9bde242844980a2689c1" +checksum = "7a4e4ce3802609be38eeb607ee72f6fe86c3091460de9dbfae9e18db423b3964" dependencies = [ "arrow", "chrono", @@ -920,25 +1050,25 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1129b48e8534d8c03c6543bcdccef0b55c8ac0c1272a15a56c67068b6eb1885" +checksum = "422ac9cf3b22bbbae8cdf8ceb33039107fde1b5492693168f13bd566b1bcc839" dependencies = [ "arrow", "datafusion-common", - "itertools 0.14.0", + "indexmap", + "itertools", "paste", ] [[package]] name = "datafusion-ffi" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ff47a79d442207c168c6e3e1d970c248589c148e4800e5b285ac1b2cb1a230f8" +checksum = "5cf3fe9ab492c56daeb7beed526690d33622d388b8870472e0b7b7f55490338c" dependencies = [ "abi_stable", "arrow", - "arrow-array", "arrow-schema", "async-ffi", "async-trait", @@ -953,9 +1083,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b" +checksum = "2ddf0a0a2db5d2918349c978d42d80926c6aa2459cd8a3c533a84ec4bb63479e" dependencies = [ "arrow", "arrow-buffer", @@ -969,9 +1099,8 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", - "hashbrown 0.14.5", "hex", - "itertools 0.14.0", + "itertools", "log", "md-5", "rand", @@ -983,14 +1112,12 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3add7b1d3888e05e7c95f2b281af900ca69ebdcb21069ba679b33bde8b3b9d6" +checksum = "408a05dafdc70d05a38a29005b8b15e21b0238734dab1e98483fcb58038c5aba" dependencies = [ "ahash", "arrow", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1006,9 +1133,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e18baa4cfc3d2f144f74148ed68a1f92337f5072b6dde204a0dbbdf3324989c" +checksum = "756d21da2dd6c9bef97af1504970ff56cbf35d03fbd4ffd62827f02f4d2279d4" dependencies = [ "ahash", "arrow", @@ -1019,15 +1146,12 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ec5ee8cecb0dc370291279673097ddabec03a011f73f30d7f1096457127e03e" +checksum = "8d8d50f6334b378930d992d801a10ac5b3e93b846b39e4a05085742572844537" dependencies = [ "arrow", - "arrow-array", - "arrow-buffer", "arrow-ord", - "arrow-schema", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1036,16 +1160,16 @@ dependencies = [ "datafusion-functions-aggregate", "datafusion-macros", "datafusion-physical-expr-common", - "itertools 0.14.0", + "itertools", "log", "paste", ] [[package]] name = "datafusion-functions-table" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c403ddd473bbb0952ba880008428b3c7febf0ed3ce1eec35a205db20efb2a36" +checksum = "cc9a97220736c8fff1446e936be90d57216c06f28969f9ffd3b72ac93c958c8a" dependencies = [ "arrow", "async-trait", @@ -1059,9 +1183,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ab18c2fb835614d06a75f24a9e09136d3a8c12a92d97c95a6af316a1787a9c5" +checksum = "cefc2d77646e1aadd1d6a9c40088937aedec04e68c5f0465939912e1291f8193" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1076,9 +1200,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a77b73bc15e7d1967121fdc7a55d819bfb9d6c03766a6c322247dce9094a53a4" +checksum = "dd4aff082c42fa6da99ce0698c85addd5252928c908eb087ca3cfa64ff16b313" dependencies = [ 
"datafusion-common", "datafusion-physical-expr-common", @@ -1086,20 +1210,20 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09369b8d962291e808977cf94d495fd8b5b38647232d7ef562c27ac0f495b0af" +checksum = "df6f88d7ee27daf8b108ba910f9015176b36fbc72902b1ca5c2a5f1d1717e1a1" dependencies = [ "datafusion-expr", "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] name = "datafusion-optimizer" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2403a7e4a84637f3de7d8d4d7a9ccc0cc4be92d89b0161ba3ee5be82f0531c54" +checksum = "084d9f979c4b155346d3c34b18f4256e6904ded508e9554d90fed416415c3515" dependencies = [ "arrow", "chrono", @@ -1107,7 +1231,7 @@ dependencies = [ "datafusion-expr", "datafusion-physical-expr", "indexmap", - "itertools 0.14.0", + "itertools", "log", "recursive", "regex", @@ -1116,15 +1240,12 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86ff72ac702b62dbf2650c4e1d715ebd3e4aab14e3885e72e8549e250307347c" +checksum = "64c536062b0076f4e30084065d805f389f9fe38af0ca75bcbac86bc5e9fbab65" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -1133,7 +1254,7 @@ dependencies = [ "half", "hashbrown 0.14.5", "indexmap", - "itertools 0.14.0", + "itertools", "log", "paste", "petgraph", @@ -1141,27 +1262,25 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60982b7d684e25579ee29754b4333057ed62e2cc925383c5f0bd8cab7962f435" +checksum = "f8a92b53b3193fac1916a1c5b8e3f4347c526f6822e56b71faa5fb372327a863" dependencies = [ "ahash", "arrow", - "arrow-buffer", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", - "itertools 0.14.0", + "itertools", ] [[package]] name = "datafusion-physical-optimizer" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac5e85c189d5238a5cf181a624e450c4cd4c66ac77ca551d6f3ff9080bac90bb" +checksum = "6fa0a5ac94c7cf3da97bedabd69d6bbca12aef84b9b37e6e9e8c25286511b5e2" dependencies = [ "arrow", - "arrow-schema", "datafusion-common", "datafusion-execution", "datafusion-expr", @@ -1169,23 +1288,19 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", - "futures", - "itertools 0.14.0", + "itertools", "log", "recursive", - "url", ] [[package]] name = "datafusion-physical-plan" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c36bf163956d7e2542657c78b3383fdc78f791317ef358a359feffcdb968106f" +checksum = "690c615db468c2e5fe5085b232d8b1c088299a6c63d87fd960a354a71f7acb55" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ord", "arrow-schema", "async-trait", @@ -1201,7 +1316,7 @@ dependencies = [ "half", "hashbrown 0.14.5", "indexmap", - "itertools 0.14.0", + "itertools", "log", "parking_lot", "pin-project-lite", @@ -1210,9 +1325,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "2db5d79f0c974041787b899d24dc91bdab2ff112d1942dd71356a4ce3b407e6c" +checksum = "a4a1afb2bdb05de7ff65be6883ebfd4ec027bd9f1f21c46aa3afd01927160a83" dependencies = [ "arrow", "chrono", @@ -1226,24 +1341,46 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de21bde1603aac0ff32cf478e47081be6e3583c6861fe8f57034da911efe7578" +checksum = "35b7a5876ebd6b564fb9a1fd2c3a2a9686b787071a256b47e4708f0916f9e46f" dependencies = [ "arrow", "datafusion-common", "prost", ] +[[package]] +name = "datafusion-session" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad229a134c7406c057ece00c8743c0c34b97f4e72f78b475fe17b66c5e14fa4f" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools", + "log", + "object_store", + "parking_lot", + "tokio", +] + [[package]] name = "datafusion-sql" -version = "45.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13caa4daede211ecec53c78b13c503b592794d125f9a3cc3afe992edf9e7f43" +checksum = "64f6ab28b72b664c21a27b22a2ff815fd390ed224c26e89a93b5a8154a4e8607" dependencies = [ "arrow", - "arrow-array", - "arrow-schema", "bigdecimal", "datafusion-common", "datafusion-expr", @@ -1273,36 +1410,36 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] name = "either" -version = "1.13.0" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "equivalent" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.3.9" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys", ] [[package]] name = "fastrand" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "ffi-table-provider" @@ -1325,24 +1462,31 @@ checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "24.12.23" +version = "25.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" +checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" dependencies = [ - "bitflags 1.3.2", + "bitflags", "rustc_version", ] [[package]] name = "flate2" -version = "1.0.34" +version = "1.1.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece" dependencies = [ "crc32fast", + "libz-rs-sys", "miniz_oxide", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1408,7 +1552,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] @@ -1462,13 +1606,25 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.15" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" dependencies = [ "cfg-if", "libc", - "wasi", + "r-efi", + "wasi 0.14.2+wasi-0.2.4", ] [[package]] @@ -1479,15 +1635,15 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "half" -version = "2.4.1" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" dependencies = [ "cfg-if", "crunchy", @@ -1506,9 +1662,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.1" +version = "0.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" +checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" [[package]] name = "heck" @@ -1522,22 +1678,34 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "http" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "humantime" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f" [[package]] name = "iana-time-zone" -version = "0.1.61" +version = "0.1.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" +checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8" dependencies = [ 
"android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", + "log", "wasm-bindgen", "windows-core", ] @@ -1553,21 +1721,22 @@ dependencies = [ [[package]] name = "icu_collections" -version = "1.5.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" dependencies = [ "displaydoc", + "potential_utf", "yoke", "zerofrom", "zerovec", ] [[package]] -name = "icu_locid" -version = "1.5.0" +name = "icu_locale_core" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" dependencies = [ "displaydoc", "litemap", @@ -1576,31 +1745,11 @@ dependencies = [ "zerovec", ] -[[package]] -name = "icu_locid_transform" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" -dependencies = [ - "displaydoc", - "icu_locid", - "icu_locid_transform_data", - "icu_provider", - "tinystr", - "zerovec", -] - -[[package]] -name = "icu_locid_transform_data" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" - [[package]] name = "icu_normalizer" -version = "1.5.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" dependencies = [ "displaydoc", "icu_collections", @@ -1608,67 +1757,54 @@ dependencies = [ "icu_properties", "icu_provider", "smallvec", - "utf16_iter", - "utf8_iter", - "write16", "zerovec", ] [[package]] name = "icu_normalizer_data" -version = "1.5.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" [[package]] name = "icu_properties" -version = "1.5.1" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +checksum = "2549ca8c7241c82f59c80ba2a6f415d931c5b58d24fb8412caa1a1f02c49139a" dependencies = [ "displaydoc", "icu_collections", - "icu_locid_transform", + "icu_locale_core", "icu_properties_data", "icu_provider", - "tinystr", + "potential_utf", + "zerotrie", "zerovec", ] [[package]] name = "icu_properties_data" -version = "1.5.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" +checksum = "8197e866e47b68f8f7d95249e172903bec06004b18b2937f1095d40a0c57de04" [[package]] name = "icu_provider" -version = "1.5.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" dependencies = [ "displaydoc", - "icu_locid", - "icu_provider_macros", + "icu_locale_core", "stable_deref_trait", "tinystr", "writeable", "yoke", "zerofrom", 
+ "zerotrie", "zerovec", ] -[[package]] -name = "icu_provider_macros" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.87", -] - [[package]] name = "idna" version = "1.0.3" @@ -1682,9 +1818,9 @@ dependencies = [ [[package]] name = "idna_adapter" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" dependencies = [ "icu_normalizer", "icu_properties", @@ -1692,19 +1828,19 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.1" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", - "hashbrown 0.15.1", + "hashbrown 0.15.3", ] [[package]] name = "indoc" -version = "2.0.5" +version = "2.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" +checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" [[package]] name = "integer-encoding" @@ -1712,15 +1848,6 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "itertools" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.14.0" @@ -1732,25 +1859,27 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jobserver" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" dependencies = [ + "getrandom 0.3.3", "libc", ] [[package]] name = "js-sys" -version = "0.3.72" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" dependencies = [ + "once_cell", "wasm-bindgen", ] @@ -1762,9 +1891,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458" +checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -1775,9 +1904,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "1.0.2" +version = "1.0.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0" +checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -1786,9 +1915,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61" +checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" dependencies = [ "lexical-util", "static_assertions", @@ -1796,18 +1925,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "1.0.3" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0" +checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809" +checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" dependencies = [ "lexical-util", "lexical-write-integer", @@ -1816,9 +1945,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162" +checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" dependencies = [ "lexical-util", "static_assertions", @@ -1826,9 +1955,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.162" +version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libloading" @@ -1842,21 +1971,30 @@ dependencies = [ [[package]] name = "libm" -version = "0.2.11" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + +[[package]] +name = "libz-rs-sys" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" +checksum = "6489ca9bd760fe9642d7644e827b0c9add07df89857b0416ee15c1cc1a3b8c5a" +dependencies = [ + "zlib-rs", +] [[package]] name = "linux-raw-sys" -version = "0.4.14" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" [[package]] name = "litemap" -version = "0.7.3" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" [[package]] name = "lock_api" @@ -1870,9 +2008,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.22" +version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "lz4_flex" @@ -1880,7 +2018,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" dependencies = [ - "twox-hash", + "twox-hash 1.6.3", ] [[package]] @@ -1921,9 +2059,9 @@ dependencies = [ [[package]] name = "miniz_oxide" -version = "0.8.0" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" dependencies = [ "adler2", ] @@ -2004,39 +2142,42 @@ dependencies = [ [[package]] name = "object" -version = "0.36.5" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.11.1" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eb4c22c6154a1e759d7099f9ffad7cc5ef8245f9efbab4a41b92623079c82f3" +checksum = "d94ac16b433c0ccf75326388c893d2835ab7457ea35ab8ba5d745c053ef5fa16" dependencies = [ "async-trait", "bytes", "chrono", "futures", + "http", "humantime", - "itertools 0.13.0", + "itertools", "parking_lot", "percent-encoding", - "snafu", + "thiserror", "tokio", "tracing", "url", "walkdir", + "wasm-bindgen-futures", + "web-time", ] [[package]] name = "once_cell" -version = "1.20.2" +version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "ordered-float" @@ -2072,9 +2213,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.1.0" +version = "55.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" +checksum = "be7b2d778f6b841d37083ebdf32e33a524acde1266b5884a8ca29bf00dfa1231" dependencies = [ "ahash", "arrow-array", @@ -2091,7 +2232,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.15.1", + "hashbrown 0.15.3", "lz4_flex", "num", "num-bigint", @@ -2102,9 +2243,8 @@ dependencies = [ "snap", "thrift", "tokio", - "twox-hash", + "twox-hash 2.1.0", "zstd", - "zstd-sys", ] [[package]] @@ -2140,18 +2280,18 @@ dependencies = [ [[package]] name = "phf" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" dependencies = [ "phf_shared", ] [[package]] name = "phf_codegen" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" dependencies = [ "phf_generator", "phf_shared", @@ -2159,9 +2299,9 @@ dependencies = [ [[package]] name = "phf_generator" -version = "0.11.2" +version = "0.11.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ "phf_shared", "rand", @@ -2169,18 +2309,18 @@ dependencies = [ [[package]] name = "phf_shared" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" dependencies = [ "siphasher", ] [[package]] name = "pin-project-lite" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" [[package]] name = "pin-utils" @@ -2190,39 +2330,48 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "portable-atomic" -version = "1.9.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" +checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" + +[[package]] +name = "potential_utf" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +dependencies = [ + "zerovec", +] [[package]] name = "ppv-lite86" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ "zerocopy", ] [[package]] name = "proc-macro2" -version = "1.0.89" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" dependencies = [ "unicode-ident", ] [[package]] name = "prost" -version = "0.13.3" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = [ "bytes", "prost-derive", @@ -2230,31 +2379,31 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.3" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] name = "psm" -version = "0.1.24" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" 
+checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f" dependencies = [ "cc", ] [[package]] name = "pyo3" -version = "0.23.4" +version = "0.23.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57fe09249128b3173d092de9523eaa75136bf7ba85e0d69eca241c7939c933cc" +checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" dependencies = [ "cfg-if", "indoc", @@ -2270,9 +2419,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.23.4" +version = "0.23.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd3927b5a78757a0d71aa9dff669f903b1eb64b54142a9bd9f757f8fde65fd7" +checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" dependencies = [ "once_cell", "target-lexicon", @@ -2280,9 +2429,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.23.4" +version = "0.23.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dab6bb2102bd8f991e7749f130a70d05dd557613e39ed2deeee8e9ca0c4d548d" +checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" dependencies = [ "libc", "pyo3-build-config", @@ -2290,38 +2439,44 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.23.4" +version = "0.23.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91871864b353fd5ffcb3f91f2f703a22a9797c91b9ab497b1acac7b07ae509c7" +checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] name = "pyo3-macros-backend" -version = "0.23.4" +version = "0.23.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43abc3b80bc20f3facd86cd3c60beed58c3e2aa26213f3cda368de39c60a27e4" +checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" dependencies = [ "heck", "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] name = "quote" -version = "1.0.37" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + [[package]] name = "rand" version = "0.8.5" @@ -2349,7 +2504,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.16", ] [[package]] @@ -2369,16 +2524,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] name = "redox_syscall" -version = "0.5.7" +version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af" dependencies = [ - "bitflags 2.6.0", + "bitflags", ] [[package]] @@ -2395,9 +2550,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.8" +version = 
"0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -2436,28 +2591,28 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.40" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99e4ea3e1cdc4b559b8e5650f9c8e5998e3e5c1343b4eaf034565f32318d63c0" +checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" dependencies = [ - "bitflags 2.6.0", + "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys", ] [[package]] name = "rustversion" -version = "1.0.18" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" +checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" [[package]] name = "same-file" @@ -2476,41 +2631,41 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "semver" -version = "1.0.25" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" [[package]] name = "seq-macro" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" [[package]] name = "serde" -version = "1.0.214" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.214" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ "itoa", "memchr", @@ -2520,9 +2675,9 @@ dependencies = [ [[package]] name = "sha2" -version = "0.10.8" +version = "0.10.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", "cpufeatures", @@ -2543,9 +2698,9 @@ checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" [[package]] name 
= "siphasher" -version = "0.3.11" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "slab" @@ -2558,30 +2713,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.2" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" - -[[package]] -name = "snafu" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019" -dependencies = [ - "snafu-derive", -] - -[[package]] -name = "snafu-derive" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn 2.0.87", -] +checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" [[package]] name = "snap" @@ -2591,11 +2725,12 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "sqlparser" -version = "0.53.0" +version = "0.55.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" +checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11" dependencies = [ "log", + "recursive", "sqlparser_derive", ] @@ -2607,7 +2742,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] @@ -2618,15 +2753,15 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "stacker" -version = "0.1.17" +version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "799c883d55abdb5e98af1a7b3f23b9b6de8ecada0ecac058672d7635eb48ca7b" +checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b" dependencies = [ "cc", "cfg-if", "libc", "psm", - "windows-sys 0.59.0", + "windows-sys", ] [[package]] @@ -2635,25 +2770,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "strum" -version = "0.26.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" - -[[package]] -name = "strum_macros" -version = "0.26.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.87", -] - [[package]] name = "subtle" version = "2.6.1" @@ -2673,9 +2789,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.87" +version = "2.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" dependencies = [ "proc-macro2", "quote", @@ -2684,13 +2800,13 @@ dependencies = [ [[package]] name = "synstructure" -version = "0.13.1" +version = "0.13.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] @@ -2701,15 +2817,35 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.14.0" +version = "3.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" +checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" dependencies = [ - "cfg-if", "fastrand", + "getrandom 0.3.3", "once_cell", "rustix", - "windows-sys 0.59.0", + "windows-sys", +] + +[[package]] +name = "thiserror" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.101", ] [[package]] @@ -2734,9 +2870,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.7.6" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" dependencies = [ "displaydoc", "zerovec", @@ -2744,9 +2880,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.41.1" +version = "1.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" +checksum = "2513ca694ef9ede0fb23fe71a4ee4107cb102b9dc1930f6d0fd77aae068ae165" dependencies = [ "backtrace", "bytes", @@ -2756,20 +2892,20 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] name = "tokio-util" -version = "0.7.12" +version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" dependencies = [ "bytes", "futures-core", @@ -2780,9 +2916,9 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -2791,20 +2927,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = 
"395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", ] [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", ] @@ -2834,6 +2970,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "twox-hash" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908" + [[package]] name = "typed-arena" version = "2.0.2" @@ -2842,15 +2984,15 @@ checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" [[package]] name = "typenum" -version = "1.17.0" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "unicode-ident" -version = "1.0.13" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "unicode-segmentation" @@ -2860,15 +3002,15 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.1.14" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] name = "unindent" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" [[package]] name = "url" @@ -2881,12 +3023,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "utf16_iter" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" - [[package]] name = "utf8_iter" version = "1.0.4" @@ -2895,11 +3031,13 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.11.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" +checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" dependencies = [ - "getrandom", + "getrandom 0.3.3", + "js-sys", + "wasm-bindgen", ] [[package]] @@ -2924,37 +3062,59 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" -version = "0.2.95" +version = 
"0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", "once_cell", + "rustversion", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.95" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" -version = "0.2.95" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2962,22 +3122,35 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.95" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.95" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] [[package]] name = "web-time" @@ -3011,7 +3184,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.59.0", + "windows-sys", ] [[package]] @@ -3022,20 +3195,61 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-core" -version = "0.52.0" +version = "0.61.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +checksum = "46ec44dc15085cea82cf9c78f85a9114c463a369786585ad2882d1ff0b0acf40" dependencies = [ - "windows-targets", + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", ] [[package]] -name = "windows-sys" -version = "0.52.0" +name = "windows-implement" +version = "0.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" dependencies = [ - "windows-targets", + "proc-macro2", + "quote", + "syn 2.0.101", +] + +[[package]] +name = "windows-interface" +version = "0.59.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.101", +] + +[[package]] +name = "windows-link" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" + +[[package]] +name = "windows-result" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b895b5356fc36103d0f64dd1e94dfa7ac5633f1c9dd6e80fe9ec4adef69e09d" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a7ab927b2637c19b3dbe0965e75d8f2d30bdd697a1516191cad2ec4df8fb28a" +dependencies = [ + "windows-link", ] [[package]] @@ -3112,16 +3326,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] -name = "write16" -version = "1.0.0" +name = "wit-bindgen-rt" +version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags", +] [[package]] name = "writeable" -version = "0.5.5" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" [[package]] name = "xz2" @@ -3134,9 +3351,9 @@ dependencies = [ [[package]] name = "yoke" -version = "0.7.4" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" dependencies = [ "serde", "stable_deref_trait", @@ -3146,63 +3363,73 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.7.4" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", "synstructure", ] [[package]] name = "zerocopy" -version = "0.7.35" +version = "0.8.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" dependencies = [ - "byteorder", "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.35" +version = "0.8.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 
2.0.101", ] [[package]] name = "zerofrom" -version = "0.1.4" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.4" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", "synstructure", ] +[[package]] +name = "zerotrie" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + [[package]] name = "zerovec" -version = "0.10.4" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" dependencies = [ "yoke", "zerofrom", @@ -3211,38 +3438,44 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.10.3" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.101", ] +[[package]] +name = "zlib-rs" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "868b928d7949e09af2f6086dfc1e01936064cc7a819253bce650d4e2a2d63ba8" + [[package]] name = "zstd" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "7.2.1" +version = "7.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" dependencies = [ "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.13+zstd.1.5.6" +version = "2.0.15+zstd.1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" +checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" dependencies = [ "cc", "pkg-config", diff --git a/examples/ffi-table-provider/Cargo.toml b/examples/datafusion-ffi-example/Cargo.toml similarity index 83% rename from examples/ffi-table-provider/Cargo.toml rename to examples/datafusion-ffi-example/Cargo.toml index f4e4fda79..0e17567b9 100644 --- a/examples/ffi-table-provider/Cargo.toml +++ b/examples/datafusion-ffi-example/Cargo.toml @@ -21,16 +21,16 @@ version = "0.1.0" edition = "2021" [dependencies] -datafusion = { version = "45.0.0" } -datafusion-ffi = { version = "45.0.0" } +datafusion = { version = "47.0.0" } +datafusion-ffi = { version = "47.0.0" } pyo3 = { version = "0.23", features = 
["extension-module", "abi3", "abi3-py39"] } -arrow = { version = "54" } -arrow-array = { version = "54" } -arrow-schema = { version = "54" } +arrow = { version = "55.0.0" } +arrow-array = { version = "55.0.0" } +arrow-schema = { version = "55.0.0" } [build-dependencies] pyo3-build-config = "0.23" [lib] -name = "ffi_table_provider" +name = "datafusion_ffi_example" crate-type = ["cdylib", "rlib"] diff --git a/examples/ffi-table-provider/build.rs b/examples/datafusion-ffi-example/build.rs similarity index 100% rename from examples/ffi-table-provider/build.rs rename to examples/datafusion-ffi-example/build.rs diff --git a/examples/ffi-table-provider/pyproject.toml b/examples/datafusion-ffi-example/pyproject.toml similarity index 97% rename from examples/ffi-table-provider/pyproject.toml rename to examples/datafusion-ffi-example/pyproject.toml index 9cd25b423..0c54df95c 100644 --- a/examples/ffi-table-provider/pyproject.toml +++ b/examples/datafusion-ffi-example/pyproject.toml @@ -20,7 +20,7 @@ requires = ["maturin>=1.6,<2.0"] build-backend = "maturin" [project] -name = "ffi_table_provider" +name = "datafusion_ffi_example" requires-python = ">=3.9" classifiers = [ "Programming Language :: Rust", diff --git a/examples/datafusion-ffi-example/python/tests/_test_table_function.py b/examples/datafusion-ffi-example/python/tests/_test_table_function.py new file mode 100644 index 000000000..f3c56a90a --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_table_function.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pyarrow as pa
+from datafusion import Expr, SessionContext, udtf
+from datafusion_ffi_example import MyTableFunction, MyTableProvider
+
+if TYPE_CHECKING:
+    from datafusion.context import TableProviderExportable
+
+
+def test_ffi_table_function_register():
+    ctx = SessionContext()
+    table_func = MyTableFunction()
+    table_udtf = udtf(table_func, "my_table_func")
+    ctx.register_udtf(table_udtf)
+    result = ctx.sql("select * from my_table_func()").collect()
+
+    assert len(result) == 2
+    assert result[0].num_columns == 4
+    print(result)
+
+    result = [r.column(0) for r in result]
+    expected = [
+        pa.array([0, 1, 2], type=pa.int32()),
+        pa.array([3, 4, 5, 6], type=pa.int32()),
+    ]
+
+    assert result == expected
+
+
+def test_ffi_table_function_call_directly():
+    ctx = SessionContext()
+    table_func = MyTableFunction()
+    table_udtf = udtf(table_func, "my_table_func")
+
+    my_table = table_udtf()
+    ctx.register_table_provider("t", my_table)
+    result = ctx.table("t").collect()
+
+    assert len(result) == 2
+    assert result[0].num_columns == 4
+    print(result)
+
+    result = [r.column(0) for r in result]
+    expected = [
+        pa.array([0, 1, 2], type=pa.int32()),
+        pa.array([3, 4, 5, 6], type=pa.int32()),
+    ]
+
+    assert result == expected
+
+
+class PythonTableFunction:
+    """Python based table function.
+
+    This class is used as a Python implementation of a table function.
+    We use the existing MyTableProvider to create the underlying
+    provider; the callable takes three arguments specifying the number
+    of columns, rows, and batches to generate.
+    """
+
+    def __call__(
+        self, num_cols: Expr, num_rows: Expr, num_batches: Expr
+    ) -> TableProviderExportable:
+        args = [
+            num_cols.to_variant().value_i64(),
+            num_rows.to_variant().value_i64(),
+            num_batches.to_variant().value_i64(),
+        ]
+        return MyTableProvider(*args)
+
+
+def common_table_function_test(test_ctx: SessionContext) -> None:
+    result = test_ctx.sql("select * from my_table_func(3,2,4)").collect()
+
+    assert len(result) == 4
+    assert result[0].num_columns == 3
+    print(result)
+
+    result = [r.column(0) for r in result]
+    expected = [
+        pa.array([0, 1], type=pa.int32()),
+        pa.array([2, 3, 4], type=pa.int32()),
+        pa.array([4, 5, 6, 7], type=pa.int32()),
+        pa.array([6, 7, 8, 9, 10], type=pa.int32()),
+    ]
+
+    assert result == expected
+
+
+def test_python_table_function():
+    ctx = SessionContext()
+    table_func = PythonTableFunction()
+    table_udtf = udtf(table_func, "my_table_func")
+    ctx.register_udtf(table_udtf)
+
+    common_table_function_test(ctx)
+
+
+def test_python_table_function_decorator():
+    ctx = SessionContext()
+
+    @udtf("my_table_func")
+    def my_udtf(
+        num_cols: Expr, num_rows: Expr, num_batches: Expr
+    ) -> TableProviderExportable:
+        args = [
+            num_cols.to_variant().value_i64(),
+            num_rows.to_variant().value_i64(),
+            num_batches.to_variant().value_i64(),
+        ]
+        return MyTableProvider(*args)
+
+    ctx.register_udtf(my_udtf)
+
+    common_table_function_test(ctx)
diff --git a/examples/ffi-table-provider/python/tests/_test_table_provider.py b/examples/datafusion-ffi-example/python/tests/_test_table_provider.py
similarity index 94%
rename from examples/ffi-table-provider/python/tests/_test_table_provider.py
rename to examples/datafusion-ffi-example/python/tests/_test_table_provider.py
index 0db3ec561..6b24da06c 100644
--- a/examples/ffi-table-provider/python/tests/_test_table_provider.py
+++ b/examples/datafusion-ffi-example/python/tests/_test_table_provider.py
@@ -15,9 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from __future__ import annotations
+
 import pyarrow as pa
 from datafusion import SessionContext
-from ffi_table_provider import MyTableProvider
+from datafusion_ffi_example import MyTableProvider
 
 
 def test_table_loading():
diff --git a/examples/datafusion-ffi-example/src/lib.rs b/examples/datafusion-ffi-example/src/lib.rs
new file mode 100644
index 000000000..ae08c3b65
--- /dev/null
+++ b/examples/datafusion-ffi-example/src/lib.rs
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::table_function::MyTableFunction;
+use crate::table_provider::MyTableProvider;
+use pyo3::prelude::*;
+
+pub(crate) mod table_function;
+pub(crate) mod table_provider;
+
+#[pymodule]
+fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<MyTableProvider>()?;
+    m.add_class::<MyTableFunction>()?;
+    Ok(())
+}
diff --git a/examples/datafusion-ffi-example/src/table_function.rs b/examples/datafusion-ffi-example/src/table_function.rs
new file mode 100644
index 000000000..2d7b356e3
--- /dev/null
+++ b/examples/datafusion-ffi-example/src/table_function.rs
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
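+
+// This example demonstrates the two pieces a PyO3 class needs in order to act
+// as a DataFusion table function across the FFI boundary: a
+// `TableFunctionImpl` implementation on the Rust side, and a
+// `__datafusion_table_function__` method that hands Python an
+// `FFI_TableFunction` inside a PyCapsule named "datafusion_table_function",
+// the capsule name the Python-side `udtf` helper expects.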
+
+use crate::table_provider::MyTableProvider;
+use datafusion::catalog::{TableFunctionImpl, TableProvider};
+use datafusion::error::Result as DataFusionResult;
+use datafusion::prelude::Expr;
+use datafusion_ffi::udtf::FFI_TableFunction;
+use pyo3::types::PyCapsule;
+use pyo3::{pyclass, pymethods, Bound, PyResult, Python};
+use std::sync::Arc;
+
+#[pyclass(name = "MyTableFunction", module = "datafusion_ffi_example", subclass)]
+#[derive(Debug, Clone)]
+pub(crate) struct MyTableFunction {}
+
+#[pymethods]
+impl MyTableFunction {
+    #[new]
+    fn new() -> Self {
+        Self {}
+    }
+
+    fn __datafusion_table_function__<'py>(
+        &self,
+        py: Python<'py>,
+    ) -> PyResult<Bound<'py, PyCapsule>> {
+        let name = cr"datafusion_table_function".into();
+
+        let func = self.clone();
+        let provider = FFI_TableFunction::new(Arc::new(func), None);
+
+        PyCapsule::new(py, provider, Some(name))
+    }
+}
+
+impl TableFunctionImpl for MyTableFunction {
+    fn call(&self, _args: &[Expr]) -> DataFusionResult<Arc<dyn TableProvider>> {
+        let provider = MyTableProvider::new(4, 3, 2).create_table()?;
+        Ok(Arc::new(provider))
+    }
+}
diff --git a/examples/ffi-table-provider/src/lib.rs b/examples/datafusion-ffi-example/src/table_provider.rs
similarity index 71%
rename from examples/ffi-table-provider/src/lib.rs
rename to examples/datafusion-ffi-example/src/table_provider.rs
index 88deeece2..e884585b5 100644
--- a/examples/ffi-table-provider/src/lib.rs
+++ b/examples/datafusion-ffi-example/src/table_provider.rs
@@ -15,25 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{ffi::CString, sync::Arc};
-
-use arrow_array::ArrayRef;
-use datafusion::{
-    arrow::{
-        array::RecordBatch,
-        datatypes::{DataType, Field, Schema},
-    },
-    datasource::MemTable,
-    error::{DataFusionError, Result},
-};
+use arrow_array::{ArrayRef, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+use datafusion::catalog::MemTable;
+use datafusion::error::{DataFusionError, Result as DataFusionResult};
 use datafusion_ffi::table_provider::FFI_TableProvider;
-use pyo3::{exceptions::PyRuntimeError, prelude::*, types::PyCapsule};
+use pyo3::exceptions::PyRuntimeError;
+use pyo3::types::PyCapsule;
+use pyo3::{pyclass, pymethods, Bound, PyResult, Python};
+use std::sync::Arc;
 
 /// In order to provide a test that demonstrates different sized record batches,
 /// the first batch will have num_rows, the second batch num_rows+1, and so on.
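+/// For example, `MyTableProvider::new(3, 2, 4)` produces four batches with
+/// 2, 3, 4, and 5 rows; `_test_table_function.py` asserts exactly these
+/// sizes when it runs `my_table_func(3,2,4)`.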
-#[pyclass(name = "MyTableProvider", module = "ffi_table_provider", subclass)]
+#[pyclass(name = "MyTableProvider", module = "datafusion_ffi_example", subclass)]
 #[derive(Clone)]
-struct MyTableProvider {
+pub(crate) struct MyTableProvider {
     num_cols: usize,
     num_rows: usize,
     num_batches: usize,
@@ -44,21 +40,19 @@ fn create_record_batch(
     num_cols: usize,
     start_value: i32,
     num_values: usize,
-) -> Result<RecordBatch> {
+) -> DataFusionResult<RecordBatch> {
     let end_value = start_value + num_values as i32;
     let row_values: Vec<i32> = (start_value..end_value).collect();
 
     let columns: Vec<_> = (0..num_cols)
-        .map(|_| {
-            std::sync::Arc::new(arrow::array::Int32Array::from(row_values.clone())) as ArrayRef
-        })
+        .map(|_| Arc::new(arrow::array::Int32Array::from(row_values.clone())) as ArrayRef)
         .collect();
 
     RecordBatch::try_new(Arc::clone(schema), columns).map_err(DataFusionError::from)
 }
 
 impl MyTableProvider {
-    fn create_table(&self) -> Result<MemTable> {
+    pub fn create_table(&self) -> DataFusionResult<MemTable> {
         let fields: Vec<_> = (0..self.num_cols)
             .map(|idx| (b'A' + idx as u8) as char)
             .map(|col_name| Field::new(col_name, DataType::Int32, true))
@@ -66,7 +60,7 @@ impl MyTableProvider {
 
         let schema = Arc::new(Schema::new(fields));
 
-        let batches: Result<Vec<RecordBatch>> = (0..self.num_batches)
+        let batches: DataFusionResult<Vec<RecordBatch>> = (0..self.num_batches)
             .map(|batch_idx| {
                 let start_value = batch_idx * self.num_rows;
                 create_record_batch(
@@ -85,7 +79,7 @@ impl MyTableProvider {
 #[pymethods]
 impl MyTableProvider {
     #[new]
-    fn new(num_cols: usize, num_rows: usize, num_batches: usize) -> Self {
+    pub fn new(num_cols: usize, num_rows: usize, num_batches: usize) -> Self {
         Self {
             num_cols,
             num_rows,
@@ -93,23 +87,17 @@ impl MyTableProvider {
         }
     }
 
-    fn __datafusion_table_provider__<'py>(
+    pub fn __datafusion_table_provider__<'py>(
         &self,
         py: Python<'py>,
     ) -> PyResult<Bound<'py, PyCapsule>> {
-        let name = CString::new("datafusion_table_provider").unwrap();
+        let name = cr"datafusion_table_provider".into();
 
         let provider = self
             .create_table()
             .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
         let provider = FFI_TableProvider::new(Arc::new(provider), false, None);
 
-        PyCapsule::new_bound(py, provider, Some(name.clone()))
+        PyCapsule::new(py, provider, Some(name))
     }
 }
-
-#[pymodule]
-fn ffi_table_provider(m: &Bound<'_, PyModule>) -> PyResult<()> {
-    m.add_class::<MyTableProvider>()?;
-    Ok(())
-}
diff --git a/examples/python-udwf.py b/examples/python-udwf.py
index 98d118bf2..645ded188 100644
--- a/examples/python-udwf.py
+++ b/examples/python-udwf.py
@@ -22,7 +22,7 @@
 from datafusion import col, lit, udwf
 from datafusion import functions as f
 from datafusion.expr import WindowFrame
-from datafusion.udf import WindowEvaluator
+from datafusion.user_defined import WindowEvaluator
 
 # This example creates five different examples of user defined window functions in order
 # to demonstrate the variety of ways a user may need to implement.
diff --git a/pyproject.toml b/pyproject.toml
index d86b657ec..728cedb2d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,6 +63,10 @@ exclude = [".github/**", "ci/**", ".asf.yaml"]
 locked = true
 features = ["substrait"]
 
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+asyncio_default_fixture_loop_scope = "function"
+
 # Enable docstring linting using the google style guide
 [tool.ruff.lint]
 select = ["ALL" ]
diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py
index fd7cd000a..16d65f685 100644
--- a/python/datafusion/__init__.py
+++ b/python/datafusion/__init__.py
@@ -21,6 +21,10 @@
 
 See https://datafusion.apache.org/python for more information.
""" +from __future__ import annotations + +from typing import Any + try: import importlib.metadata as importlib_metadata except ImportError: @@ -51,7 +55,17 @@ from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream -from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF, udaf, udf, udwf +from .user_defined import ( + Accumulator, + AggregateUDF, + ScalarUDF, + TableFunction, + WindowUDF, + udaf, + udf, + udtf, + udwf, +) __version__ = importlib_metadata.version(__name__) @@ -76,6 +90,7 @@ "SessionConfig", "SessionContext", "Table", + "TableFunction", "WindowFrame", "WindowUDF", "col", @@ -94,6 +109,7 @@ "substrait", "udaf", "udf", + "udtf", "udwf", "unparser", ] @@ -120,3 +136,18 @@ def str_lit(value): def lit(value) -> Expr: """Create a literal expression.""" return Expr.literal(value) + + +def literal_with_metadata(value: Any, metadata: dict[str, str]) -> Expr: + """Creates a new expression representing a scalar value with metadata. + + Args: + value: A valid PyArrow scalar value or easily castable to one. + metadata: Metadata to attach to the expression. + """ + return Expr.literal_with_metadata(value, metadata) + + +def lit_with_metadata(value: Any, metadata: dict[str, str]) -> Expr: + """Alias for literal_with_metadata.""" + return literal_with_metadata(value, metadata) diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index 6c3f188cc..67ab3ead2 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -34,6 +34,10 @@ def __init__(self, catalog: df_internal.Catalog) -> None: """This constructor is not typically called by the end user.""" self.catalog = catalog + def __repr__(self) -> str: + """Print a string representation of the catalog.""" + return self.catalog.__repr__() + def names(self) -> list[str]: """Returns the list of databases in this catalog.""" return self.catalog.names() @@ -50,6 +54,10 @@ def __init__(self, db: df_internal.Database) -> None: """This constructor is not typically called by the end user.""" self.db = db + def __repr__(self) -> str: + """Print a string representation of the database.""" + return self.db.__repr__() + def names(self) -> set[str]: """Returns the list of all tables in this database.""" return self.db.names() @@ -66,6 +74,10 @@ def __init__(self, table: df_internal.Table) -> None: """This constructor is not typically called by the end user.""" self.table = table + def __repr__(self) -> str: + """Print a string representation of the table.""" + return self.table.__repr__() + @property def schema(self) -> pa.Schema: """Returns the schema associated with this table.""" diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 1429a4975..5b99b0d26 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -19,8 +19,11 @@ from __future__ import annotations +import warnings from typing import TYPE_CHECKING, Any, Protocol +import pyarrow as pa + try: from warnings import deprecated # Python 3.13+ except ImportError: @@ -30,7 +33,7 @@ from datafusion.dataframe import DataFrame from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list from datafusion.record_batch import RecordBatchStream -from datafusion.udf import AggregateUDF, ScalarUDF, WindowUDF +from datafusion.user_defined import AggregateUDF, ScalarUDF, TableFunction, WindowUDF from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal from ._internal import SessionConfig 
as SessionConfigInternal @@ -42,7 +45,6 @@ import pandas as pd import polars as pl - import pyarrow as pa from datafusion.plan import ExecutionPlan, LogicalPlan @@ -496,6 +498,10 @@ def __init__( self.ctx = SessionContextInternal(config, runtime) + def __repr__(self) -> str: + """Print a string representation of the Session Context.""" + return self.ctx.__repr__() + @classmethod def global_ctx(cls) -> SessionContext: """Retrieve the global context as a `SessionContext` wrapper. @@ -535,7 +541,7 @@ def register_listing_table( self, name: str, path: str | pathlib.Path, - table_partition_cols: list[tuple[str, str]] | None = None, + table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None, file_extension: str = ".parquet", schema: pa.Schema | None = None, file_sort_order: list[list[Expr | SortExpr]] | None = None, @@ -556,6 +562,7 @@ def register_listing_table( """ if table_partition_cols is None: table_partition_cols = [] + table_partition_cols = self._convert_table_partition_cols(table_partition_cols) file_sort_order_raw = ( [sort_list_to_raw_sort_list(f) for f in file_sort_order] if file_sort_order is not None @@ -752,6 +759,10 @@ def register_table_provider( """ self.ctx.register_table_provider(name, provider) + def register_udtf(self, func: TableFunction) -> None: + """Register a user defined table function.""" + self.ctx.register_udtf(func._udtf) + def register_record_batches( self, name: str, partitions: list[list[pa.RecordBatch]] ) -> None: @@ -770,7 +781,7 @@ def register_parquet( self, name: str, path: str | pathlib.Path, - table_partition_cols: list[tuple[str, str]] | None = None, + table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None, parquet_pruning: bool = True, file_extension: str = ".parquet", skip_metadata: bool = True, @@ -798,6 +809,7 @@ def register_parquet( """ if table_partition_cols is None: table_partition_cols = [] + table_partition_cols = self._convert_table_partition_cols(table_partition_cols) self.ctx.register_parquet( name, str(path), @@ -861,7 +873,7 @@ def register_json( schema: pa.Schema | None = None, schema_infer_max_records: int = 1000, file_extension: str = ".json", - table_partition_cols: list[tuple[str, str]] | None = None, + table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None, file_compression_type: str | None = None, ) -> None: """Register a JSON file as a table. @@ -882,6 +894,7 @@ def register_json( """ if table_partition_cols is None: table_partition_cols = [] + table_partition_cols = self._convert_table_partition_cols(table_partition_cols) self.ctx.register_json( name, str(path), @@ -898,7 +911,7 @@ def register_avro( path: str | pathlib.Path, schema: pa.Schema | None = None, file_extension: str = ".avro", - table_partition_cols: list[tuple[str, str]] | None = None, + table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None, ) -> None: """Register an Avro file as a table. 
@@ -914,6 +927,7 @@ def register_avro(
         """
         if table_partition_cols is None:
             table_partition_cols = []
+        table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
         self.ctx.register_avro(
             name, str(path), schema, file_extension, table_partition_cols
         )
@@ -973,7 +987,7 @@ def read_json(
         schema: pa.Schema | None = None,
         schema_infer_max_records: int = 1000,
         file_extension: str = ".json",
-        table_partition_cols: list[tuple[str, str]] | None = None,
+        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
         file_compression_type: str | None = None,
     ) -> DataFrame:
         """Read a line-delimited JSON data source.
@@ -993,6 +1007,7 @@ def read_json(
         """
         if table_partition_cols is None:
             table_partition_cols = []
+        table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
         return DataFrame(
             self.ctx.read_json(
                 str(path),
@@ -1012,7 +1027,7 @@ def read_csv(
         delimiter: str = ",",
         schema_infer_max_records: int = 1000,
         file_extension: str = ".csv",
-        table_partition_cols: list[tuple[str, str]] | None = None,
+        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
         file_compression_type: str | None = None,
     ) -> DataFrame:
         """Read a CSV data source.
@@ -1037,6 +1052,7 @@ def read_csv(
         """
         if table_partition_cols is None:
             table_partition_cols = []
+        table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
 
         path = [str(p) for p in path] if isinstance(path, list) else str(path)
 
@@ -1056,7 +1072,7 @@ def read_csv(
     def read_parquet(
         self,
         path: str | pathlib.Path,
-        table_partition_cols: list[tuple[str, str]] | None = None,
+        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
         parquet_pruning: bool = True,
         file_extension: str = ".parquet",
         skip_metadata: bool = True,
@@ -1085,6 +1101,7 @@ def read_parquet(
         """
         if table_partition_cols is None:
             table_partition_cols = []
+        table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
         file_sort_order = (
             [sort_list_to_raw_sort_list(f) for f in file_sort_order]
             if file_sort_order is not None
@@ -1106,7 +1123,7 @@ def read_avro(
         self,
         path: str | pathlib.Path,
         schema: pa.Schema | None = None,
-        file_partition_cols: list[tuple[str, str]] | None = None,
+        file_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
         file_extension: str = ".avro",
     ) -> DataFrame:
         """Create a :py:class:`DataFrame` for reading Avro data source.
@@ -1122,6 +1139,7 @@ def read_avro(
         """
         if file_partition_cols is None:
            file_partition_cols = []
+        file_partition_cols = self._convert_table_partition_cols(file_partition_cols)
         return DataFrame(
             self.ctx.read_avro(str(path), schema, file_partition_cols, file_extension)
         )
@@ -1138,3 +1156,41 @@ def read_table(self, table: Table) -> DataFrame:
     def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream:
         """Execute the ``plan`` and return the results."""
         return RecordBatchStream(self.ctx.execute(plan._raw_plan, partitions))
+
+    @staticmethod
+    def _convert_table_partition_cols(
+        table_partition_cols: list[tuple[str, str | pa.DataType]],
+    ) -> list[tuple[str, pa.DataType]]:
+        warn = False
+        converted_table_partition_cols = []
+
+        for col, data_type in table_partition_cols:
+            if isinstance(data_type, str):
+                warn = True
+                if data_type == "string":
+                    converted_data_type = pa.string()
+                elif data_type == "int":
+                    converted_data_type = pa.int32()
+                else:
+                    message = (
+                        f"Unsupported literal data type '{data_type}' for partition "
+                        "column. Supported types are 'string' and 'int'"
+                    )
+                    raise ValueError(message)
+            else:
+                converted_data_type = data_type
+
+            converted_table_partition_cols.append((col, converted_data_type))
+
+        if warn:
+            message = (
+                "using literals for table_partition_cols data types is deprecated, "
+                "use pyarrow types instead"
+            )
+            warnings.warn(
+                message,
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+
+        return converted_table_partition_cols
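A quick illustration of the widened ``table_partition_cols`` signature (a sketch; the path and column names are invented)::

    import pyarrow as pa
    from datafusion import SessionContext

    ctx = SessionContext()
    # preferred: pass pyarrow data types for partition columns
    df = ctx.read_parquet(
        "data/",
        table_partition_cols=[("year", pa.int32()), ("country", pa.string())],
    )
    # still accepted, but now emits a DeprecationWarning
    df = ctx.read_parquet("data/", table_partition_cols=[("year", "int")])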
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index 014331541..769271c7e 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -37,6 +37,10 @@
 except ImportError:
     from typing_extensions import deprecated  # Python 3.12
 
+from datafusion._internal import DataFrame as DataFrameInternal
+from datafusion._internal import ParquetColumnOptions as ParquetColumnOptionsInternal
+from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal
+from datafusion.expr import Expr, SortExpr, sort_or_default
 from datafusion.plan import ExecutionPlan, LogicalPlan
 from datafusion.record_batch import RecordBatchStream
 
@@ -53,10 +57,6 @@
 
 from enum import Enum
 
-from datafusion._internal import ParquetColumnOptions as ParquetColumnOptionsInternal
-from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal
-from datafusion.expr import Expr, SortExpr, sort_or_default
-
 # excerpt from deltalake
 # https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163
 
@@ -1090,3 +1090,25 @@ def within_limit(df: DataFrame, limit: int) -> DataFrame:
             DataFrame: After applying func to the original dataframe.
         """
         return func(self, *args)
+
+    def fill_null(self, value: Any, subset: list[str] | None = None) -> DataFrame:
+        """Fill null values in specified columns with a value.
+
+        Args:
+            value: Value to replace nulls with. Will be cast to match column type.
+            subset: Optional list of column names to fill. If None, fills all columns.
+
+        Returns:
+            DataFrame with null values replaced where type casting is possible.
+
+        Examples:
+            >>> df = df.fill_null(0)  # Fill all nulls with 0 where possible
+            >>> # Fill nulls in specific string columns
+            >>> df = df.fill_null("missing", subset=["name", "category"])
+
+        Notes:
+            - Only fills nulls in columns where the value can be cast to the column type
+            - For columns where casting fails, the original column is kept unchanged
+            - For columns not in subset, the original column is kept unchanged
+        """
+        return DataFrame(self.df.fill_null(value, subset))
diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py
index 9e58873d0..e785cab06 100644
--- a/python/datafusion/expr.py
+++ b/python/datafusion/expr.py
@@ -435,6 +435,20 @@ def literal(value: Any) -> Expr:
             value = pa.scalar(value)
         return Expr(expr_internal.RawExpr.literal(value))
 
+    @staticmethod
+    def literal_with_metadata(value: Any, metadata: dict[str, str]) -> Expr:
+        """Creates a new expression representing a scalar value with metadata.
+
+        Args:
+            value: A valid PyArrow scalar value or easily castable to one.
+            metadata: Metadata to attach to the expression.
+        """
+        if isinstance(value, str):
+            value = pa.scalar(value, type=pa.string_view())
+        value = value if isinstance(value, pa.Scalar) else pa.scalar(value)
+
+        return Expr(expr_internal.RawExpr.literal_with_metadata(value, metadata))
+
     @staticmethod
     def string_literal(value: str) -> Expr:
         """Creates a new expression representing a UTF8 literal value.
@@ -1172,6 +1186,10 @@ def __init__(
             end_bound = end_bound.cast(pa.uint64())
         self.window_frame = expr_internal.WindowFrame(units, start_bound, end_bound)
 
+    def __repr__(self) -> str:
+        """Print a string representation of the window frame."""
+        return self.window_frame.__repr__()
+
     def get_frame_units(self) -> str:
         """Returns the window frame units for the bounds."""
         return self.window_frame.get_frame_units()
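An illustrative sketch combining the new metadata literals and ``fill_null`` (``from_pydict``, ``select``, and ``alias`` are existing APIs; the metadata keys are invented for the example)::

    from datafusion import SessionContext, lit_with_metadata

    ctx = SessionContext()
    df = ctx.from_pydict({"a": [1, None, 3]})
    # attach key/value metadata to a literal column, then fill the nulls in "a"
    df = df.select(
        lit_with_metadata("n/a", {"origin": "example"}).alias("tag"),
        "a",
    ).fill_null(0)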
diff --git a/python/datafusion/io.py b/python/datafusion/io.py
index ef5ebf96f..551e20a6f 100644
--- a/python/datafusion/io.py
+++ b/python/datafusion/io.py
@@ -34,7 +34,7 @@
 
 def read_parquet(
     path: str | pathlib.Path,
-    table_partition_cols: list[tuple[str, str]] | None = None,
+    table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
     parquet_pruning: bool = True,
     file_extension: str = ".parquet",
     skip_metadata: bool = True,
@@ -83,7 +83,7 @@ def read_json(
     schema: pa.Schema | None = None,
     schema_infer_max_records: int = 1000,
     file_extension: str = ".json",
-    table_partition_cols: list[tuple[str, str]] | None = None,
+    table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
     file_compression_type: str | None = None,
 ) -> DataFrame:
     """Read a line-delimited JSON data source.
@@ -124,7 +124,7 @@ def read_csv(
     delimiter: str = ",",
     schema_infer_max_records: int = 1000,
     file_extension: str = ".csv",
-    table_partition_cols: list[tuple[str, str]] | None = None,
+    table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
     file_compression_type: str | None = None,
 ) -> DataFrame:
     """Read a CSV data source.
@@ -171,7 +171,7 @@ def read_csv(
 def read_avro(
     path: str | pathlib.Path,
     schema: pa.Schema | None = None,
-    file_partition_cols: list[tuple[str, str]] | None = None,
+    file_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
     file_extension: str = ".avro",
 ) -> DataFrame:
     """Create a :py:class:`DataFrame` for reading Avro data source.
diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py
index e93a34ca5..c7265fa09 100644
--- a/python/datafusion/udf.py
+++ b/python/datafusion/udf.py
@@ -15,753 +15,15 @@
 # specific language governing permissions and limitations
 # under the License.
 
-"""Provides the user-defined functions for evaluation of dataframes."""
+"""Deprecated module for user defined functions."""
 
-from __future__ import annotations
+import warnings
 
-import functools
-from abc import ABCMeta, abstractmethod
-from enum import Enum
-from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, overload
+from datafusion.user_defined import *  # noqa: F403
 
-import pyarrow as pa
-
-import datafusion._internal as df_internal
-from datafusion.expr import Expr
-
-if TYPE_CHECKING:
-    _R = TypeVar("_R", bound=pa.DataType)
-
-
-class Volatility(Enum):
-    """Defines how stable or volatile a function is.
-
-    When setting the volatility of a function, you can either pass this
-    enumeration or a ``str``. The ``str`` equivalent is the lower case value of the
-    name (`"immutable"`, `"stable"`, or `"volatile"`).
-    """
-
-    Immutable = 1
-    """An immutable function will always return the same output when given the
-    same input.
-
-    DataFusion will attempt to inline immutable functions during planning.
-    """
-
-    Stable = 2
-    """
-    Returns the same value for a given input within a single queries.
-
-    A stable function may return different values given the same input across
-    different queries but must return the same value for a given input within a
-    query. An example of this is the ``Now`` function.
DataFusion will attempt to - inline ``Stable`` functions during planning, when possible. For query - ``select col1, now() from t1``, it might take a while to execute but ``now()`` - column will be the same for each output row, which is evaluated during - planning. - """ - - Volatile = 3 - """A volatile function may change the return value from evaluation to - evaluation. - - Multiple invocations of a volatile function may return different results - when used in the same query. An example of this is the random() function. - DataFusion can not evaluate such functions during planning. In the query - ``select col1, random() from t1``, ``random()`` function will be evaluated - for each output row, resulting in a unique random value for each row. - """ - - def __str__(self) -> str: - """Returns the string equivalent.""" - return self.name.lower() - - -class ScalarUDF: - """Class for performing scalar user-defined functions (UDF). - - Scalar UDFs operate on a row by row basis. See also :py:class:`AggregateUDF` for - operating on a group of rows. - """ - - def __init__( - self, - name: str, - func: Callable[..., _R], - input_types: pa.DataType | list[pa.DataType], - return_type: _R, - volatility: Volatility | str, - ) -> None: - """Instantiate a scalar user-defined function (UDF). - - See helper method :py:func:`udf` for argument details. - """ - if isinstance(input_types, pa.DataType): - input_types = [input_types] - self._udf = df_internal.ScalarUDF( - name, func, input_types, return_type, str(volatility) - ) - - def __call__(self, *args: Expr) -> Expr: - """Execute the UDF. - - This function is not typically called by an end user. These calls will - occur during the evaluation of the dataframe. - """ - args_raw = [arg.expr for arg in args] - return Expr(self._udf.__call__(*args_raw)) - - @overload - @staticmethod - def udf( - input_types: list[pa.DataType], - return_type: _R, - volatility: Volatility | str, - name: Optional[str] = None, - ) -> Callable[..., ScalarUDF]: ... - - @overload - @staticmethod - def udf( - func: Callable[..., _R], - input_types: list[pa.DataType], - return_type: _R, - volatility: Volatility | str, - name: Optional[str] = None, - ) -> ScalarUDF: ... - - @staticmethod - def udf(*args: Any, **kwargs: Any): # noqa: D417 - """Create a new User-Defined Function (UDF). - - This class can be used both as a **function** and as a **decorator**. - - Usage: - - **As a function**: Call `udf(func, input_types, return_type, volatility, - name)`. - - **As a decorator**: Use `@udf(input_types, return_type, volatility, - name)`. In this case, do **not** pass `func` explicitly. - - Args: - func (Callable, optional): **Only needed when calling as a function.** - Skip this argument when using `udf` as a decorator. - input_types (list[pa.DataType]): The data types of the arguments - to `func`. This list must be of the same length as the number of - arguments. - return_type (_R): The data type of the return value from the function. - volatility (Volatility | str): See `Volatility` for allowed values. - name (Optional[str]): A descriptive name for the function. - - Returns: - A user-defined function that can be used in SQL expressions, - data aggregation, or window function calls. 
- - Example: - **Using `udf` as a function:** - ``` - def double_func(x): - return x * 2 - double_udf = udf(double_func, [pa.int32()], pa.int32(), - "volatile", "double_it") - ``` - - **Using `udf` as a decorator:** - ``` - @udf([pa.int32()], pa.int32(), "volatile", "double_it") - def double_udf(x): - return x * 2 - ``` - """ - - def _function( - func: Callable[..., _R], - input_types: list[pa.DataType], - return_type: _R, - volatility: Volatility | str, - name: Optional[str] = None, - ) -> ScalarUDF: - if not callable(func): - msg = "`func` argument must be callable" - raise TypeError(msg) - if name is None: - if hasattr(func, "__qualname__"): - name = func.__qualname__.lower() - else: - name = func.__class__.__name__.lower() - return ScalarUDF( - name=name, - func=func, - input_types=input_types, - return_type=return_type, - volatility=volatility, - ) - - def _decorator( - input_types: list[pa.DataType], - return_type: _R, - volatility: Volatility | str, - name: Optional[str] = None, - ) -> Callable: - def decorator(func: Callable): - udf_caller = ScalarUDF.udf( - func, input_types, return_type, volatility, name - ) - - @functools.wraps(func) - def wrapper(*args: Any, **kwargs: Any): - return udf_caller(*args, **kwargs) - - return wrapper - - return decorator - - if args and callable(args[0]): - # Case 1: Used as a function, require the first parameter to be callable - return _function(*args, **kwargs) - # Case 2: Used as a decorator with parameters - return _decorator(*args, **kwargs) - - -class Accumulator(metaclass=ABCMeta): - """Defines how an :py:class:`AggregateUDF` accumulates values.""" - - @abstractmethod - def state(self) -> list[pa.Scalar]: - """Return the current state.""" - - @abstractmethod - def update(self, *values: pa.Array) -> None: - """Evaluate an array of values and update state.""" - - @abstractmethod - def merge(self, states: list[pa.Array]) -> None: - """Merge a set of states.""" - - @abstractmethod - def evaluate(self) -> pa.Scalar: - """Return the resultant value.""" - - -class AggregateUDF: - """Class for performing scalar user-defined functions (UDF). - - Aggregate UDFs operate on a group of rows and return a single value. See - also :py:class:`ScalarUDF` for operating on a row by row basis. - """ - - def __init__( - self, - name: str, - accumulator: Callable[[], Accumulator], - input_types: list[pa.DataType], - return_type: pa.DataType, - state_type: list[pa.DataType], - volatility: Volatility | str, - ) -> None: - """Instantiate a user-defined aggregate function (UDAF). - - See :py:func:`udaf` for a convenience function and argument - descriptions. - """ - self._udaf = df_internal.AggregateUDF( - name, - accumulator, - input_types, - return_type, - state_type, - str(volatility), - ) - - def __call__(self, *args: Expr) -> Expr: - """Execute the UDAF. - - This function is not typically called by an end user. These calls will - occur during the evaluation of the dataframe. - """ - args_raw = [arg.expr for arg in args] - return Expr(self._udaf.__call__(*args_raw)) - - @overload - @staticmethod - def udaf( - input_types: pa.DataType | list[pa.DataType], - return_type: pa.DataType, - state_type: list[pa.DataType], - volatility: Volatility | str, - name: Optional[str] = None, - ) -> Callable[..., AggregateUDF]: ... 
- - @overload - @staticmethod - def udaf( - accum: Callable[[], Accumulator], - input_types: pa.DataType | list[pa.DataType], - return_type: pa.DataType, - state_type: list[pa.DataType], - volatility: Volatility | str, - name: Optional[str] = None, - ) -> AggregateUDF: ... - - @staticmethod - def udaf(*args: Any, **kwargs: Any): # noqa: D417 - """Create a new User-Defined Aggregate Function (UDAF). - - This class allows you to define an **aggregate function** that can be used in - data aggregation or window function calls. - - Usage: - - **As a function**: Call `udaf(accum, input_types, return_type, state_type, - volatility, name)`. - - **As a decorator**: Use `@udaf(input_types, return_type, state_type, - volatility, name)`. - When using `udaf` as a decorator, **do not pass `accum` explicitly**. - - **Function example:** - - If your `:py:class:Accumulator` can be instantiated with no arguments, you - can simply pass it's type as `accum`. If you need to pass additional - arguments to it's constructor, you can define a lambda or a factory method. - During runtime the `:py:class:Accumulator` will be constructed for every - instance in which this UDAF is used. The following examples are all valid. - ``` - import pyarrow as pa - import pyarrow.compute as pc - - class Summarize(Accumulator): - def __init__(self, bias: float = 0.0): - self._sum = pa.scalar(bias) - - def state(self) -> list[pa.Scalar]: - return [self._sum] - - def update(self, values: pa.Array) -> None: - self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py()) - - def merge(self, states: list[pa.Array]) -> None: - self._sum = pa.scalar(self._sum.as_py() + pc.sum(states[0]).as_py()) - - def evaluate(self) -> pa.Scalar: - return self._sum - - def sum_bias_10() -> Summarize: - return Summarize(10.0) - - udaf1 = udaf(Summarize, pa.float64(), pa.float64(), [pa.float64()], - "immutable") - udaf2 = udaf(sum_bias_10, pa.float64(), pa.float64(), [pa.float64()], - "immutable") - udaf3 = udaf(lambda: Summarize(20.0), pa.float64(), pa.float64(), - [pa.float64()], "immutable") - ``` - - **Decorator example:** - ``` - @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable") - def udf4() -> Summarize: - return Summarize(10.0) - ``` - - Args: - accum: The accumulator python function. **Only needed when calling as a - function. Skip this argument when using `udaf` as a decorator.** - input_types: The data types of the arguments to ``accum``. - return_type: The data type of the return value. - state_type: The data types of the intermediate accumulation. - volatility: See :py:class:`Volatility` for allowed values. - name: A descriptive name for the function. - - Returns: - A user-defined aggregate function, which can be used in either data - aggregation or window function calls. - """ - - def _function( - accum: Callable[[], Accumulator], - input_types: pa.DataType | list[pa.DataType], - return_type: pa.DataType, - state_type: list[pa.DataType], - volatility: Volatility | str, - name: Optional[str] = None, - ) -> AggregateUDF: - if not callable(accum): - msg = "`func` must be callable." 
- raise TypeError(msg) - if not isinstance(accum(), Accumulator): - msg = "Accumulator must implement the abstract base class Accumulator" - raise TypeError(msg) - if name is None: - name = accum().__class__.__qualname__.lower() - if isinstance(input_types, pa.DataType): - input_types = [input_types] - return AggregateUDF( - name=name, - accumulator=accum, - input_types=input_types, - return_type=return_type, - state_type=state_type, - volatility=volatility, - ) - - def _decorator( - input_types: pa.DataType | list[pa.DataType], - return_type: pa.DataType, - state_type: list[pa.DataType], - volatility: Volatility | str, - name: Optional[str] = None, - ) -> Callable[..., Callable[..., Expr]]: - def decorator(accum: Callable[[], Accumulator]) -> Callable[..., Expr]: - udaf_caller = AggregateUDF.udaf( - accum, input_types, return_type, state_type, volatility, name - ) - - @functools.wraps(accum) - def wrapper(*args: Any, **kwargs: Any) -> Expr: - return udaf_caller(*args, **kwargs) - - return wrapper - - return decorator - - if args and callable(args[0]): - # Case 1: Used as a function, require the first parameter to be callable - return _function(*args, **kwargs) - # Case 2: Used as a decorator with parameters - return _decorator(*args, **kwargs) - - -class WindowEvaluator: - """Evaluator class for user-defined window functions (UDWF). - - It is up to the user to decide which evaluate function is appropriate. - - +------------------------+--------------------------------+------------------+---------------------------+ - | ``uses_window_frame`` | ``supports_bounded_execution`` | ``include_rank`` | function_to_implement | - +========================+================================+==================+===========================+ - | False (default) | False (default) | False (default) | ``evaluate_all`` | - +------------------------+--------------------------------+------------------+---------------------------+ - | False | True | False | ``evaluate`` | - +------------------------+--------------------------------+------------------+---------------------------+ - | False | True/False | True | ``evaluate_all_with_rank``| - +------------------------+--------------------------------+------------------+---------------------------+ - | True | True/False | True/False | ``evaluate`` | - +------------------------+--------------------------------+------------------+---------------------------+ - """ # noqa: W505, E501 - - def memoize(self) -> None: - """Perform a memoize operation to improve performance. - - When the window frame has a fixed beginning (e.g UNBOUNDED - PRECEDING), some functions such as FIRST_VALUE and - NTH_VALUE do not need the (unbounded) input once they have - seen a certain amount of input. - - `memoize` is called after each input batch is processed, and - such functions can save whatever they need - """ - - def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: # noqa: ARG002 - """Return the range for the window fuction. - - If `uses_window_frame` flag is `false`. This method is used to - calculate required range for the window function during - stateful execution. - - Generally there is no required range, hence by default this - returns smallest range(current row). e.g seeing current row is - enough to calculate window result (such as row_number, rank, - etc) - - Args: - idx:: Current index - num_rows: Number of rows. 
- """ - return (idx, idx + 1) - - def is_causal(self) -> bool: - """Get whether evaluator needs future data for its result.""" - return False - - def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: - """Evaluate a window function on an entire input partition. - - This function is called once per input *partition* for window functions that - *do not use* values from the window frame, such as - :py:func:`~datafusion.functions.row_number`, - :py:func:`~datafusion.functions.rank`, - :py:func:`~datafusion.functions.dense_rank`, - :py:func:`~datafusion.functions.percent_rank`, - :py:func:`~datafusion.functions.cume_dist`, - :py:func:`~datafusion.functions.lead`, - and :py:func:`~datafusion.functions.lag`. - - It produces the result of all rows in a single pass. It - expects to receive the entire partition as the ``value`` and - must produce an output column with one output row for every - input row. - - ``num_rows`` is required to correctly compute the output in case - ``len(values) == 0`` - - Implementing this function is an optimization. Certain window - functions are not affected by the window frame definition or - the query doesn't have a frame, and ``evaluate`` skips the - (costly) window frame boundary calculation and the overhead of - calling ``evaluate`` for each output row. - - For example, the `LAG` built in window function does not use - the values of its window frame (it can be computed in one shot - on the entire partition with ``Self::evaluate_all`` regardless of the - window defined in the ``OVER`` clause) - - .. code-block:: text - - lag(x, 1) OVER (ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) - - However, ``avg()`` computes the average in the window and thus - does use its window frame. - - .. code-block:: text - - avg(x) OVER (PARTITION BY y ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) - """ # noqa: W505, E501 - - def evaluate( - self, values: list[pa.Array], eval_range: tuple[int, int] - ) -> pa.Scalar: - """Evaluate window function on a range of rows in an input partition. - - This is the simplest and most general function to implement - but also the least performant as it creates output one row at - a time. It is typically much faster to implement stateful - evaluation using one of the other specialized methods on this - trait. - - Returns a [`ScalarValue`] that is the value of the window - function within `range` for the entire partition. Argument - `values` contains the evaluation result of function arguments - and evaluation results of ORDER BY expressions. If function has a - single argument, `values[1..]` will contain ORDER BY expression results. - """ - - def evaluate_all_with_rank( - self, num_rows: int, ranks_in_partition: list[tuple[int, int]] - ) -> pa.Array: - """Called for window functions that only need the rank of a row. - - Evaluate the partition evaluator against the partition using - the row ranks. For example, ``rank(col("a"))`` produces - - .. code-block:: text - - a | rank - - + ---- - A | 1 - A | 1 - C | 3 - D | 4 - D | 4 - - For this case, `num_rows` would be `5` and the - `ranks_in_partition` would be called with - - .. code-block:: text - - [ - (0,1), - (2,2), - (3,4), - ] - - The user must implement this method if ``include_rank`` returns True. 
- """ - - def supports_bounded_execution(self) -> bool: - """Can the window function be incrementally computed using bounded memory?""" - return False - - def uses_window_frame(self) -> bool: - """Does the window function use the values from the window frame?""" - return False - - def include_rank(self) -> bool: - """Can this function be evaluated with (only) rank?""" - return False - - -class WindowUDF: - """Class for performing window user-defined functions (UDF). - - Window UDFs operate on a partition of rows. See - also :py:class:`ScalarUDF` for operating on a row by row basis. - """ - - def __init__( - self, - name: str, - func: Callable[[], WindowEvaluator], - input_types: list[pa.DataType], - return_type: pa.DataType, - volatility: Volatility | str, - ) -> None: - """Instantiate a user-defined window function (UDWF). - - See :py:func:`udwf` for a convenience function and argument - descriptions. - """ - self._udwf = df_internal.WindowUDF( - name, func, input_types, return_type, str(volatility) - ) - - def __call__(self, *args: Expr) -> Expr: - """Execute the UDWF. - - This function is not typically called by an end user. These calls will - occur during the evaluation of the dataframe. - """ - args_raw = [arg.expr for arg in args] - return Expr(self._udwf.__call__(*args_raw)) - - @overload - @staticmethod - def udwf( - input_types: pa.DataType | list[pa.DataType], - return_type: pa.DataType, - volatility: Volatility | str, - name: Optional[str] = None, - ) -> Callable[..., WindowUDF]: ... - - @overload - @staticmethod - def udwf( - func: Callable[[], WindowEvaluator], - input_types: pa.DataType | list[pa.DataType], - return_type: pa.DataType, - volatility: Volatility | str, - name: Optional[str] = None, - ) -> WindowUDF: ... - - @staticmethod - def udwf(*args: Any, **kwargs: Any): # noqa: D417 - """Create a new User-Defined Window Function (UDWF). - - This class can be used both as a **function** and as a **decorator**. - - Usage: - - **As a function**: Call `udwf(func, input_types, return_type, volatility, - name)`. - - **As a decorator**: Use `@udwf(input_types, return_type, volatility, - name)`. When using `udwf` as a decorator, **do not pass `func` - explicitly**. - - **Function example:** - ``` - import pyarrow as pa - - class BiasedNumbers(WindowEvaluator): - def __init__(self, start: int = 0) -> None: - self.start = start - - def evaluate_all(self, values: list[pa.Array], - num_rows: int) -> pa.Array: - return pa.array([self.start + i for i in range(num_rows)]) - - def bias_10() -> BiasedNumbers: - return BiasedNumbers(10) - - udwf1 = udwf(BiasedNumbers, pa.int64(), pa.int64(), "immutable") - udwf2 = udwf(bias_10, pa.int64(), pa.int64(), "immutable") - udwf3 = udwf(lambda: BiasedNumbers(20), pa.int64(), pa.int64(), "immutable") - - ``` - - **Decorator example:** - ``` - @udwf(pa.int64(), pa.int64(), "immutable") - def biased_numbers() -> BiasedNumbers: - return BiasedNumbers(10) - ``` - - Args: - func: **Only needed when calling as a function. Skip this argument when - using `udwf` as a decorator.** - input_types: The data types of the arguments. - return_type: The data type of the return value. - volatility: See :py:class:`Volatility` for allowed values. - name: A descriptive name for the function. - - Returns: - A user-defined window function that can be used in window function calls. 
- """ - if args and callable(args[0]): - # Case 1: Used as a function, require the first parameter to be callable - return WindowUDF._create_window_udf(*args, **kwargs) - # Case 2: Used as a decorator with parameters - return WindowUDF._create_window_udf_decorator(*args, **kwargs) - - @staticmethod - def _create_window_udf( - func: Callable[[], WindowEvaluator], - input_types: pa.DataType | list[pa.DataType], - return_type: pa.DataType, - volatility: Volatility | str, - name: Optional[str] = None, - ) -> WindowUDF: - """Create a WindowUDF instance from function arguments.""" - if not callable(func): - msg = "`func` must be callable." - raise TypeError(msg) - if not isinstance(func(), WindowEvaluator): - msg = "`func` must implement the abstract base class WindowEvaluator" - raise TypeError(msg) - - name = name or func.__qualname__.lower() - input_types = ( - [input_types] if isinstance(input_types, pa.DataType) else input_types - ) - - return WindowUDF(name, func, input_types, return_type, volatility) - - @staticmethod - def _get_default_name(func: Callable) -> str: - """Get the default name for a function based on its attributes.""" - if hasattr(func, "__qualname__"): - return func.__qualname__.lower() - return func.__class__.__name__.lower() - - @staticmethod - def _normalize_input_types( - input_types: pa.DataType | list[pa.DataType], - ) -> list[pa.DataType]: - """Convert a single DataType to a list if needed.""" - if isinstance(input_types, pa.DataType): - return [input_types] - return input_types - - @staticmethod - def _create_window_udf_decorator( - input_types: pa.DataType | list[pa.DataType], - return_type: pa.DataType, - volatility: Volatility | str, - name: Optional[str] = None, - ) -> Callable[[Callable[[], WindowEvaluator]], Callable[..., Expr]]: - """Create a decorator for a WindowUDF.""" - - def decorator(func: Callable[[], WindowEvaluator]) -> Callable[..., Expr]: - udwf_caller = WindowUDF._create_window_udf( - func, input_types, return_type, volatility, name - ) - - @functools.wraps(func) - def wrapper(*args: Any, **kwargs: Any) -> Expr: - return udwf_caller(*args, **kwargs) - - return wrapper - - return decorator - - -# Convenience exports so we can import instead of treating as -# variables at the package root -udf = ScalarUDF.udf -udaf = AggregateUDF.udaf -udwf = WindowUDF.udwf +warnings.warn( + "The module 'udf' is deprecated and will be removed in the next release. " + "Please use 'user_defined' instead.", + DeprecationWarning, + stacklevel=2, +) diff --git a/python/datafusion/user_defined.py b/python/datafusion/user_defined.py new file mode 100644 index 000000000..dd634c7fb --- /dev/null +++ b/python/datafusion/user_defined.py @@ -0,0 +1,845 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+"""Provides the user-defined functions for evaluation of dataframes."""
+
+from __future__ import annotations
+
+import functools
+from abc import ABCMeta, abstractmethod
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, overload
+
+import pyarrow as pa
+
+import datafusion._internal as df_internal
+from datafusion.expr import Expr
+
+if TYPE_CHECKING:
+    _R = TypeVar("_R", bound=pa.DataType)
+
+
+class Volatility(Enum):
+    """Defines how stable or volatile a function is.
+
+    When setting the volatility of a function, you can either pass this
+    enumeration or a ``str``. The ``str`` equivalent is the lower case value of the
+    name (`"immutable"`, `"stable"`, or `"volatile"`).
+    """
+
+    Immutable = 1
+    """An immutable function will always return the same output when given the
+    same input.
+
+    DataFusion will attempt to inline immutable functions during planning.
+    """
+
+    Stable = 2
+    """
+    Returns the same value for a given input within a single query.
+
+    A stable function may return different values given the same input across
+    different queries but must return the same value for a given input within a
+    query. An example of this is the ``Now`` function. DataFusion will attempt to
+    inline ``Stable`` functions during planning, when possible. For query
+    ``select col1, now() from t1``, it might take a while to execute but ``now()``
+    column will be the same for each output row, which is evaluated during
+    planning.
+    """
+
+    Volatile = 3
+    """A volatile function may change the return value from evaluation to
+    evaluation.
+
+    Multiple invocations of a volatile function may return different results
+    when used in the same query. An example of this is the random() function.
+    DataFusion can not evaluate such functions during planning. In the query
+    ``select col1, random() from t1``, ``random()`` function will be evaluated
+    for each output row, resulting in a unique random value for each row.
+    """
+
+    def __str__(self) -> str:
+        """Returns the string equivalent."""
+        return self.name.lower()
+
+
+class ScalarUDF:
+    """Class for performing scalar user-defined functions (UDF).
+
+    Scalar UDFs operate on a row by row basis. See also :py:class:`AggregateUDF` for
+    operating on a group of rows.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        func: Callable[..., _R],
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: _R,
+        volatility: Volatility | str,
+    ) -> None:
+        """Instantiate a scalar user-defined function (UDF).
+
+        See helper method :py:func:`udf` for argument details.
+        """
+        if isinstance(input_types, pa.DataType):
+            input_types = [input_types]
+        self._udf = df_internal.ScalarUDF(
+            name, func, input_types, return_type, str(volatility)
+        )
+
+    def __repr__(self) -> str:
+        """Print a string representation of the Scalar UDF."""
+        return self._udf.__repr__()
+
+    def __call__(self, *args: Expr) -> Expr:
+        """Execute the UDF.
+
+        This function is not typically called by an end user. These calls will
+        occur during the evaluation of the dataframe.
+        """
+        args_raw = [arg.expr for arg in args]
+        return Expr(self._udf.__call__(*args_raw))
+
+    @overload
+    @staticmethod
+    def udf(
+        input_types: list[pa.DataType],
+        return_type: _R,
+        volatility: Volatility | str,
+        name: Optional[str] = None,
+    ) -> Callable[..., ScalarUDF]: ...
+
+    @overload
+    @staticmethod
+    def udf(
+        func: Callable[..., _R],
+        input_types: list[pa.DataType],
+        return_type: _R,
+        volatility: Volatility | str,
+        name: Optional[str] = None,
+    ) -> ScalarUDF: ...
+
+    @staticmethod
+    def udf(*args: Any, **kwargs: Any):  # noqa: D417
+        """Create a new User-Defined Function (UDF).
+
+        This can be used either as a function or as a decorator.
+
+        Usage:
+        - As a function: ``udf(func, input_types, return_type, volatility, name)``.
+        - As a decorator: ``@udf(input_types, return_type, volatility, name)``.
+          When used as a decorator, do **not** pass ``func`` explicitly.
+
+        Args:
+            func (Callable, optional): Only needed when calling as a function.
+                Skip this argument when using ``udf`` as a decorator.
+            input_types (list[pa.DataType]): The data types of the arguments
+                to ``func``. This list must be of the same length as the number of
+                arguments.
+            return_type (_R): The data type of the return value from the function.
+            volatility (Volatility | str): See `Volatility` for allowed values.
+            name (Optional[str]): A descriptive name for the function.
+
+        Returns:
+            A user-defined function that can be used in SQL expressions,
+            data aggregation, or window function calls.
+
+        Example: Using ``udf`` as a function::
+
+            def double_func(x):
+                return x * 2
+            double_udf = udf(double_func, [pa.int32()], pa.int32(),
+                "volatile", "double_it")
+
+        Example: Using ``udf`` as a decorator::
+
+            @udf([pa.int32()], pa.int32(), "volatile", "double_it")
+            def double_udf(x):
+                return x * 2
+        """
+
+        def _function(
+            func: Callable[..., _R],
+            input_types: list[pa.DataType],
+            return_type: _R,
+            volatility: Volatility | str,
+            name: Optional[str] = None,
+        ) -> ScalarUDF:
+            if not callable(func):
+                msg = "`func` argument must be callable"
+                raise TypeError(msg)
+            if name is None:
+                if hasattr(func, "__qualname__"):
+                    name = func.__qualname__.lower()
+                else:
+                    name = func.__class__.__name__.lower()
+            return ScalarUDF(
+                name=name,
+                func=func,
+                input_types=input_types,
+                return_type=return_type,
+                volatility=volatility,
+            )
+
+        def _decorator(
+            input_types: list[pa.DataType],
+            return_type: _R,
+            volatility: Volatility | str,
+            name: Optional[str] = None,
+        ) -> Callable:
+            def decorator(func: Callable):
+                udf_caller = ScalarUDF.udf(
+                    func, input_types, return_type, volatility, name
+                )
+
+                @functools.wraps(func)
+                def wrapper(*args: Any, **kwargs: Any):
+                    return udf_caller(*args, **kwargs)
+
+                return wrapper
+
+            return decorator
+
+        if args and callable(args[0]):
+            # Case 1: Used as a function, require the first parameter to be callable
+            return _function(*args, **kwargs)
+        # Case 2: Used as a decorator with parameters
+        return _decorator(*args, **kwargs)
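A compact end-to-end sketch of the decorator form (``SessionContext.from_pydict`` and ``col`` are existing APIs; the column and function names are invented)::

    import pyarrow as pa
    import pyarrow.compute as pc
    from datafusion import SessionContext, col, udf

    @udf([pa.int64()], pa.int64(), "immutable", "add_one")
    def add_one(values: pa.Array) -> pa.Array:
        # scalar UDFs are vectorized: they receive and return Arrow arrays
        return pc.add(values, 1)

    ctx = SessionContext()
    df = ctx.from_pydict({"a": [1, 2, 3]})
    df.select(add_one(col("a"))).show()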
+
+
+class Accumulator(metaclass=ABCMeta):
+    """Defines how an :py:class:`AggregateUDF` accumulates values."""
+
+    @abstractmethod
+    def state(self) -> list[pa.Scalar]:
+        """Return the current state."""
+
+    @abstractmethod
+    def update(self, *values: pa.Array) -> None:
+        """Evaluate an array of values and update state."""
+
+    @abstractmethod
+    def merge(self, states: list[pa.Array]) -> None:
+        """Merge a set of states."""
+
+    @abstractmethod
+    def evaluate(self) -> pa.Scalar:
+        """Return the resultant value."""
+
+
+class AggregateUDF:
+    """Class for performing aggregate user-defined functions (UDAF).
+
+    Aggregate UDFs operate on a group of rows and return a single value. See
+    also :py:class:`ScalarUDF` for operating on a row by row basis.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        accumulator: Callable[[], Accumulator],
+        input_types: list[pa.DataType],
+        return_type: pa.DataType,
+        state_type: list[pa.DataType],
+        volatility: Volatility | str,
+    ) -> None:
+        """Instantiate a user-defined aggregate function (UDAF).
+
+        See :py:func:`udaf` for a convenience function and argument
+        descriptions.
+        """
+        self._udaf = df_internal.AggregateUDF(
+            name,
+            accumulator,
+            input_types,
+            return_type,
+            state_type,
+            str(volatility),
+        )
+
+    def __repr__(self) -> str:
+        """Print a string representation of the Aggregate UDF."""
+        return self._udaf.__repr__()
+
+    def __call__(self, *args: Expr) -> Expr:
+        """Execute the UDAF.
+
+        This function is not typically called by an end user. These calls will
+        occur during the evaluation of the dataframe.
+        """
+        args_raw = [arg.expr for arg in args]
+        return Expr(self._udaf.__call__(*args_raw))
+
+    @overload
+    @staticmethod
+    def udaf(
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: pa.DataType,
+        state_type: list[pa.DataType],
+        volatility: Volatility | str,
+        name: Optional[str] = None,
+    ) -> Callable[..., AggregateUDF]: ...
+
+    @overload
+    @staticmethod
+    def udaf(
+        accum: Callable[[], Accumulator],
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: pa.DataType,
+        state_type: list[pa.DataType],
+        volatility: Volatility | str,
+        name: Optional[str] = None,
+    ) -> AggregateUDF: ...
+
+    @staticmethod
+    def udaf(*args: Any, **kwargs: Any):  # noqa: D417
+        """Create a new User-Defined Aggregate Function (UDAF).
+
+        This allows you to define an aggregate function that can be used in
+        data aggregation or window function calls.
+
+        Usage:
+        - As a function: ``udaf(accum, input_types, return_type, state_type, volatility, name)``.
+        - As a decorator: ``@udaf(input_types, return_type, state_type, volatility, name)``.
+          When using ``udaf`` as a decorator, do not pass ``accum`` explicitly.
+
+        Function example:
+
+        If your :py:class:`Accumulator` can be instantiated with no arguments, you
+        can simply pass its type as ``accum``. If you need to pass additional
+        arguments to its constructor, you can define a lambda or a factory method.
+        During runtime the :py:class:`Accumulator` will be constructed for every
+        instance in which this UDAF is used. The following examples are all valid::
+
+            import pyarrow as pa
+            import pyarrow.compute as pc
+
+            class Summarize(Accumulator):
+                def __init__(self, bias: float = 0.0):
+                    self._sum = pa.scalar(bias)
+
+                def state(self) -> list[pa.Scalar]:
+                    return [self._sum]
+
+                def update(self, values: pa.Array) -> None:
+                    self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py())
+
+                def merge(self, states: list[pa.Array]) -> None:
+                    self._sum = pa.scalar(self._sum.as_py() + pc.sum(states[0]).as_py())
+
+                def evaluate(self) -> pa.Scalar:
+                    return self._sum
+
+            def sum_bias_10() -> Summarize:
+                return Summarize(10.0)
+
+            udaf1 = udaf(Summarize, pa.float64(), pa.float64(), [pa.float64()],
+                "immutable")
+            udaf2 = udaf(sum_bias_10, pa.float64(), pa.float64(), [pa.float64()],
+                "immutable")
+            udaf3 = udaf(lambda: Summarize(20.0), pa.float64(), pa.float64(),
+                [pa.float64()], "immutable")
+
+        Decorator example::
+
+            @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable")
+            def udf4() -> Summarize:
+                return Summarize(10.0)
+
+        Args:
+            accum: The accumulator python function. Only needed when calling as a
+                function. Skip this argument when using ``udaf`` as a decorator.
+            input_types: The data types of the arguments to ``accum``.
+            return_type: The data type of the return value.
+            state_type: The data types of the intermediate accumulation.
+            volatility: See :py:class:`Volatility` for allowed values.
+            name: A descriptive name for the function.
+
+        Returns:
+            A user-defined aggregate function, which can be used in either data
+            aggregation or window function calls.
+        """  # noqa: E501, W505
+
+        def _function(
+            accum: Callable[[], Accumulator],
+            input_types: pa.DataType | list[pa.DataType],
+            return_type: pa.DataType,
+            state_type: list[pa.DataType],
+            volatility: Volatility | str,
+            name: Optional[str] = None,
+        ) -> AggregateUDF:
+            if not callable(accum):
+                msg = "`func` must be callable."
+                raise TypeError(msg)
+            if not isinstance(accum(), Accumulator):
+                msg = "Accumulator must implement the abstract base class Accumulator"
+                raise TypeError(msg)
+            if name is None:
+                name = accum().__class__.__qualname__.lower()
+            if isinstance(input_types, pa.DataType):
+                input_types = [input_types]
+            return AggregateUDF(
+                name=name,
+                accumulator=accum,
+                input_types=input_types,
+                return_type=return_type,
+                state_type=state_type,
+                volatility=volatility,
+            )
+
+        def _decorator(
+            input_types: pa.DataType | list[pa.DataType],
+            return_type: pa.DataType,
+            state_type: list[pa.DataType],
+            volatility: Volatility | str,
+            name: Optional[str] = None,
+        ) -> Callable[..., Callable[..., Expr]]:
+            def decorator(accum: Callable[[], Accumulator]) -> Callable[..., Expr]:
+                udaf_caller = AggregateUDF.udaf(
+                    accum, input_types, return_type, state_type, volatility, name
+                )
+
+                @functools.wraps(accum)
+                def wrapper(*args: Any, **kwargs: Any) -> Expr:
+                    return udaf_caller(*args, **kwargs)
+
+                return wrapper
+
+            return decorator
+
+        if args and callable(args[0]):
+            # Case 1: Used as a function, require the first parameter to be callable
+            return _function(*args, **kwargs)
+        # Case 2: Used as a decorator with parameters
+        return _decorator(*args, **kwargs)
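Building on the ``Summarize`` accumulator defined in the docstring above, a hedged usage sketch (``aggregate`` and ``from_pydict`` are existing APIs; assumes ``Summarize`` is in scope)::

    import pyarrow as pa
    from datafusion import SessionContext, col, udaf

    summarize = udaf(
        Summarize, pa.float64(), pa.float64(), [pa.float64()], "immutable"
    )

    ctx = SessionContext()
    df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
    # aggregate the whole frame with the user-defined accumulator
    df.aggregate([], [summarize(col("a"))]).show()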
+
+
+class WindowEvaluator:
+    """Evaluator class for user-defined window functions (UDWF).
+
+    It is up to the user to decide which evaluate function is appropriate.
+
+    +------------------------+--------------------------------+------------------+---------------------------+
+    | ``uses_window_frame``  | ``supports_bounded_execution`` | ``include_rank`` | function_to_implement     |
+    +========================+================================+==================+===========================+
+    | False (default)        | False (default)                | False (default)  | ``evaluate_all``          |
+    +------------------------+--------------------------------+------------------+---------------------------+
+    | False                  | True                           | False            | ``evaluate``              |
+    +------------------------+--------------------------------+------------------+---------------------------+
+    | False                  | True/False                     | True             | ``evaluate_all_with_rank``|
+    +------------------------+--------------------------------+------------------+---------------------------+
+    | True                   | True/False                     | True/False       | ``evaluate``              |
+    +------------------------+--------------------------------+------------------+---------------------------+
+    """  # noqa: W505, E501
+
+    def memoize(self) -> None:
+        """Perform a memoize operation to improve performance.
+
+        When the window frame has a fixed beginning (e.g UNBOUNDED
+        PRECEDING), some functions such as FIRST_VALUE and
+        NTH_VALUE do not need the (unbounded) input once they have
+        seen a certain amount of input.
+
+        `memoize` is called after each input batch is processed, and
+        such functions can save whatever they need.
+        """
+
+    def get_range(self, idx: int, num_rows: int) -> tuple[int, int]:  # noqa: ARG002
+        """Return the range for the window function.
+
+        If the `uses_window_frame` flag is `false`, this method is used to
+        calculate the required range for the window function during
+        stateful execution.
+
+        Generally there is no required range, hence by default this
+        returns the smallest range (current row), e.g. seeing the current
+        row is enough to calculate the window result (such as row_number,
+        rank, etc.).
+
+        Args:
+            idx: Current index.
+            num_rows: Number of rows.
+        """
+        return (idx, idx + 1)
+
+    def is_causal(self) -> bool:
+        """Get whether evaluator needs future data for its result."""
+        return False
+
+    def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
+        """Evaluate a window function on an entire input partition.
+
+        This function is called once per input *partition* for window functions that
+        *do not use* values from the window frame, such as
+        :py:func:`~datafusion.functions.row_number`,
+        :py:func:`~datafusion.functions.rank`,
+        :py:func:`~datafusion.functions.dense_rank`,
+        :py:func:`~datafusion.functions.percent_rank`,
+        :py:func:`~datafusion.functions.cume_dist`,
+        :py:func:`~datafusion.functions.lead`,
+        and :py:func:`~datafusion.functions.lag`.
+
+        It produces the result of all rows in a single pass. It
+        expects to receive the entire partition as the ``value`` and
+        must produce an output column with one output row for every
+        input row.
+
+        ``num_rows`` is required to correctly compute the output in case
+        ``len(values) == 0``
+
+        Implementing this function is an optimization. Certain window
+        functions are not affected by the window frame definition or
+        the query doesn't have a frame, and ``evaluate`` skips the
+        (costly) window frame boundary calculation and the overhead of
+        calling ``evaluate`` for each output row.
+
+        For example, the `LAG` built in window function does not use
+        the values of its window frame (it can be computed in one shot
+        on the entire partition with ``Self::evaluate_all`` regardless of the
+        window defined in the ``OVER`` clause)
+
+        .. code-block:: text
+
+            lag(x, 1) OVER (ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING)
+
+        However, ``avg()`` computes the average in the window and thus
+        does use its window frame.
+
+        .. code-block:: text
+
+            avg(x) OVER (PARTITION BY y ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING)
+        """  # noqa: W505, E501
+
+    def evaluate(
+        self, values: list[pa.Array], eval_range: tuple[int, int]
+    ) -> pa.Scalar:
+        """Evaluate window function on a range of rows in an input partition.
+
+        This is the simplest and most general function to implement
+        but also the least performant as it creates output one row at
+        a time. It is typically much faster to implement stateful
+        evaluation using one of the other specialized methods on this
+        trait.
+
+        Returns a [`ScalarValue`] that is the value of the window
+        function within `range` for the entire partition. Argument
+        `values` contains the evaluation result of function arguments
+        and evaluation results of ORDER BY expressions. If function has a
+        single argument, `values[1..]` will contain ORDER BY expression results.
+        """
+
+    def evaluate_all_with_rank(
+        self, num_rows: int, ranks_in_partition: list[tuple[int, int]]
+    ) -> pa.Array:
+        """Called for window functions that only need the rank of a row.
+
+        Evaluate the partition evaluator against the partition using
+        the row ranks. For example, ``rank(col("a"))`` produces
+
+        .. code-block:: text
+
+            a | rank
+            - + ----
+            A | 1
+            A | 1
+            C | 3
+            D | 4
+            D | 4
+
+        For this case, `num_rows` would be `5` and the
+        `ranks_in_partition` would be called with
+
+        .. code-block:: text
+
+            [
+                (0,1),
+                (2,2),
+                (3,4),
+            ]
+
+        The user must implement this method if ``include_rank`` returns True.
+        """
+
+    def supports_bounded_execution(self) -> bool:
+        """Can the window function be incrementally computed using bounded memory?"""
+        return False
+
+    def uses_window_frame(self) -> bool:
+        """Does the window function use the values from the window frame?"""
+        return False
+
+    def include_rank(self) -> bool:
+        """Can this function be evaluated with (only) rank?"""
+        return False
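Per the table above, a bounded evaluator that does not use the window frame implements ``evaluate``; a minimal running-count sketch under that assumption (the class, names, and data are invented)::

    import pyarrow as pa
    from datafusion import SessionContext, col, udwf
    from datafusion.user_defined import WindowEvaluator

    class RunningCount(WindowEvaluator):
        def supports_bounded_execution(self) -> bool:
            return True

        def get_range(self, idx: int, num_rows: int) -> tuple[int, int]:
            # look at everything from the partition start through the current row
            return (0, idx + 1)

        def evaluate(
            self, values: list[pa.Array], eval_range: tuple[int, int]
        ) -> pa.Scalar:
            # called once per output row with the range chosen by get_range
            start, stop = eval_range
            return pa.scalar(stop - start)

    running_count = udwf(RunningCount, pa.int64(), pa.int64(), "immutable")

    ctx = SessionContext()
    df = ctx.from_pydict({"a": [10, 20, 30]})
    df.select(running_count(col("a"))).show()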
+ + Returns: + A user-defined aggregate function, which can be used in either data + aggregation or window function calls. + """ # noqa: E501 W505 + + def _function( + accum: Callable[[], Accumulator], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], + volatility: Volatility | str, + name: Optional[str] = None, + ) -> AggregateUDF: + if not callable(accum): + msg = "`func` must be callable." + raise TypeError(msg) + if not isinstance(accum(), Accumulator): + msg = "Accumulator must implement the abstract base class Accumulator" + raise TypeError(msg) + if name is None: + name = accum().__class__.__qualname__.lower() + if isinstance(input_types, pa.DataType): + input_types = [input_types] + return AggregateUDF( + name=name, + accumulator=accum, + input_types=input_types, + return_type=return_type, + state_type=state_type, + volatility=volatility, + ) + + def _decorator( + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[..., Callable[..., Expr]]: + def decorator(accum: Callable[[], Accumulator]) -> Callable[..., Expr]: + udaf_caller = AggregateUDF.udaf( + accum, input_types, return_type, state_type, volatility, name + ) + + @functools.wraps(accum) + def wrapper(*args: Any, **kwargs: Any) -> Expr: + return udaf_caller(*args, **kwargs) + + return wrapper + + return decorator + + if args and callable(args[0]): + # Case 1: Used as a function, require the first parameter to be callable + return _function(*args, **kwargs) + # Case 2: Used as a decorator with parameters + return _decorator(*args, **kwargs) + + +class WindowEvaluator: + """Evaluator class for user-defined window functions (UDWF). + + It is up to the user to decide which evaluate function is appropriate. + + +------------------------+--------------------------------+------------------+---------------------------+ + | ``uses_window_frame`` | ``supports_bounded_execution`` | ``include_rank`` | function_to_implement | + +========================+================================+==================+===========================+ + | False (default) | False (default) | False (default) | ``evaluate_all`` | + +------------------------+--------------------------------+------------------+---------------------------+ + | False | True | False | ``evaluate`` | + +------------------------+--------------------------------+------------------+---------------------------+ + | False | True/False | True | ``evaluate_all_with_rank``| + +------------------------+--------------------------------+------------------+---------------------------+ + | True | True/False | True/False | ``evaluate`` | + +------------------------+--------------------------------+------------------+---------------------------+ + """ # noqa: W505, E501 + + def memoize(self) -> None: + """Perform a memoize operation to improve performance. + + When the window frame has a fixed beginning (e.g UNBOUNDED + PRECEDING), some functions such as FIRST_VALUE and + NTH_VALUE do not need the (unbounded) input once they have + seen a certain amount of input. + + `memoize` is called after each input batch is processed, and + such functions can save whatever they need + """ + + def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: # noqa: ARG002 + """Return the range for the window fuction. + + If `uses_window_frame` flag is `false`. 
This method is used to + calculate required range for the window function during + stateful execution. + + Generally there is no required range, hence by default this + returns smallest range(current row). e.g seeing current row is + enough to calculate window result (such as row_number, rank, + etc) + + Args: + idx:: Current index + num_rows: Number of rows. + """ + return (idx, idx + 1) + + def is_causal(self) -> bool: + """Get whether evaluator needs future data for its result.""" + return False + + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + """Evaluate a window function on an entire input partition. + + This function is called once per input *partition* for window functions that + *do not use* values from the window frame, such as + :py:func:`~datafusion.functions.row_number`, + :py:func:`~datafusion.functions.rank`, + :py:func:`~datafusion.functions.dense_rank`, + :py:func:`~datafusion.functions.percent_rank`, + :py:func:`~datafusion.functions.cume_dist`, + :py:func:`~datafusion.functions.lead`, + and :py:func:`~datafusion.functions.lag`. + + It produces the result of all rows in a single pass. It + expects to receive the entire partition as the ``value`` and + must produce an output column with one output row for every + input row. + + ``num_rows`` is required to correctly compute the output in case + ``len(values) == 0`` + + Implementing this function is an optimization. Certain window + functions are not affected by the window frame definition or + the query doesn't have a frame, and ``evaluate`` skips the + (costly) window frame boundary calculation and the overhead of + calling ``evaluate`` for each output row. + + For example, the `LAG` built in window function does not use + the values of its window frame (it can be computed in one shot + on the entire partition with ``Self::evaluate_all`` regardless of the + window defined in the ``OVER`` clause) + + .. code-block:: text + + lag(x, 1) OVER (ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) + + However, ``avg()`` computes the average in the window and thus + does use its window frame. + + .. code-block:: text + + avg(x) OVER (PARTITION BY y ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) + """ # noqa: W505, E501 + + def evaluate( + self, values: list[pa.Array], eval_range: tuple[int, int] + ) -> pa.Scalar: + """Evaluate window function on a range of rows in an input partition. + + This is the simplest and most general function to implement + but also the least performant as it creates output one row at + a time. It is typically much faster to implement stateful + evaluation using one of the other specialized methods on this + trait. + + Returns a [`ScalarValue`] that is the value of the window + function within `range` for the entire partition. Argument + `values` contains the evaluation result of function arguments + and evaluation results of ORDER BY expressions. If function has a + single argument, `values[1..]` will contain ORDER BY expression results. + """ + + def evaluate_all_with_rank( + self, num_rows: int, ranks_in_partition: list[tuple[int, int]] + ) -> pa.Array: + """Called for window functions that only need the rank of a row. + + Evaluate the partition evaluator against the partition using + the row ranks. For example, ``rank(col("a"))`` produces + + .. code-block:: text + + a | rank + - + ---- + A | 1 + A | 1 + C | 3 + D | 4 + D | 4 + + For this case, `num_rows` would be `5` and the + `ranks_in_partition` would be called with + + .. 
+        """
+
+    def evaluate_all_with_rank(
+        self, num_rows: int, ranks_in_partition: list[tuple[int, int]]
+    ) -> pa.Array:
+        """Called for window functions that only need the rank of a row.
+
+        Evaluate the partition evaluator against the partition using
+        the row ranks. For example, ``rank(col("a"))`` produces
+
+        .. code-block:: text
+
+            a | rank
+            - + ----
+            A | 1
+            A | 1
+            C | 3
+            D | 4
+            D | 4
+
+        For this case, `num_rows` would be `5` and the
+        `ranks_in_partition` would be called with
+
+        .. code-block:: text
+
+            [
+                (0,2),
+                (2,3),
+                (3,5),
+            ]
+
+        The user must implement this method if ``include_rank`` returns True.
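+
+        A sketch that reproduces ``rank`` from these ranges (illustrative
+        only)::
+
+            class MyRank(WindowEvaluator):
+                def include_rank(self) -> bool:
+                    return True
+
+                def evaluate_all_with_rank(
+                    self, num_rows: int, ranks_in_partition: list[tuple[int, int]]
+                ) -> pa.Array:
+                    out: list[int] = []
+                    for start, stop in ranks_in_partition:
+                        # All rows in a group of equal values share the
+                        # 1-based position of the group's first row.
+                        out.extend([start + 1] * (stop - start))
+                    return pa.array(out)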
+        """
+
+    def supports_bounded_execution(self) -> bool:
+        """Can the window function be incrementally computed using bounded memory?"""
+        return False
+
+    def uses_window_frame(self) -> bool:
+        """Does the window function use the values from the window frame?"""
+        return False
+
+    def include_rank(self) -> bool:
+        """Can this function be evaluated with (only) rank?"""
+        return False
+
+
+class WindowUDF:
+    """Class for performing window user-defined functions (UDWF).
+
+    Window UDFs operate on a partition of rows. See
+    also :py:class:`ScalarUDF` for operating on a row by row basis.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        func: Callable[[], WindowEvaluator],
+        input_types: list[pa.DataType],
+        return_type: pa.DataType,
+        volatility: Volatility | str,
+    ) -> None:
+        """Instantiate a user-defined window function (UDWF).
+
+        See :py:func:`udwf` for a convenience function and argument
+        descriptions.
+        """
+        self._udwf = df_internal.WindowUDF(
+            name, func, input_types, return_type, str(volatility)
+        )
+
+    def __repr__(self) -> str:
+        """Print a string representation of the Window UDF."""
+        return self._udwf.__repr__()
+
+    def __call__(self, *args: Expr) -> Expr:
+        """Execute the UDWF.
+
+        This function is not typically called by an end user. These calls will
+        occur during the evaluation of the dataframe.
+        """
+        args_raw = [arg.expr for arg in args]
+        return Expr(self._udwf.__call__(*args_raw))
+
+    @overload
+    @staticmethod
+    def udwf(
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: pa.DataType,
+        volatility: Volatility | str,
+        name: Optional[str] = None,
+    ) -> Callable[..., WindowUDF]: ...
+
+    @overload
+    @staticmethod
+    def udwf(
+        func: Callable[[], WindowEvaluator],
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: pa.DataType,
+        volatility: Volatility | str,
+        name: Optional[str] = None,
+    ) -> WindowUDF: ...
+
+    @staticmethod
+    def udwf(*args: Any, **kwargs: Any):  # noqa: D417
+        """Create a new User-Defined Window Function (UDWF).
+
+        ``udwf`` can be used either as a function or as a decorator.
+
+        Usage:
+        - As a function: ``udwf(func, input_types, return_type, volatility, name)``.
+        - As a decorator: ``@udwf(input_types, return_type, volatility, name)``.
+          When using ``udwf`` as a decorator, do not pass ``func`` explicitly.
+
+        Function example::
+
+            import pyarrow as pa
+
+            class BiasedNumbers(WindowEvaluator):
+                def __init__(self, start: int = 0) -> None:
+                    self.start = start
+
+                def evaluate_all(
+                    self, values: list[pa.Array], num_rows: int
+                ) -> pa.Array:
+                    return pa.array([self.start + i for i in range(num_rows)])
+
+            def bias_10() -> BiasedNumbers:
+                return BiasedNumbers(10)
+
+            udwf1 = udwf(BiasedNumbers, pa.int64(), pa.int64(), "immutable")
+            udwf2 = udwf(bias_10, pa.int64(), pa.int64(), "immutable")
+            udwf3 = udwf(lambda: BiasedNumbers(20), pa.int64(), pa.int64(), "immutable")
+
+        Decorator example::
+
+            @udwf(pa.int64(), pa.int64(), "immutable")
+            def biased_numbers() -> BiasedNumbers:
+                return BiasedNumbers(10)
+
+        Args:
+            func: Only needed when calling as a function. Skip this argument when
+                using ``udwf`` as a decorator.
+            input_types: The data types of the arguments.
+            return_type: The data type of the return value.
+            volatility: See :py:class:`Volatility` for allowed values.
+            name: A descriptive name for the function.
+
+        Returns:
+            A user-defined window function that can be used in window function calls.
+        """
+        if args and callable(args[0]):
+            # Case 1: Used as a function, require the first parameter to be callable
+            return WindowUDF._create_window_udf(*args, **kwargs)
+        # Case 2: Used as a decorator with parameters
+        return WindowUDF._create_window_udf_decorator(*args, **kwargs)
+
+    @staticmethod
+    def _create_window_udf(
+        func: Callable[[], WindowEvaluator],
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: pa.DataType,
+        volatility: Volatility | str,
+        name: Optional[str] = None,
+    ) -> WindowUDF:
+        """Create a WindowUDF instance from function arguments."""
+        if not callable(func):
+            msg = "`func` must be callable."
+            raise TypeError(msg)
+        if not isinstance(func(), WindowEvaluator):
+            msg = "`func` must implement the abstract base class WindowEvaluator"
+            raise TypeError(msg)
+
+        name = name or func.__qualname__.lower()
+        input_types = (
+            [input_types] if isinstance(input_types, pa.DataType) else input_types
+        )
+
+        return WindowUDF(name, func, input_types, return_type, volatility)
+
+    @staticmethod
+    def _get_default_name(func: Callable) -> str:
+        """Get the default name for a function based on its attributes."""
+        if hasattr(func, "__qualname__"):
+            return func.__qualname__.lower()
+        return func.__class__.__name__.lower()
+
+    @staticmethod
+    def _normalize_input_types(
+        input_types: pa.DataType | list[pa.DataType],
+    ) -> list[pa.DataType]:
+        """Convert a single DataType to a list if needed."""
+        if isinstance(input_types, pa.DataType):
+            return [input_types]
+        return input_types
+
+    @staticmethod
+    def _create_window_udf_decorator(
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: pa.DataType,
+        volatility: Volatility | str,
+        name: Optional[str] = None,
+    ) -> Callable[[Callable[[], WindowEvaluator]], Callable[..., Expr]]:
+        """Create a decorator for a WindowUDF."""
+
+        def decorator(func: Callable[[], WindowEvaluator]) -> Callable[..., Expr]:
+            udwf_caller = WindowUDF._create_window_udf(
+                func, input_types, return_type, volatility, name
+            )
+
+            @functools.wraps(func)
+            def wrapper(*args: Any, **kwargs: Any) -> Expr:
+                return udwf_caller(*args, **kwargs)
+
+            return wrapper
+
+        return decorator
+
+
+class TableFunction:
+    """Class for performing user-defined table functions (UDTF).
+
+    Table functions generate new table providers based on the
+    input expressions.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        func: Callable[[], Any],
+    ) -> None:
+        """Instantiate a user-defined table function (UDTF).
+
+        See :py:func:`udtf` for a convenience function and argument
+        descriptions.
+        """
+        self._udtf = df_internal.TableFunction(name, func)
+
+    def __call__(self, *args: Expr) -> Any:
+        """Execute the UDTF and return a table provider."""
+        args_raw = [arg.expr for arg in args]
+        return self._udtf.__call__(*args_raw)
+
+    @overload
+    @staticmethod
+    def udtf(
+        name: str,
+    ) -> Callable[..., Any]: ...
+
+    @overload
+    @staticmethod
+    def udtf(
+        func: Callable[[], Any],
+        name: str,
+    ) -> TableFunction: ...
+
+    @staticmethod
+    def udtf(*args: Any, **kwargs: Any):
+        """Create a new User-Defined Table Function (UDTF).
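+
+        ``udtf`` can be used as a function or as a decorator. A sketch of the
+        decorator form (illustrative only; ``MyTableProvider`` stands in for
+        any object exposing ``__datafusion_table_provider__``)::
+
+            @udtf("my_table")
+            def my_table_func():
+                return MyTableProvider()
+
+            # my_table_func is now a TableFunction named "my_table" that
+            # returns the provider when evaluated.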
+        """
+        if args and callable(args[0]):
+            # Case 1: Used as a function, require the first parameter to be callable
+            return TableFunction._create_table_udf(*args, **kwargs)
+        if args and hasattr(args[0], "__datafusion_table_function__"):
+            # Case 2: We have a datafusion FFI provided function
+            return TableFunction(args[1], args[0])
+        # Case 3: Used as a decorator with parameters
+        return TableFunction._create_table_udf_decorator(*args, **kwargs)
+
+    @staticmethod
+    def _create_table_udf(
+        func: Callable[..., Any],
+        name: str,
+    ) -> TableFunction:
+        """Create a TableFunction instance from function arguments."""
+        if not callable(func):
+            msg = "`func` must be callable."
+            raise TypeError(msg)
+
+        return TableFunction(name, func)
+
+    @staticmethod
+    def _create_table_udf_decorator(
+        name: Optional[str] = None,
+    ) -> Callable[[Callable[..., Any]], TableFunction]:
+        """Create a decorator for a TableFunction."""
+
+        def decorator(func: Callable[..., Any]) -> TableFunction:
+            return TableFunction._create_table_udf(func, name)
+
+        return decorator
+
+    def __repr__(self) -> str:
+        """User printable representation."""
+        return self._udtf.__repr__()
+
+
+# Convenience exports so we can import instead of treating as
+# variables at the package root
+udf = ScalarUDF.udf
+udaf = AggregateUDF.udaf
+udwf = WindowUDF.udwf
+udtf = TableFunction.udtf
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index daa4331df..404ce9545 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -14,8 +14,12 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import ctypes
+import datetime
 import os
 import re
+import threading
+import time
 from typing import Any
 
 import pyarrow as pa
@@ -136,6 +140,38 @@ def clean_formatter_state():
     reset_formatter()
 
 
+@pytest.fixture
+def null_df():
+    """Create a DataFrame with null values of different types."""
+    ctx = SessionContext()
+
+    # Create a RecordBatch with nulls across different types
+    batch = pa.RecordBatch.from_arrays(
+        [
+            pa.array([1, None, 3, None], type=pa.int64()),
+            pa.array([4.5, 6.7, None, None], type=pa.float64()),
+            pa.array(["a", None, "c", None], type=pa.string()),
+            pa.array([True, None, False, None], type=pa.bool_()),
+            pa.array(
+                [10957, None, 18993, None], type=pa.date32()
+            ),  # 2000-01-01, null, 2022-01-01, null
+            pa.array(
+                [946684800000, None, 1640995200000, None], type=pa.date64()
+            ),  # 2000-01-01, null, 2022-01-01, null
+        ],
+        names=[
+            "int_col",
+            "float_col",
+            "str_col",
+            "bool_col",
+            "date32_col",
+            "date64_col",
+        ],
+    )
+
+    return ctx.create_dataframe([[batch]])
+
+
 # custom style for testing with html formatter
 class CustomStyleProvider:
     def get_cell_style(self) -> str:
@@ -2168,3 +2204,354 @@ def test_html_formatter_manual_format_html(clean_formatter_state):
     assert "