69 changes: 35 additions & 34 deletions Cargo.lock

Some generated files are not rendered by default.

14 changes: 7 additions & 7 deletions Cargo.toml
@@ -90,19 +90,19 @@ ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
apache-avro = { version = "0.20", default-features = false }
arrow = { version = "56.0.0", features = [
arrow = { version = "56.1.0", features = [
"prettyprint",
"chrono-tz",
] }
arrow-buffer = { version = "56.0.0", default-features = false }
arrow-flight = { version = "56.0.0", features = [
arrow-buffer = { version = "56.1.0", default-features = false }
arrow-flight = { version = "56.1.0", features = [
"flight-sql-experimental",
] }
arrow-ipc = { version = "56.0.0", default-features = false, features = [
arrow-ipc = { version = "56.1.0", default-features = false, features = [
"lz4",
] }
arrow-ord = { version = "56.0.0", default-features = false }
arrow-schema = { version = "56.0.0", default-features = false }
arrow-ord = { version = "56.1.0", default-features = false }
arrow-schema = { version = "56.1.0", default-features = false }
async-trait = "0.1.89"
bigdecimal = "0.4.8"
bytes = "1.10"
@@ -157,7 +157,7 @@ itertools = "0.14"
log = "^0.4"
object_store = { version = "0.12.3", default-features = false }
parking_lot = "0.12"
parquet = { version = "56.0.0", default-features = false, features = [
parquet = { version = "56.1.0", default-features = false, features = [
"arrow",
"async",
"object_store",
12 changes: 6 additions & 6 deletions datafusion-cli/src/main.rs
@@ -571,15 +571,15 @@ mod tests {
let df = ctx.sql(sql).await?;
let rbs = df.collect().await?;

assert_snapshot!(batches_to_string(&rbs),@r#"
assert_snapshot!(batches_to_string(&rbs),@r"
+-----------------------------------+-----------------+---------------------+------+------------------+
| filename | file_size_bytes | metadata_size_bytes | hits | extra |
+-----------------------------------+-----------------+---------------------+------+------------------+
| alltypes_plain.parquet | 1851 | 10181 | 2 | page_index=false |
-| alltypes_tiny_pages.parquet | 454233 | 881634 | 2 | page_index=true |
+| alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true |
Contributor Author: I don't really know why the in-memory size of the ParquetMetadata has decreased, but it seems like a good improvement to me

| lz4_raw_compressed_larger.parquet | 380836 | 2939 | 2 | page_index=false |
+-----------------------------------+-----------------+---------------------+------+------------------+
"#);
");

// increase the number of hits
ctx.sql("select * from alltypes_plain")
@@ -602,15 +602,15 @@ mod tests {
let df = ctx.sql(sql).await?;
let rbs = df.collect().await?;

assert_snapshot!(batches_to_string(&rbs),@r#"
assert_snapshot!(batches_to_string(&rbs),@r"
+-----------------------------------+-----------------+---------------------+------+------------------+
| filename | file_size_bytes | metadata_size_bytes | hits | extra |
+-----------------------------------+-----------------+---------------------+------+------------------+
| alltypes_plain.parquet | 1851 | 10181 | 5 | page_index=false |
-| alltypes_tiny_pages.parquet | 454233 | 881634 | 2 | page_index=true |
+| alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true |
| lz4_raw_compressed_larger.parquet | 380836 | 2939 | 3 | page_index=false |
+-----------------------------------+-----------------+---------------------+------+------------------+
"#);
");

Ok(())
}
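For context on the snapshot changes above: `metadata_size_bytes` reports the in-memory size of the decoded Parquet metadata, which shrank slightly between arrow-rs 56.0.0 and 56.1.0. A minimal sketch of how such a number can be obtained with the parquet crate's `ParquetMetaData::memory_size()`; the file path is a placeholder, and the exact figure also depends on whether the page index is loaded:

```rust
use std::fs::File;

use parquet::file::reader::{FileReader, SerializedFileReader};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Placeholder path; any local parquet file works.
    let file = File::open("alltypes_tiny_pages.parquet")?;
    let reader = SerializedFileReader::new(file)?;

    // Estimated heap footprint of the decoded metadata. An estimate like
    // this is what moved from 881634 to 881418 bytes in the snapshot above
    // (there, with the page index included).
    println!(
        "in-memory metadata size: {} bytes",
        reader.metadata().memory_size()
    );
    Ok(())
}
```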
8 changes: 8 additions & 0 deletions datafusion/common/src/config.rs
@@ -566,6 +566,14 @@ config_namespace! {
/// (reading) Use any available bloom filters when reading parquet files
pub bloom_filter_on_read: bool, default = true

+/// (reading) The maximum predicate cache size, in bytes. When
+/// `pushdown_filters` is enabled, sets the maximum memory used to cache
+/// the results of predicate evaluation between filter evaluation and
+/// output generation. Decreasing this value will reduce memory usage,
+/// but may increase IO and CPU usage. None means use the default
+/// parquet reader setting. 0 means no caching.
+pub max_predicate_cache_size: Option<usize>, default = None

// The following options affect writing to parquet files
// and map to parquet::file::properties::WriterProperties

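A hedged sketch of how an application might exercise the new knob through the `ConfigOptions` field this hunk adds (the table name, file name, and 1 MiB figure are placeholders, not part of the PR):

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let mut config = SessionConfig::new();
    // The predicate cache only comes into play when filter pushdown is on.
    config.options_mut().execution.parquet.pushdown_filters = true;
    // Cap the cache at 1 MiB. Per the docs above, Some(0) disables caching
    // and None defers to the parquet reader's built-in default.
    config.options_mut().execution.parquet.max_predicate_cache_size =
        Some(1024 * 1024);

    let ctx = SessionContext::new_with_config(config);
    // Placeholder table and query.
    ctx.register_parquet("t", "data.parquet", ParquetReadOptions::default())
        .await?;
    ctx.sql("SELECT count(*) FROM t WHERE id > 100").await?.show().await?;
    Ok(())
}
```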
13 changes: 8 additions & 5 deletions datafusion/common/src/file_options/parquet_writer.rs
@@ -208,6 +208,7 @@ impl ParquetOptions {
binary_as_string: _, // not used for writer props
coerce_int96: _, // not used for writer props
skip_arrow_metadata: _,
+max_predicate_cache_size: _,
} = self;

let mut builder = WriterProperties::builder()
@@ -400,6 +401,10 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result<EnabledStatis
#[cfg(feature = "parquet")]
#[cfg(test)]
mod tests {
+use super::*;
+use crate::config::{ParquetColumnOptions, ParquetEncryptionOptions, ParquetOptions};
+#[cfg(feature = "parquet_encryption")]
+use crate::encryption::map_encryption_to_config_encryption;
use parquet::{
basic::Compression,
file::properties::{
@@ -409,11 +414,6 @@
};
use std::collections::HashMap;

-use super::*;
-use crate::config::{ParquetColumnOptions, ParquetEncryptionOptions, ParquetOptions};
-#[cfg(feature = "parquet_encryption")]
-use crate::encryption::map_encryption_to_config_encryption;
-
const COL_NAME: &str = "configured";

/// Take the column defaults provided in [`ParquetOptions`], and generate a non-default col config.
@@ -475,6 +475,7 @@ mod tests {
binary_as_string: defaults.binary_as_string,
skip_arrow_metadata: defaults.skip_arrow_metadata,
coerce_int96: None,
+max_predicate_cache_size: defaults.max_predicate_cache_size,
}
}

@@ -581,6 +582,8 @@
maximum_buffered_record_batches_per_stream: global_options_defaults
.maximum_buffered_record_batches_per_stream,
bloom_filter_on_read: global_options_defaults.bloom_filter_on_read,
+max_predicate_cache_size: global_options_defaults
+    .max_predicate_cache_size,
schema_force_view_types: global_options_defaults.schema_force_view_types,
binary_as_string: global_options_defaults.binary_as_string,
skip_arrow_metadata: global_options_defaults.skip_arrow_metadata,
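The reason writer-side code has to mention a reader-only option at all is the exhaustive destructuring used when converting `ParquetOptions` into `WriterProperties`: every field must be named, even ignored ones, so adding a field without deciding how the writer treats it becomes a compile error. A self-contained illustration of the pattern with stand-in types (not DataFusion's actual definitions):

```rust
// Stand-in for a config struct that mixes writer and reader options.
struct Options {
    compression: Option<String>,
    // Reader-only knob, like `max_predicate_cache_size` in this PR.
    max_predicate_cache_size: Option<usize>,
}

fn writer_settings(opts: &Options) -> String {
    // No `..` catch-all: if a new field is added to `Options`, this `let`
    // stops compiling until the field is listed, even if only as `_`.
    let Options {
        compression,
        max_predicate_cache_size: _, // deliberately unused by the writer
    } = opts;
    compression.clone().unwrap_or_else(|| "uncompressed".into())
}

fn main() {
    let opts = Options {
        compression: Some("zstd".into()),
        max_predicate_cache_size: Some(0),
    };
    println!("compression = {}", writer_settings(&opts));
}
```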