explanatory comments

parmesant · parmesant · commit 3f70c4ae291f · 2024-10-03T07:24:16.000+05:30
diff --git a/server/Cargo.toml b/server/Cargo.toml
@@ -15,9 +15,8 @@ arrow-array = { version = "53.0.0" }
 arrow-json = "53.0.0"
 arrow-ipc = { version = "53.0.0", features = ["zstd"] }
 arrow-select = "53.0.0"
-# datafusion = { git = "https://github.com/apache/datafusion.git", rev = "a64df83502821f18067fb4ff65dd217815b305c9" }
 datafusion = "42.0.0"
-object_store = { version = "0.11.0", features = ["cloud", "aws"] }  # cannot update object_store as datafusion has not caught up
+object_store = { version = "0.11.0", features = ["cloud", "aws"] }
 parquet = "53.0.0"
 arrow-flight = { version = "53.0.0", features = [ "tls" ] }
 tonic = {version = "0.12.1", features = ["tls", "transport", "gzip", "zstd"] }
diff --git a/server/src/cli.rs b/server/src/cli.rs
@@ -421,6 +421,9 @@ impl Cli {
                      .help("Set a fixed memory limit for query"),
              )
              .arg(
+                 // RowGroupSize controls the number of rows present in one row group
+                 // More rows = better compression but HIGHER memory consumption during read/write
+                 // 1048576 is the default value for DataFusion
                  Arg::new(Self::ROW_GROUP_SIZE)
                      .long(Self::ROW_GROUP_SIZE)
                      .env("P_PARQUET_ROW_GROUP_SIZE")
diff --git a/server/src/query.rs b/server/src/query.rs
@@ -86,9 +86,20 @@ impl Query {
             .with_prefer_existing_sort(true)
             .with_round_robin_repartition(true);
 
+        // For more details, refer to https://datafusion.apache.org/user-guide/configs.html
+
+        // Reduce the number of rows read (if possible)
         config.options_mut().execution.parquet.enable_page_index = true;
+
+        // Filter pushdown lets DF push the filters as far down in the plan as possible,
+        // thus reducing the number of rows decoded
         config.options_mut().execution.parquet.pushdown_filters = true;
+
+        // Reordering filters lets DF choose the filter order that minimizes the cost of filter evaluation
         config.options_mut().execution.parquet.reorder_filters = true;
+
+        // Enable StringViewArray
+        // https://www.influxdata.com/blog/faster-queries-with-stringview-part-one-influxdb/
         config
             .options_mut()
             .execution