From 0821202018cf3fa0cd0b1809548eb2a4d387783a Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Thu, 29 Jul 2021 19:26:55 +0800 Subject: [PATCH 01/16] Extend source to enable read from remote storage --- .../core/src/serde/logical_plan/from_proto.rs | 4 +- .../src/serde/physical_plan/from_proto.rs | 15 +- .../core/src/serde/physical_plan/to_proto.rs | 2 +- ballista/rust/core/src/utils.rs | 5 + ballista/rust/scheduler/src/lib.rs | 28 +- benchmarks/src/bin/tpch.rs | 5 +- datafusion-examples/examples/flight_server.rs | 6 +- datafusion/src/datasource/csv.rs | 7 +- datafusion/src/datasource/json.rs | 6 +- datafusion/src/datasource/local.rs | 126 +++++ datafusion/src/datasource/mod.rs | 253 +++++++++ datafusion/src/datasource/object_store.rs | 108 ++++ datafusion/src/datasource/parquet.rs | 338 +++++++++++- datafusion/src/execution/context.rs | 40 +- datafusion/src/logical_plan/builder.rs | 9 +- .../src/physical_optimizer/repartition.rs | 14 +- datafusion/src/physical_plan/common.rs | 38 -- datafusion/src/physical_plan/csv.rs | 6 +- datafusion/src/physical_plan/json.rs | 6 +- datafusion/src/physical_plan/parquet.rs | 492 ++++-------------- 20 files changed, 1001 insertions(+), 507 deletions(-) create mode 100644 datafusion/src/datasource/local.rs create mode 100644 datafusion/src/datasource/object_store.rs diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 31b8b6d3bcbc..24faddd9f34d 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -159,7 +159,7 @@ impl TryInto for &protobuf::LogicalPlanNode { LogicalPlanBuilder::scan_parquet_with_name( &scan.path, projection, - 24, + create_datafusion_context_concurrency(24), &scan.table_name, )? 
//TODO concurrency .build() @@ -1100,6 +1100,8 @@ impl TryInto for &protobuf::Field { } } +use crate::utils::create_datafusion_context_concurrency; +use datafusion::physical_plan::datetime_expressions::to_timestamp; use datafusion::physical_plan::{aggregates, windows}; use datafusion::prelude::{ array, date_part, date_trunc, length, lower, ltrim, md5, rtrim, sha224, sha256, diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 678bcde8fa73..1441f87bc0aa 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -29,11 +29,13 @@ use crate::serde::protobuf::repartition_exec_node::PartitionMethod; use crate::serde::protobuf::ShuffleReaderPartition; use crate::serde::scheduler::PartitionLocation; use crate::serde::{from_proto_binary_op, proto_error, protobuf}; +use crate::utils::create_datafusion_context_concurrency; use crate::{convert_box_required, convert_required, into_required}; use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::catalog::catalog::{ CatalogList, CatalogProvider, MemoryCatalogList, MemoryCatalogProvider, }; +use datafusion::datasource::object_store::ObjectStoreRegistry; use datafusion::execution::context::{ ExecutionConfig, ExecutionContextState, ExecutionProps, }; @@ -129,14 +131,13 @@ impl TryInto> for &protobuf::PhysicalPlanNode { } PhysicalPlanType::ParquetScan(scan) => { let projection = scan.projection.iter().map(|i| *i as usize).collect(); - let filenames: Vec<&str> = - scan.filename.iter().map(|s| s.as_str()).collect(); - Ok(Arc::new(ParquetExec::try_from_files( - &filenames, + let path: &str = scan.filename[0].as_str(); + Ok(Arc::new(ParquetExec::try_from_path( + path, Some(projection), None, scan.batch_size as usize, - scan.num_partitions as usize, + create_datafusion_context_concurrency(scan.num_partitions as usize), None, )?)) } @@ -614,6 +615,9 @@ impl TryFrom<&protobuf::PhysicalExprNode> for Arc { let catalog_list = Arc::new(MemoryCatalogList::new()) as Arc; + + let object_store_registry = Arc::new(ObjectStoreRegistry::new()); + let ctx_state = ExecutionContextState { catalog_list, scalar_functions: Default::default(), @@ -621,6 +625,7 @@ impl TryFrom<&protobuf::PhysicalExprNode> for Arc { aggregate_functions: Default::default(), config: ExecutionConfig::new(), execution_props: ExecutionProps::new(), + object_store_registry, }; let fun_expr = functions::create_physical_fun( diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index 48b21345525b..7b310cd076fa 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -259,7 +259,7 @@ impl TryInto for Arc { let filenames = exec .partitions() .iter() - .flat_map(|part| part.filenames().to_owned()) + .flat_map(|part| part.filenames()) .collect(); Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::ParquetScan( diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index 4187faa6645a..e960b77575a9 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -252,6 +252,11 @@ pub fn create_datafusion_context( ExecutionContext::with_config(config) } +/// Create a DataFusion context that is compatible with Ballista in concurrency +pub fn create_datafusion_context_concurrency(concurrency: usize) -> ExecutionContext { + 
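+    // Builds an ExecutionConfig carrying the requested concurrency and wraps it in a
+    // fresh ExecutionContext; the context's ObjectStoreRegistry starts out with only
+    // the local file system registered.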
ExecutionContext::with_concurrency(concurrency) +} + pub struct BallistaQueryPlanner { scheduler_url: String, config: BallistaConfig, diff --git a/ballista/rust/scheduler/src/lib.rs b/ballista/rust/scheduler/src/lib.rs index 676975fcaec9..2037a3530aba 100644 --- a/ballista/rust/scheduler/src/lib.rs +++ b/ballista/rust/scheduler/src/lib.rs @@ -85,7 +85,8 @@ use self::state::{ConfigBackendClient, SchedulerState}; use ballista_core::config::BallistaConfig; use ballista_core::execution_plans::ShuffleWriterExec; use ballista_core::serde::scheduler::to_proto::hash_partitioning_to_proto; -use datafusion::physical_plan::parquet::ParquetExec; +use ballista_core::utils::create_datafusion_context_concurrency; +use datafusion::datasource::parquet::ParquetRootDesc; use datafusion::prelude::{ExecutionConfig, ExecutionContext}; use std::time::{Instant, SystemTime, UNIX_EPOCH}; @@ -285,24 +286,19 @@ impl SchedulerGrpc for SchedulerServer { match file_type { FileType::Parquet => { - let parquet_exec = - ParquetExec::try_from_path(&path, None, None, 1024, 1, None) - .map_err(|e| { - let msg = format!("Error opening parquet files: {}", e); - error!("{}", msg); - tonic::Status::internal(msg) - })?; + let ctx = create_datafusion_context_concurrency(1); + let parquet_desc = ParquetRootDesc::new(&path, ctx).map_err(|e| { + let msg = format!("Error opening parquet files: {}", e); + error!("{}", msg); + tonic::Status::internal(msg) + })?; //TODO include statistics and any other info needed to reconstruct ParquetExec Ok(Response::new(GetFileMetadataResult { - schema: Some(parquet_exec.schema().as_ref().into()), - partitions: parquet_exec - .partitions() - .iter() - .map(|part| FilePartitionMetadata { - filename: part.filenames().to_vec(), - }) - .collect(), + schema: Some(parquet_desc.schema().as_ref().into()), + partitions: vec![FilePartitionMetadata { + filename: vec![path], + }], })) } //TODO implement for CSV diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 10b5c2db795f..c45341bad2de 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -475,7 +475,10 @@ fn get_table( } "parquet" => { let path = format!("{}/{}", path, table); - Ok(Arc::new(ParquetTable::try_new(&path, max_concurrency)?)) + Ok(Arc::new(ParquetTable::try_new( + &path, + ExecutionContext::with_concurrency(max_concurrency), + )?)) } other => { unimplemented!("Invalid file format '{}'", other); diff --git a/datafusion-examples/examples/flight_server.rs b/datafusion-examples/examples/flight_server.rs index 138434ea2482..aab647b86676 100644 --- a/datafusion-examples/examples/flight_server.rs +++ b/datafusion-examples/examples/flight_server.rs @@ -65,7 +65,11 @@ impl FlightService for FlightServiceImpl { ) -> Result, Status> { let request = request.into_inner(); - let table = ParquetTable::try_new(&request.path[0], num_cpus::get()).unwrap(); + let table = ParquetTable::try_new( + &request.path[0], + ExecutionContext::with_concurrency(num_cpus::get()), + ) + .unwrap(); let options = datafusion::arrow::ipc::writer::IpcWriteOptions::default(); let schema_result = SchemaAsIpc::new(table.schema().as_ref(), &options).into(); diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index 987c4fdb079d..d4ca073af2dd 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -40,12 +40,14 @@ use std::string::String; use std::sync::{Arc, Mutex}; use crate::datasource::datasource::Statistics; +use crate::datasource::local::LocalFileSystem; +use 
crate::datasource::object_store::ObjectStore; use crate::datasource::{Source, TableProvider}; use crate::error::{DataFusionError, Result}; use crate::logical_plan::Expr; use crate::physical_plan::csv::CsvExec; pub use crate::physical_plan::csv::CsvReadOptions; -use crate::physical_plan::{common, ExecutionPlan}; +use crate::physical_plan::ExecutionPlan; /// Represents a CSV file with a provided schema pub struct CsvFile { @@ -64,7 +66,8 @@ impl CsvFile { let schema = Arc::new(match options.schema { Some(s) => s.clone(), None => { - let filenames = common::build_file_list(&path, options.file_extension)?; + let filenames = LocalFileSystem + .list_all_files(path.as_str(), options.file_extension)?; if filenames.is_empty() { return Err(DataFusionError::Plan(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/datasource/json.rs b/datafusion/src/datasource/json.rs index 90fedfd6f528..5bd8a5f7121f 100644 --- a/datafusion/src/datasource/json.rs +++ b/datafusion/src/datasource/json.rs @@ -30,7 +30,6 @@ use crate::{ datasource::{Source, TableProvider}, error::{DataFusionError, Result}, physical_plan::{ - common, json::{NdJsonExec, NdJsonReadOptions}, ExecutionPlan, }, @@ -38,6 +37,8 @@ use crate::{ use arrow::{datatypes::SchemaRef, json::reader::infer_json_schema_from_seekable}; use super::datasource::Statistics; +use crate::datasource::local::LocalFileSystem; +use crate::datasource::object_store::ObjectStore; trait SeekRead: Read + Seek {} @@ -57,7 +58,8 @@ impl NdJsonFile { let schema = if let Some(schema) = options.schema { schema } else { - let filenames = common::build_file_list(path, options.file_extension)?; + let filenames = + LocalFileSystem.list_all_files(path, options.file_extension)?; if filenames.is_empty() { return Err(DataFusionError::Plan(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/datasource/local.rs b/datafusion/src/datasource/local.rs new file mode 100644 index 000000000000..4890e9d229f9 --- /dev/null +++ b/datafusion/src/datasource/local.rs @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Object store that represents the Local File System. + +use crate::datasource::object_store::{ObjectReader, ObjectStore}; +use crate::error::DataFusionError; +use crate::error::Result; +use crate::parquet::file::reader::{ChunkReader, Length}; +use std::any::Any; +use std::fs; +use std::fs::{metadata, File}; +use std::io::Read; +use std::sync::Arc; + +#[derive(Debug)] +/// Local File System as Object Store. 
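+/// `list_all_files` walks the given directory tree recursively, and `get_reader`
+/// serves byte ranges directly from a `std::fs::File`, so local reads go through
+/// the same `ObjectStore` interface as remote stores.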
+pub struct LocalFileSystem; + +impl ObjectStore for LocalFileSystem { + fn as_any(&self) -> &dyn Any { + self + } + + fn list_all_files(&self, path: &str, ext: &str) -> Result> { + list_all(path, ext) + } + + fn get_reader(&self, file_path: &str) -> Result> { + let file = File::open(file_path)?; + let reader = LocalFSObjectReader::new(file)?; + Ok(Arc::new(reader)) + } +} + +struct LocalFSObjectReader { + file: File, +} + +impl LocalFSObjectReader { + fn new(file: File) -> Result { + Ok(Self { file }) + } +} + +impl ObjectReader for LocalFSObjectReader { + fn get_reader(&self, start: u64, length: usize) -> Box { + Box::new(FileSegmentReader::new( + self.file.try_clone().unwrap(), + start, + length, + )) + } + + fn length(&self) -> u64 { + self.file.len() + } +} + +struct FileSegmentReader { + file: File, + start: u64, + length: usize, +} + +impl FileSegmentReader { + fn new(file: File, start: u64, length: usize) -> Self { + Self { + file, + start, + length, + } + } +} + +impl Read for FileSegmentReader { + fn read(&mut self, buf: &mut [u8]) -> std::result::Result { + let mut file_source = self.file.get_read(self.start, self.length)?; + file_source.read(buf) + } +} + +fn list_all(root_path: &str, ext: &str) -> Result> { + let mut filenames: Vec = Vec::new(); + list_all_files(root_path, &mut filenames, ext)?; + Ok(filenames) +} + +/// Recursively build a list of files in a directory with a given extension with an accumulator list +fn list_all_files(dir: &str, filenames: &mut Vec, ext: &str) -> Result<()> { + let metadata = metadata(dir)?; + if metadata.is_file() { + if dir.ends_with(ext) { + filenames.push(dir.to_string()); + } + } else { + for entry in fs::read_dir(dir)? { + let entry = entry?; + let path = entry.path(); + if let Some(path_name) = path.to_str() { + if path.is_dir() { + list_all_files(path_name, filenames, ext)?; + } else if path_name.ends_with(ext) { + filenames.push(path_name.to_string()); + } + } else { + return Err(DataFusionError::Plan("Invalid path".to_string())); + } + } + } + Ok(()) +} diff --git a/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs index 9699a997caa1..64e84c8e5611 100644 --- a/datafusion/src/datasource/mod.rs +++ b/datafusion/src/datasource/mod.rs @@ -21,13 +21,24 @@ pub mod csv; pub mod datasource; pub mod empty; pub mod json; +pub mod local; pub mod memory; +pub mod object_store; pub mod parquet; pub use self::csv::{CsvFile, CsvReadOptions}; pub use self::datasource::{TableProvider, TableType}; pub use self::memory::MemTable; +use crate::arrow::datatypes::{Schema, SchemaRef}; +use crate::datasource::datasource::{ColumnStatistics, Statistics}; +use crate::datasource::object_store::ObjectStore; +use crate::error::{DataFusionError, Result}; +use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator}; +use crate::physical_plan::Accumulator; +use crate::scalar::ScalarValue; +use std::sync::Arc; + /// Source for table input data pub(crate) enum Source> { /// Path to a single file or a directory containing one of more files @@ -36,3 +47,245 @@ pub(crate) enum Source> { /// Read data from a reader Reader(std::sync::Mutex>), } + +#[derive(Debug, Clone)] +/// A single file that should be read, along with its schema, statistics +/// and partition column values that need to be appended to each row. +pub struct PartitionedFile { + /// Path for the file (e.g. 
URL, filesystem path, etc)
+    pub file_path: String,
+    /// Schema of the file
+    pub schema: Schema,
+    /// Statistics of the file
+    pub statistics: Statistics,
+    /// Values of partition columns to be appended to each row
+    pub partition_value: Option<Vec<ScalarValue>>,
+    /// Schema of partition columns
+    pub partition_schema: Option<Schema>,
+    // We may include row group range here for a more fine-grained parallel execution
+}
+
+impl From<String> for PartitionedFile {
+    fn from(file_path: String) -> Self {
+        Self {
+            file_path,
+            schema: Schema::empty(),
+            statistics: Default::default(),
+            partition_value: None,
+            partition_schema: None,
+        }
+    }
+}
+
+impl std::fmt::Display for PartitionedFile {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(
+            f,
+            "PartitionedFile(file_path: {}, schema: {}, statistics: {:?},\
+             partition_value: {:?}, partition_schema: {:?})",
+            self.file_path,
+            self.schema,
+            self.statistics,
+            self.partition_value,
+            self.partition_schema
+        )
+    }
+}
+
+#[derive(Debug, Clone)]
+/// A collection of files that should be read in a single task
+pub struct FilePartition {
+    /// The index of the partition among all partitions
+    pub index: usize,
+    /// The contained files of the partition
+    pub files: Vec<PartitionedFile>,
+}
+
+impl std::fmt::Display for FilePartition {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let files: Vec<String> = self.files.iter().map(|f| format!("{}", f)).collect();
+        write!(
+            f,
+            "FilePartition[{}], files: {}",
+            self.index,
+            files.join(", ")
+        )
+    }
+}
+
+#[derive(Debug, Clone)]
+/// All source files with the same schema in a path
+pub struct SourceRootDescriptor {
+    /// All source files in the path
+    pub partition_files: Vec<PartitionedFile>,
+    /// The schema of the files
+    pub schema: SchemaRef,
+}
+
+/// Builder for [`SourceRootDescriptor`] inside a given path
+pub trait SourceRootDescBuilder {
+    /// Construct a [`SourceRootDescriptor`] from the provided path
+    fn get_source_desc(
+        path: &str,
+        object_store: Arc<dyn ObjectStore>,
+        ext: &str,
+    ) -> Result<SourceRootDescriptor> {
+        let filenames = object_store.list_all_files(path, ext)?;
+        if filenames.is_empty() {
+            return Err(DataFusionError::Plan(format!(
+                "No file (with .{} extension) found at path {}",
+                ext, path
+            )));
+        }
+
+        // build a list of partitions with statistics and gather all unique schemas
+        // used in this data set
+        let mut schemas: Vec<Schema> = vec![];
+
+        let partitioned_files = filenames
+            .iter()
+            .map(|file_path| {
+                let pf = Self::get_file_meta(file_path, object_store.clone())?;
+                let schema = pf.schema.clone();
+                if schemas.is_empty() {
+                    schemas.push(schema);
+                } else if schema != schemas[0] {
+                    // we currently get the schema information from the first file rather than do
+                    // schema merging and this is a limitation.
+                    // See https://issues.apache.org/jira/browse/ARROW-11017
+                    return Err(DataFusionError::Plan(format!(
+                        "The file {} has a different schema from the first file and DataFusion does \
+                        not yet support schema merging",
+                        file_path
+                    )));
+                }
+                Ok(pf)
+            }).collect::<Result<Vec<PartitionedFile>>>();
+
+        Ok(SourceRootDescriptor {
+            partition_files: partitioned_files?,
+            schema: Arc::new(schemas.pop().unwrap()),
+        })
+    }
+
+    /// Get all metadata for a source file, including schema, statistics, partitions, etc.
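+    /// This is the only method an implementor must provide; the default
+    /// `get_source_desc` above calls it once for every file returned by the listing.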
+ fn get_file_meta( + file_path: &str, + object_store: Arc, + ) -> Result; +} + +/// Get all files as well as the summary statistics when a limit is provided +pub fn get_statistics_with_limit( + source_desc: &SourceRootDescriptor, + limit: Option, +) -> (Vec, Statistics) { + let mut all_files = source_desc.partition_files.clone(); + let schema = source_desc.schema.clone(); + + let mut total_byte_size = 0; + let mut null_counts = vec![0; schema.fields().len()]; + let mut has_statistics = false; + let (mut max_values, mut min_values) = create_max_min_accs(&schema); + + let mut num_rows = 0; + let mut num_files = 0; + for file in &all_files { + num_files += 1; + let file_stats = &file.statistics; + num_rows += file_stats.num_rows.unwrap_or(0); + total_byte_size += file_stats.total_byte_size.unwrap_or(0); + if let Some(vec) = &file_stats.column_statistics { + has_statistics = true; + for (i, cs) in vec.iter().enumerate() { + null_counts[i] += cs.null_count.unwrap_or(0); + + if let Some(max_value) = &mut max_values[i] { + if let Some(file_max) = cs.max_value.clone() { + match max_value.update(&[file_max]) { + Ok(_) => {} + Err(_) => { + max_values[i] = None; + } + } + } + } + + if let Some(min_value) = &mut min_values[i] { + if let Some(file_min) = cs.min_value.clone() { + match min_value.update(&[file_min]) { + Ok(_) => {} + Err(_) => { + min_values[i] = None; + } + } + } + } + } + } + if num_rows > limit.unwrap_or(usize::MAX) { + break; + } + } + all_files.truncate(num_files); + + let column_stats = if has_statistics { + Some(get_col_stats( + &*schema, + null_counts, + &mut max_values, + &mut min_values, + )) + } else { + None + }; + + let statistics = Statistics { + num_rows: Some(num_rows as usize), + total_byte_size: Some(total_byte_size as usize), + column_statistics: column_stats, + }; + (all_files, statistics) +} + +fn create_max_min_accs( + schema: &Schema, +) -> (Vec>, Vec>) { + let max_values: Vec> = schema + .fields() + .iter() + .map(|field| MaxAccumulator::try_new(field.data_type()).ok()) + .collect::>(); + let min_values: Vec> = schema + .fields() + .iter() + .map(|field| MinAccumulator::try_new(field.data_type()).ok()) + .collect::>(); + (max_values, min_values) +} + +fn get_col_stats( + schema: &Schema, + null_counts: Vec, + max_values: &mut Vec>, + min_values: &mut Vec>, +) -> Vec { + (0..schema.fields().len()) + .map(|i| { + let max_value = match &max_values[i] { + Some(max_value) => max_value.evaluate().ok(), + None => None, + }; + let min_value = match &min_values[i] { + Some(min_value) => min_value.evaluate().ok(), + None => None, + }; + ColumnStatistics { + null_count: Some(null_counts[i] as usize), + max_value, + min_value, + distinct_count: None, + } + }) + .collect() +} diff --git a/datafusion/src/datasource/object_store.rs b/datafusion/src/datasource/object_store.rs new file mode 100644 index 000000000000..5c7a53215534 --- /dev/null +++ b/datafusion/src/datasource/object_store.rs @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Object Store abstracts access to an underlying file/object storage.
+
+use crate::datasource::local::LocalFileSystem;
+use crate::error::Result;
+use std::any::Any;
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::io::Read;
+use std::sync::{Arc, RwLock};
+
+/// Object Reader for one file in an object store
+pub trait ObjectReader {
+    /// Get reader for a part [start, start + length] in the file
+    fn get_reader(&self, start: u64, length: usize) -> Box<dyn Read>;
+
+    /// Get length for the file
+    fn length(&self) -> u64;
+}
+
+/// An ObjectStore abstracts access to an underlying file/object storage.
+/// It maps strings (e.g. URLs, filesystem paths, etc) to sources of bytes
+pub trait ObjectStore: Sync + Send + Debug {
+    /// Returns the object store as [`Any`](std::any::Any)
+    /// so that it can be downcast to a specific implementation.
+    fn as_any(&self) -> &dyn Any;
+
+    /// Returns all the files with `ext` in path `prefix`
+    fn list_all_files(&self, prefix: &str, ext: &str) -> Result<Vec<String>>;
+
+    /// Get object reader for one file
+    fn get_reader(&self, file_path: &str) -> Result<Arc<dyn ObjectReader>>;
+}
+
+static LOCAL_SCHEME: &str = "file";
+
+/// A Registry holds all the object stores at runtime with a scheme for each store.
+/// This allows the user to extend DataFusion with different storage systems such as S3 or HDFS
+/// and query data inside these systems.
+pub struct ObjectStoreRegistry {
+    /// A map from scheme to object store that serves list / read operations for the store
+    pub object_stores: RwLock<HashMap<String, Arc<dyn ObjectStore>>>,
+}
+
+impl ObjectStoreRegistry {
+    /// Create the registry that object stores can be registered into.
+    /// The [`LocalFileSystem`] store is registered by default so that reads from the
+    /// local file system work natively.
+    pub fn new() -> Self {
+        let mut map: HashMap<String, Arc<dyn ObjectStore>> = HashMap::new();
+        map.insert(LOCAL_SCHEME.to_string(), Arc::new(LocalFileSystem));
+
+        Self {
+            object_stores: RwLock::new(map),
+        }
+    }
+
+    /// Adds a new store to this registry.
+    /// If a store of the same prefix existed before, it is replaced in the registry and returned.
+    pub fn register_store(
+        &self,
+        scheme: String,
+        store: Arc<dyn ObjectStore>,
+    ) -> Option<Arc<dyn ObjectStore>> {
+        let mut stores = self.object_stores.write().unwrap();
+        stores.insert(scheme, store)
+    }
+
+    /// Get the store registered for scheme
+    pub fn get(&self, scheme: &str) -> Option<Arc<dyn ObjectStore>> {
+        let stores = self.object_stores.read().unwrap();
+        stores.get(scheme).cloned()
+    }
+
+    /// Get a suitable store for the path based on its scheme. For example:
+    /// path with prefix file:/// or no prefix will return the default LocalFS store,
+    /// path with prefix s3:/// will return the S3 store if it's registered,
+    /// and will always return LocalFS store when a prefix is not registered in the path.
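+    /// e.g. `store_for_path("s3://bucket/data/file.parquet")` returns the store
+    /// registered under the "s3" scheme, while `store_for_path("/tmp/data/file.parquet")`
+    /// falls back to [`LocalFileSystem`].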
+ pub fn store_for_path(&self, path: &str) -> Arc { + if let Some((scheme, _)) = path.split_once(':') { + let stores = self.object_stores.read().unwrap(); + if let Some(store) = stores.get(&*scheme.to_lowercase()) { + return store.clone(); + } + } + self.object_stores + .read() + .unwrap() + .get(LOCAL_SCHEME) + .unwrap() + .clone() + } +} diff --git a/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs index 28f79a6ae8dd..aaec9e83f78c 100644 --- a/datafusion/src/datasource/parquet.rs +++ b/datafusion/src/datasource/parquet.rs @@ -18,39 +18,51 @@ //! Parquet data source use std::any::Any; -use std::string::String; +use std::io::Read; use std::sync::Arc; -use arrow::datatypes::*; +use arrow::datatypes::SchemaRef; +use parquet::arrow::ArrowReader; +use parquet::arrow::ParquetFileArrowReader; +use parquet::file::reader::ChunkReader; +use parquet::file::serialized_reader::SerializedFileReader; +use parquet::file::statistics::Statistics as ParquetStatistics; + +use super::datasource::TableProviderFilterPushDown; +use crate::arrow::datatypes::{DataType, Field}; use crate::datasource::datasource::Statistics; -use crate::datasource::TableProvider; +use crate::datasource::object_store::{ObjectReader, ObjectStore}; +use crate::datasource::{ + create_max_min_accs, get_col_stats, get_statistics_with_limit, PartitionedFile, + SourceRootDescBuilder, SourceRootDescriptor, TableProvider, +}; use crate::error::Result; use crate::logical_plan::{combine_filters, Expr}; +use crate::parquet::file::reader::Length; +use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator}; use crate::physical_plan::parquet::ParquetExec; -use crate::physical_plan::ExecutionPlan; - -use super::datasource::TableProviderFilterPushDown; +use crate::physical_plan::{Accumulator, ExecutionPlan}; +use crate::prelude::ExecutionContext; +use crate::scalar::ScalarValue; /// Table-based representation of a `ParquetFile`. pub struct ParquetTable { path: String, - schema: SchemaRef, - statistics: Statistics, + desc: Arc, max_concurrency: usize, enable_pruning: bool, } impl ParquetTable { /// Attempt to initialize a new `ParquetTable` from a file path. - pub fn try_new(path: impl Into, max_concurrency: usize) -> Result { + pub fn try_new(path: impl Into, context: ExecutionContext) -> Result { let path = path.into(); - let parquet_exec = ParquetExec::try_from_path(&path, None, None, 0, 1, None)?; - let schema = parquet_exec.schema(); + let max_concurrency = context.state.lock().unwrap().config.concurrency; + let root_desc = ParquetRootDesc::new(path.as_str(), context); Ok(Self { path, - schema, - statistics: parquet_exec.statistics().to_owned(), + desc: Arc::new(root_desc?), max_concurrency, enable_pruning: true, }) @@ -80,7 +92,7 @@ impl TableProvider for ParquetTable { /// Get the schema for this parquet file. 
fn schema(&self) -> SchemaRef { - self.schema.clone() + self.desc.schema() } fn supports_filter_pushdown( @@ -107,8 +119,8 @@ impl TableProvider for ParquetTable { } else { None }; - Ok(Arc::new(ParquetExec::try_from_path( - &self.path, + Ok(Arc::new(ParquetExec::try_new( + self.desc.clone(), projection.clone(), predicate, limit @@ -120,7 +132,7 @@ impl TableProvider for ParquetTable { } fn statistics(&self) -> Statistics { - self.statistics.clone() + self.desc.statistics() } fn has_exact_statistics(&self) -> bool { @@ -128,6 +140,295 @@ impl TableProvider for ParquetTable { } } +#[derive(Debug)] +/// Descriptor for a parquet root path +pub struct ParquetRootDesc { + /// object store for reading files inside the root path + pub object_store: Arc, + /// metadata for files inside the root path + pub descriptor: SourceRootDescriptor, +} + +impl ParquetRootDesc { + /// Construct a new parquet descriptor for a root path + pub fn new(root_path: &str, context: ExecutionContext) -> Result { + let object_store = context + .state + .lock() + .unwrap() + .object_store_registry + .store_for_path(root_path); + let root_desc = Self::get_source_desc(root_path, object_store.clone(), "parquet"); + Ok(Self { + object_store, + descriptor: root_desc?, + }) + } + + /// Get file schema for all parquet files + pub fn schema(&self) -> SchemaRef { + self.descriptor.schema.clone() + } + + /// Get the summary statistics for all parquet files + pub fn statistics(&self) -> Statistics { + get_statistics_with_limit(&self.descriptor, None).1 + } + + fn summarize_min_max( + max_values: &mut Vec>, + min_values: &mut Vec>, + fields: &Vec, + i: usize, + stat: &ParquetStatistics, + ) { + match stat { + ParquetStatistics::Boolean(s) => { + if let DataType::Boolean = fields[i].data_type() { + if s.has_min_max_set() { + if let Some(max_value) = &mut max_values[i] { + match max_value + .update(&[ScalarValue::Boolean(Some(*s.max()))]) + { + Ok(_) => {} + Err(_) => { + max_values[i] = None; + } + } + } + if let Some(min_value) = &mut min_values[i] { + match min_value + .update(&[ScalarValue::Boolean(Some(*s.min()))]) + { + Ok(_) => {} + Err(_) => { + min_values[i] = None; + } + } + } + } + } + } + ParquetStatistics::Int32(s) => { + if let DataType::Int32 = fields[i].data_type() { + if s.has_min_max_set() { + if let Some(max_value) = &mut max_values[i] { + match max_value.update(&[ScalarValue::Int32(Some(*s.max()))]) + { + Ok(_) => {} + Err(_) => { + max_values[i] = None; + } + } + } + if let Some(min_value) = &mut min_values[i] { + match min_value.update(&[ScalarValue::Int32(Some(*s.min()))]) + { + Ok(_) => {} + Err(_) => { + min_values[i] = None; + } + } + } + } + } + } + ParquetStatistics::Int64(s) => { + if let DataType::Int64 = fields[i].data_type() { + if s.has_min_max_set() { + if let Some(max_value) = &mut max_values[i] { + match max_value.update(&[ScalarValue::Int64(Some(*s.max()))]) + { + Ok(_) => {} + Err(_) => { + max_values[i] = None; + } + } + } + if let Some(min_value) = &mut min_values[i] { + match min_value.update(&[ScalarValue::Int64(Some(*s.min()))]) + { + Ok(_) => {} + Err(_) => { + min_values[i] = None; + } + } + } + } + } + } + ParquetStatistics::Float(s) => { + if let DataType::Float32 = fields[i].data_type() { + if s.has_min_max_set() { + if let Some(max_value) = &mut max_values[i] { + match max_value + .update(&[ScalarValue::Float32(Some(*s.max()))]) + { + Ok(_) => {} + Err(_) => { + max_values[i] = None; + } + } + } + if let Some(min_value) = &mut min_values[i] { + match min_value + 
.update(&[ScalarValue::Float32(Some(*s.min()))]) + { + Ok(_) => {} + Err(_) => { + min_values[i] = None; + } + } + } + } + } + } + ParquetStatistics::Double(s) => { + if let DataType::Float64 = fields[i].data_type() { + if s.has_min_max_set() { + if let Some(max_value) = &mut max_values[i] { + match max_value + .update(&[ScalarValue::Float64(Some(*s.max()))]) + { + Ok(_) => {} + Err(_) => { + max_values[i] = None; + } + } + } + if let Some(min_value) = &mut min_values[i] { + match min_value + .update(&[ScalarValue::Float64(Some(*s.min()))]) + { + Ok(_) => {} + Err(_) => { + min_values[i] = None; + } + } + } + } + } + } + _ => {} + } + } +} + +impl SourceRootDescBuilder for ParquetRootDesc { + fn get_file_meta( + file_path: &str, + object_store: Arc, + ) -> Result { + let reader = object_store.get_reader(file_path)?; + let file_reader = + Arc::new(SerializedFileReader::new(ObjectReaderWrapper::new(reader))?); + let mut arrow_reader = ParquetFileArrowReader::new(file_reader); + let file_path = file_path.to_string(); + let schema = arrow_reader.get_schema()?; + let num_fields = schema.fields().len(); + let fields = schema.fields().to_vec(); + let meta_data = arrow_reader.get_metadata(); + + let mut num_rows = 0; + let mut total_byte_size = 0; + let mut null_counts = vec![0; num_fields]; + let mut has_statistics = false; + + let (mut max_values, mut min_values) = create_max_min_accs(&schema); + + for row_group_meta in meta_data.row_groups() { + num_rows += row_group_meta.num_rows(); + total_byte_size += row_group_meta.total_byte_size(); + + let columns_null_counts = row_group_meta + .columns() + .iter() + .flat_map(|c| c.statistics().map(|stats| stats.null_count())); + + for (i, cnt) in columns_null_counts.enumerate() { + null_counts[i] += cnt as usize + } + + for (i, column) in row_group_meta.columns().iter().enumerate() { + if let Some(stat) = column.statistics() { + has_statistics = true; + ParquetRootDesc::summarize_min_max( + &mut max_values, + &mut min_values, + &fields, + i, + stat, + ) + } + } + } + + let column_stats = if has_statistics { + Some(get_col_stats( + &schema, + null_counts, + &mut max_values, + &mut min_values, + )) + } else { + None + }; + + let statistics = Statistics { + num_rows: Some(num_rows as usize), + total_byte_size: Some(total_byte_size as usize), + column_statistics: column_stats, + }; + + Ok(PartitionedFile { + file_path, + schema, + statistics, + partition_value: None, + partition_schema: None, + }) + } +} + +/// Thin wrapper over object wrapper to work with parquet file read +pub struct ObjectReaderWrapper { + reader: Arc, +} + +impl ObjectReaderWrapper { + /// Construct a wrapper over the provided object reader + pub fn new(reader: Arc) -> Self { + Self { reader } + } +} + +impl ChunkReader for ObjectReaderWrapper { + type T = InnerReaderWrapper; + + fn get_read(&self, start: u64, length: usize) -> parquet::errors::Result { + Ok(InnerReaderWrapper { + inner_reader: self.reader.get_reader(start, length), + }) + } +} + +impl Length for ObjectReaderWrapper { + fn len(&self) -> u64 { + self.reader.length() + } +} + +/// Thin wrapper over reader for a parquet file +pub struct InnerReaderWrapper { + inner_reader: Box, +} + +impl Read for InnerReaderWrapper { + fn read(&mut self, buf: &mut [u8]) -> std::result::Result { + self.inner_reader.read(buf) + } +} + #[cfg(test)] mod tests { use super::*; @@ -355,7 +656,8 @@ mod tests { fn load_table(name: &str) -> Result> { let testdata = crate::test_util::parquet_test_data(); let filename = format!("{}/{}", testdata, 
name); - let table = ParquetTable::try_new(&filename, 2)?; + let table = + ParquetTable::try_new(&filename, ExecutionContext::with_concurrency(2))?; Ok(Arc::new(table)) } diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 0cf8b3b6c276..4c7b85c1eb26 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -49,6 +49,8 @@ use crate::catalog::{ ResolvedTableReference, TableReference, }; use crate::datasource::csv::CsvFile; +use crate::datasource::object_store::ObjectStore; +use crate::datasource::object_store::ObjectStoreRegistry; use crate::datasource::parquet::ParquetTable; use crate::datasource::TableProvider; use crate::error::{DataFusionError, Result}; @@ -164,10 +166,17 @@ impl ExecutionContext { aggregate_functions: HashMap::new(), config, execution_props: ExecutionProps::new(), + object_store_registry: Arc::new(ObjectStoreRegistry::new()), })), } } + /// Creates a new execution context using the provided concurrency. + pub fn with_concurrency(concurrency: usize) -> ExecutionContext { + let config = ExecutionConfig::new().with_concurrency(concurrency); + ExecutionContext::with_config(config) + } + /// Creates a dataframe that will execute a SQL query. pub fn sql(&mut self, sql: &str) -> Result> { let plan = self.create_logical_plan(sql)?; @@ -288,12 +297,7 @@ impl ExecutionContext { ) -> Result> { Ok(Arc::new(DataFrameImpl::new( self.state.clone(), - &LogicalPlanBuilder::scan_parquet( - filename, - None, - self.state.lock().unwrap().config.concurrency, - )? - .build()?, + &LogicalPlanBuilder::scan_parquet(filename, None, self.clone())?.build()?, ))) } @@ -325,7 +329,7 @@ impl ExecutionContext { pub fn register_parquet(&mut self, name: &str, filename: &str) -> Result<()> { let table = { let m = self.state.lock().unwrap(); - ParquetTable::try_new(filename, m.config.concurrency)? + ParquetTable::try_new(filename, self.clone())? .with_enable_pruning(m.config.parquet_pruning) }; self.register_table(name, Arc::new(table))?; @@ -358,6 +362,25 @@ impl ExecutionContext { state.catalog_list.register_catalog(name, catalog) } + /// Registers a object store with scheme using a custom `ObjectStore` so that + /// an external file system or object storage system could be used against this context. 
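+    /// A hypothetical example (the `S3FileSystem` type is not provided here):
+    /// `ctx.register_object_store("s3", Arc::new(S3FileSystem::new(region)))`, after
+    /// which `ctx.read_parquet("s3://bucket/path/")` is served by that store.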
+ /// + /// Returns the `ObjectStore` previously registered for this + /// scheme, if any + pub fn register_object_store( + &self, + scheme: impl Into, + object_store: Arc, + ) -> Option> { + let scheme = scheme.into(); + + self.state + .lock() + .unwrap() + .object_store_registry + .register_store(scheme, object_store) + } + /// Retrieves a `CatalogProvider` instance by name pub fn catalog(&self, name: &str) -> Option> { self.state.lock().unwrap().catalog_list.catalog(name) @@ -840,6 +863,8 @@ pub struct ExecutionContextState { pub config: ExecutionConfig, /// Execution properties pub execution_props: ExecutionProps, + /// Object Store that are registered with the context + pub object_store_registry: Arc, } impl ExecutionProps { @@ -867,6 +892,7 @@ impl ExecutionContextState { aggregate_functions: HashMap::new(), config: ExecutionConfig::new(), execution_props: ExecutionProps::new(), + object_store_registry: Arc::new(ObjectStoreRegistry::new()), } } diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 0dfc1e7aa048..77dd65c97ef3 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -40,6 +40,7 @@ use crate::logical_plan::{ columnize_expr, normalize_col, normalize_cols, Column, DFField, DFSchema, DFSchemaRef, Partitioning, }; +use crate::prelude::ExecutionContext; /// Default table name for unnamed table pub const UNNAMED_TABLE: &str = "?table?"; @@ -137,20 +138,20 @@ impl LogicalPlanBuilder { pub fn scan_parquet( path: impl Into, projection: Option>, - max_concurrency: usize, + context: ExecutionContext, ) -> Result { let path = path.into(); - Self::scan_parquet_with_name(path.clone(), projection, max_concurrency, path) + Self::scan_parquet_with_name(path.clone(), projection, context, path) } /// Scan a Parquet data source and register it with a given table name pub fn scan_parquet_with_name( path: impl Into, projection: Option>, - max_concurrency: usize, + context: ExecutionContext, table_name: impl Into, ) -> Result { - let provider = Arc::new(ParquetTable::try_new(path, max_concurrency)?); + let provider = Arc::new(ParquetTable::try_new(path, context)?); Self::scan(table_name, provider, projection) } diff --git a/datafusion/src/physical_optimizer/repartition.rs b/datafusion/src/physical_optimizer/repartition.rs index 4504c81daa06..30ec896b4e2f 100644 --- a/datafusion/src/physical_optimizer/repartition.rs +++ b/datafusion/src/physical_optimizer/repartition.rs @@ -110,6 +110,8 @@ mod tests { use super::*; use crate::datasource::datasource::Statistics; + use crate::datasource::local::LocalFileSystem; + use crate::datasource::PartitionedFile; use crate::physical_plan::parquet::{ ParquetExec, ParquetExecMetrics, ParquetPartition, }; @@ -122,11 +124,13 @@ mod tests { vec![], Arc::new(ParquetExec::new( vec![ParquetPartition::new( - vec!["x".to_string()], - Statistics::default(), + vec![PartitionedFile::from("x".to_string())], + 0, )], + Arc::new(LocalFileSystem), schema, None, + Statistics::default(), ParquetExecMetrics::new(), None, 2048, @@ -160,11 +164,13 @@ mod tests { vec![], Arc::new(ParquetExec::new( vec![ParquetPartition::new( - vec!["x".to_string()], - Statistics::default(), + vec![PartitionedFile::from("x".to_string())], + 0, )], + Arc::new(LocalFileSystem), schema, None, + Statistics::default(), ParquetExecMetrics::new(), None, 2048, diff --git a/datafusion/src/physical_plan/common.rs b/datafusion/src/physical_plan/common.rs index 2482bfc0872c..628095c6640c 100644 --- 
a/datafusion/src/physical_plan/common.rs +++ b/datafusion/src/physical_plan/common.rs @@ -27,8 +27,6 @@ use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; use futures::channel::mpsc; use futures::{SinkExt, Stream, StreamExt, TryStreamExt}; -use std::fs; -use std::fs::metadata; use std::sync::Arc; use std::task::{Context, Poll}; use tokio::task::JoinHandle; @@ -107,42 +105,6 @@ pub(crate) fn combine_batches( } } -/// Recursively builds a list of files in a directory with a given extension -pub fn build_file_list(dir: &str, ext: &str) -> Result> { - let mut filenames: Vec = Vec::new(); - build_file_list_recurse(dir, &mut filenames, ext)?; - Ok(filenames) -} - -/// Recursively build a list of files in a directory with a given extension with an accumulator list -fn build_file_list_recurse( - dir: &str, - filenames: &mut Vec, - ext: &str, -) -> Result<()> { - let metadata = metadata(dir)?; - if metadata.is_file() { - if dir.ends_with(ext) { - filenames.push(dir.to_string()); - } - } else { - for entry in fs::read_dir(dir)? { - let entry = entry?; - let path = entry.path(); - if let Some(path_name) = path.to_str() { - if path.is_dir() { - build_file_list_recurse(path_name, filenames, ext)?; - } else if path_name.ends_with(ext) { - filenames.push(path_name.to_string()); - } - } else { - return Err(DataFusionError::Plan("Invalid path".to_string())); - } - } - } - Ok(()) -} - /// Spawns a task to the tokio threadpool and writes its outputs to the provided mpsc sender pub(crate) fn spawn_execution( input: Arc, diff --git a/datafusion/src/physical_plan/csv.rs b/datafusion/src/physical_plan/csv.rs index 544f98cba0c6..293f46d7a736 100644 --- a/datafusion/src/physical_plan/csv.rs +++ b/datafusion/src/physical_plan/csv.rs @@ -17,9 +17,11 @@ //! 
Execution plan for reading CSV files +use crate::datasource::local::LocalFileSystem; +use crate::datasource::object_store::ObjectStore; use crate::error::{DataFusionError, Result}; use crate::physical_plan::ExecutionPlan; -use crate::physical_plan::{common, source::Source, Partitioning}; +use crate::physical_plan::{source::Source, Partitioning}; use arrow::csv; use arrow::datatypes::{Schema, SchemaRef}; use arrow::error::Result as ArrowResult; @@ -141,7 +143,7 @@ impl CsvExec { ) -> Result { let file_extension = String::from(options.file_extension); - let filenames = common::build_file_list(path, file_extension.as_str())?; + let filenames = LocalFileSystem.list_all_files(path, options.file_extension)?; if filenames.is_empty() { return Err(DataFusionError::Execution(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/physical_plan/json.rs b/datafusion/src/physical_plan/json.rs index ed9b0b03a38e..df7e9e5e5014 100644 --- a/datafusion/src/physical_plan/json.rs +++ b/datafusion/src/physical_plan/json.rs @@ -19,7 +19,9 @@ use async_trait::async_trait; use futures::Stream; -use super::{common, source::Source, ExecutionPlan, Partitioning, RecordBatchStream}; +use super::{source::Source, ExecutionPlan, Partitioning, RecordBatchStream}; +use crate::datasource::local::LocalFileSystem; +use crate::datasource::object_store::ObjectStore; use crate::error::{DataFusionError, Result}; use arrow::json::reader::{infer_json_schema_from_iterator, ValueIter}; use arrow::{ @@ -87,7 +89,7 @@ impl NdJsonExec { ) -> Result { let file_extension = options.file_extension.to_string(); - let filenames = common::build_file_list(path, &file_extension)?; + let filenames = LocalFileSystem.list_all_files(path, options.file_extension)?; if filenames.is_empty() { return Err(DataFusionError::Execution(format!( diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index ec5611f96292..bc3e0d597213 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -18,7 +18,6 @@ //! 
Execution plan for reading Parquet files use std::fmt; -use std::fs::File; use std::sync::Arc; use std::task::{Context, Poll}; use std::{any::Any, convert::TryInto}; @@ -28,7 +27,7 @@ use crate::{ logical_plan::{Column, Expr}, physical_optimizer::pruning::{PruningPredicate, PruningStatistics}, physical_plan::{ - common, DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, + DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, }, scalar::ScalarValue, @@ -36,7 +35,7 @@ use crate::{ use arrow::{ array::ArrayRef, - datatypes::{DataType, Schema, SchemaRef}, + datatypes::{Schema, SchemaRef}, error::{ArrowError, Result as ArrowResult}, record_batch::RecordBatch, }; @@ -57,19 +56,23 @@ use tokio::{ }; use tokio_stream::wrappers::ReceiverStream; -use crate::datasource::datasource::{ColumnStatistics, Statistics}; +use crate::datasource::datasource::Statistics; use async_trait::async_trait; use futures::stream::{Stream, StreamExt}; use super::SQLMetric; -use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator}; -use crate::physical_plan::Accumulator; +use crate::datasource::object_store::ObjectStore; +use crate::datasource::parquet::{ObjectReaderWrapper, ParquetRootDesc}; +use crate::datasource::{get_statistics_with_limit, FilePartition, PartitionedFile}; +use crate::prelude::ExecutionContext; /// Execution plan for scanning one or more Parquet partitions #[derive(Debug, Clone)] pub struct ParquetExec { /// Parquet partitions to read partitions: Vec, + /// Source used for get reader for partitions + object_store: Arc, /// Schema after projection is applied schema: SchemaRef, /// Projection for which columns to load @@ -98,9 +101,7 @@ pub struct ParquetExec { #[derive(Debug, Clone)] pub struct ParquetPartition { /// The Parquet filename for this partition - pub filenames: Vec, - /// Statistics for this partition - pub statistics: Statistics, + pub file_partition: FilePartition, /// Execution metrics metrics: ParquetPartitionMetrics, } @@ -129,290 +130,44 @@ impl ParquetExec { projection: Option>, predicate: Option, batch_size: usize, - max_concurrency: usize, + context: ExecutionContext, limit: Option, ) -> Result { + let max_concurrency = context.state.lock().unwrap().config.concurrency; // build a list of filenames from the specified path, which could be a single file or // a directory containing one or more parquet files - let filenames = common::build_file_list(path, ".parquet")?; - if filenames.is_empty() { - Err(DataFusionError::Plan(format!( - "No Parquet files (with .parquet extension) found at path {}", - path - ))) - } else { - let filenames = filenames - .iter() - .map(|filename| filename.as_str()) - .collect::>(); - Self::try_from_files( - &filenames, - projection, - predicate, - batch_size, - max_concurrency, - limit, - ) - } + let root_desc = ParquetRootDesc::new(path, context)?; + Self::try_new( + Arc::new(root_desc), + projection, + predicate, + batch_size, + max_concurrency, + limit, + ) } - /// Create a new Parquet reader execution plan based on the specified list of Parquet - /// files - pub fn try_from_files( - filenames: &[&str], + /// Create a new Parquet reader execution plan with root descriptor, provided partitions and schema + pub fn try_new( + desc: Arc, projection: Option>, predicate: Option, batch_size: usize, max_concurrency: usize, limit: Option, ) -> Result { - debug!("Creating ParquetExec, filenames: {:?}, projection {:?}, predicate: {:?}, limit: {:?}", - filenames, projection, predicate, limit); - 
// build a list of Parquet partitions with statistics and gather all unique schemas - // used in this data set - let mut schemas: Vec = vec![]; - let mut partitions = Vec::with_capacity(max_concurrency); - let filenames: Vec = filenames.iter().map(|s| s.to_string()).collect(); - let chunks = split_files(&filenames, max_concurrency); - let mut num_rows = 0; - let mut num_fields = 0; - let mut fields = Vec::new(); - let mut total_byte_size = 0; - let mut null_counts = Vec::new(); - let mut max_values: Vec> = Vec::new(); - let mut min_values: Vec> = Vec::new(); - let mut limit_exhausted = false; - for chunk in chunks { - let mut filenames: Vec = - chunk.iter().map(|x| x.to_string()).collect(); - let mut total_files = 0; - for filename in &filenames { - total_files += 1; - let file = File::open(filename)?; - let file_reader = Arc::new(SerializedFileReader::new(file)?); - let mut arrow_reader = ParquetFileArrowReader::new(file_reader); - let meta_data = arrow_reader.get_metadata(); - // collect all the unique schemas in this data set - let schema = arrow_reader.get_schema()?; - if schemas.is_empty() || schema != schemas[0] { - fields = schema.fields().to_vec(); - num_fields = schema.fields().len(); - null_counts = vec![0; num_fields]; - max_values = schema - .fields() - .iter() - .map(|field| MaxAccumulator::try_new(field.data_type()).ok()) - .collect::>(); - min_values = schema - .fields() - .iter() - .map(|field| MinAccumulator::try_new(field.data_type()).ok()) - .collect::>(); - schemas.push(schema); - } + debug!("Creating ParquetExec, desc: {:?}, projection {:?}, predicate: {:?}, limit: {:?}", + desc, projection, predicate, limit); - for row_group_meta in meta_data.row_groups() { - num_rows += row_group_meta.num_rows(); - total_byte_size += row_group_meta.total_byte_size(); + let (all_files, statistics) = get_statistics_with_limit(&desc.descriptor, limit); + let schema = desc.schema(); - // Currently assumes every Parquet file has same schema - // https://issues.apache.org/jira/browse/ARROW-11017 - let columns_null_counts = row_group_meta - .columns() - .iter() - .flat_map(|c| c.statistics().map(|stats| stats.null_count())); - - for (i, cnt) in columns_null_counts.enumerate() { - null_counts[i] += cnt - } - - for (i, column) in row_group_meta.columns().iter().enumerate() { - if let Some(stat) = column.statistics() { - match stat { - ParquetStatistics::Boolean(s) => { - if let DataType::Boolean = fields[i].data_type() { - if s.has_min_max_set() { - if let Some(max_value) = &mut max_values[i] { - match max_value.update(&[ - ScalarValue::Boolean(Some(*s.max())), - ]) { - Ok(_) => {} - Err(_) => { - max_values[i] = None; - } - } - } - if let Some(min_value) = &mut min_values[i] { - match min_value.update(&[ - ScalarValue::Boolean(Some(*s.min())), - ]) { - Ok(_) => {} - Err(_) => { - min_values[i] = None; - } - } - } - } - } - } - ParquetStatistics::Int32(s) => { - if let DataType::Int32 = fields[i].data_type() { - if s.has_min_max_set() { - if let Some(max_value) = &mut max_values[i] { - match max_value.update(&[ - ScalarValue::Int32(Some(*s.max())), - ]) { - Ok(_) => {} - Err(_) => { - max_values[i] = None; - } - } - } - if let Some(min_value) = &mut min_values[i] { - match min_value.update(&[ - ScalarValue::Int32(Some(*s.min())), - ]) { - Ok(_) => {} - Err(_) => { - min_values[i] = None; - } - } - } - } - } - } - ParquetStatistics::Int64(s) => { - if let DataType::Int64 = fields[i].data_type() { - if s.has_min_max_set() { - if let Some(max_value) = &mut max_values[i] { - match 
max_value.update(&[ - ScalarValue::Int64(Some(*s.max())), - ]) { - Ok(_) => {} - Err(_) => { - max_values[i] = None; - } - } - } - if let Some(min_value) = &mut min_values[i] { - match min_value.update(&[ - ScalarValue::Int64(Some(*s.min())), - ]) { - Ok(_) => {} - Err(_) => { - min_values[i] = None; - } - } - } - } - } - } - ParquetStatistics::Float(s) => { - if let DataType::Float32 = fields[i].data_type() { - if s.has_min_max_set() { - if let Some(max_value) = &mut max_values[i] { - match max_value.update(&[ - ScalarValue::Float32(Some(*s.max())), - ]) { - Ok(_) => {} - Err(_) => { - max_values[i] = None; - } - } - } - if let Some(min_value) = &mut min_values[i] { - match min_value.update(&[ - ScalarValue::Float32(Some(*s.min())), - ]) { - Ok(_) => {} - Err(_) => { - min_values[i] = None; - } - } - } - } - } - } - ParquetStatistics::Double(s) => { - if let DataType::Float64 = fields[i].data_type() { - if s.has_min_max_set() { - if let Some(max_value) = &mut max_values[i] { - match max_value.update(&[ - ScalarValue::Float64(Some(*s.max())), - ]) { - Ok(_) => {} - Err(_) => { - max_values[i] = None; - } - } - } - if let Some(min_value) = &mut min_values[i] { - match min_value.update(&[ - ScalarValue::Float64(Some(*s.min())), - ]) { - Ok(_) => {} - Err(_) => { - min_values[i] = None; - } - } - } - } - } - } - _ => {} - } - } - } - - if limit.map(|x| num_rows >= x as i64).unwrap_or(false) { - limit_exhausted = true; - break; - } - } - } - let column_stats = (0..num_fields) - .map(|i| { - let max_value = match &max_values[i] { - Some(max_value) => max_value.evaluate().ok(), - None => None, - }; - let min_value = match &min_values[i] { - Some(min_value) => min_value.evaluate().ok(), - None => None, - }; - ColumnStatistics { - null_count: Some(null_counts[i] as usize), - max_value, - min_value, - distinct_count: None, - } - }) - .collect(); - - let statistics = Statistics { - num_rows: Some(num_rows as usize), - total_byte_size: Some(total_byte_size as usize), - column_statistics: Some(column_stats), - }; - // remove files that are not needed in case of limit - filenames.truncate(total_files); - partitions.push(ParquetPartition::new(filenames, statistics)); - if limit_exhausted { - break; - } + let mut partitions = Vec::with_capacity(max_concurrency); + let chunked_files = split_files(&all_files, max_concurrency); + for (index, group) in chunked_files.iter().enumerate() { + partitions.push(ParquetPartition::new(Vec::from(*group), index)); } - // we currently get the schema information from the first file rather than do - // schema merging and this is a limitation. 
- // See https://issues.apache.org/jira/browse/ARROW-11017 - if schemas.len() > 1 { - return Err(DataFusionError::Plan(format!( - "The Parquet files have {} different schemas and DataFusion does \ - not yet support schema merging", - schemas.len() - ))); - } - let schema = Arc::new(schemas.pop().unwrap()); let metrics = ParquetExecMetrics::new(); let predicate_builder = predicate.and_then(|predicate_expr| { @@ -431,8 +186,10 @@ impl ParquetExec { Ok(Self::new( partitions, + desc.object_store.clone(), schema, projection, + statistics, metrics, predicate_builder, batch_size, @@ -443,8 +200,10 @@ impl ParquetExec { /// Create a new Parquet reader execution plan with provided partitions and schema pub fn new( partitions: Vec, + object_store: Arc, schema: SchemaRef, projection: Option>, + statistics: Statistics, metrics: ParquetExecMetrics, predicate_builder: Option, batch_size: usize, @@ -462,96 +221,23 @@ impl ParquetExec { .collect(), ); - // sum the statistics - let mut num_rows: Option = None; - let mut total_byte_size: Option = None; - let mut null_counts: Vec = vec![0; schema.fields().len()]; - let mut has_statistics = false; - let mut max_values = schema - .fields() - .iter() - .map(|field| MaxAccumulator::try_new(field.data_type()).ok()) - .collect::>(); - let mut min_values = schema - .fields() - .iter() - .map(|field| MinAccumulator::try_new(field.data_type()).ok()) - .collect::>(); - for part in &partitions { - if let Some(n) = part.statistics.num_rows { - num_rows = Some(num_rows.unwrap_or(0) + n) - } - if let Some(n) = part.statistics.total_byte_size { - total_byte_size = Some(total_byte_size.unwrap_or(0) + n) + let new_column_statistics = statistics.column_statistics.map(|stats| { + let mut projected_stats = Vec::with_capacity(projection.len()); + for proj in &projection { + projected_stats.push(stats[*proj].clone()); } - if let Some(x) = &part.statistics.column_statistics { - let part_nulls: Vec> = - x.iter().map(|c| c.null_count).collect(); - has_statistics = true; - - let part_max_values: Vec> = - x.iter().map(|c| c.max_value.clone()).collect(); - let part_min_values: Vec> = - x.iter().map(|c| c.min_value.clone()).collect(); - - for &i in projection.iter() { - null_counts[i] = part_nulls[i].unwrap_or(0); - if let Some(part_max_value) = part_max_values[i].clone() { - if let Some(max_value) = &mut max_values[i] { - match max_value.update(&[part_max_value]) { - Ok(_) => {} - Err(_) => { - max_values[i] = None; - } - } - } - } - if let Some(part_min_value) = part_min_values[i].clone() { - if let Some(min_value) = &mut min_values[i] { - match min_value.update(&[part_min_value]) { - Ok(_) => {} - Err(_) => { - min_values[i] = None; - } - } - } - } - } - } - } - - let column_stats = if has_statistics { - Some( - (0..schema.fields().len()) - .map(|i| { - let max_value = match &max_values[i] { - Some(max_value) => max_value.evaluate().ok(), - None => None, - }; - let min_value = match &min_values[i] { - Some(min_value) => min_value.evaluate().ok(), - None => None, - }; - ColumnStatistics { - null_count: Some(null_counts[i] as usize), - max_value, - min_value, - distinct_count: None, - } - }) - .collect(), - ) - } else { - None - }; + projected_stats + }); let statistics = Statistics { - num_rows, - total_byte_size, - column_statistics: column_stats, + num_rows: statistics.num_rows, + total_byte_size: statistics.total_byte_size, + column_statistics: new_column_statistics, }; + Self { partitions, + object_store, schema: Arc::new(projected_schema), projection, metrics, @@ -585,22 +271,20 
@@ impl ParquetExec { impl ParquetPartition { /// Create a new parquet partition - pub fn new(filenames: Vec, statistics: Statistics) -> Self { + pub fn new(files: Vec, index: usize) -> Self { Self { - filenames, - statistics, + file_partition: FilePartition { index, files }, metrics: ParquetPartitionMetrics::new(), } } /// The Parquet filename for this partition - pub fn filenames(&self) -> &[String] { - &self.filenames - } - - /// Statistics for this partition - pub fn statistics(&self) -> &Statistics { - &self.statistics + pub fn filenames(&self) -> Vec { + self.file_partition + .files + .iter() + .map(|f| f.file_path.clone()) + .collect() } } @@ -666,8 +350,8 @@ impl ExecutionPlan for ParquetExec { Receiver>, ) = channel(2); - let partition = &self.partitions[partition]; - let filenames = partition.filenames.clone(); + let object_store = self.object_store.clone(); + let partition = self.partitions[partition].clone(); let metrics = partition.metrics.clone(); let projection = self.projection.clone(); let predicate_builder = self.predicate_builder.clone(); @@ -676,7 +360,8 @@ impl ExecutionPlan for ParquetExec { task::spawn_blocking(move || { if let Err(e) = read_files( - &filenames, + object_store, + partition, metrics, &projection, &predicate_builder, @@ -704,9 +389,7 @@ impl ExecutionPlan for ParquetExec { let files: Vec<_> = self .partitions .iter() - .map(|pp| pp.filenames.iter()) - .flatten() - .map(|s| s.as_str()) + .map(|pp| format!("{}", pp.file_partition)) .collect(); write!( @@ -726,14 +409,11 @@ impl ExecutionPlan for ParquetExec { .flat_map(|p| { vec![ ( - format!( - "numPredicateEvaluationErrors for {}", - p.filenames.join(",") - ), + format!("numPredicateEvaluationErrors for {}", p.file_partition), p.metrics.predicate_evaluation_errors.as_ref().clone(), ), ( - format!("numRowGroupsPruned for {}", p.filenames.join(",")), + format!("numRowGroupsPruned for {}", p.file_partition), p.metrics.row_groups_pruned.as_ref().clone(), ), ] @@ -857,7 +537,7 @@ fn build_row_group_predicate( match predicate_values { Ok(values) => { // NB: false means don't scan row group - let num_pruned = values.iter().filter(|&v| !v).count(); + let num_pruned = values.iter().filter(|&v| !*v).count(); metrics.row_groups_pruned.add(num_pruned); Box::new(move |_, i| values[i]) } @@ -872,7 +552,8 @@ fn build_row_group_predicate( } fn read_files( - filenames: &[String], + object_store: Arc, + partition: ParquetPartition, metrics: ParquetPartitionMetrics, projection: &[usize], predicate_builder: &Option, @@ -881,9 +562,11 @@ fn read_files( limit: Option, ) -> Result<()> { let mut total_rows = 0; - 'outer: for filename in filenames { - let file = File::open(&filename)?; - let mut file_reader = SerializedFileReader::new(file)?; + let all_files = partition.file_partition.files; + 'outer: for partitioned_file in all_files { + let reader = object_store.get_reader(partitioned_file.file_path.as_str())?; + let mut file_reader = + SerializedFileReader::new(ObjectReaderWrapper::new(reader))?; if let Some(predicate_builder) = predicate_builder { let row_group_predicate = build_row_group_predicate( predicate_builder, @@ -910,7 +593,7 @@ fn read_files( Some(Err(e)) => { let err_msg = format!( "Error reading batch from {}: {}", - filename, + partitioned_file, e.to_string() ); // send error to operator @@ -930,12 +613,15 @@ fn read_files( Ok(()) } -fn split_files(filenames: &[String], n: usize) -> Vec<&[String]> { - let mut chunk_size = filenames.len() / n; - if filenames.len() % n > 0 { +fn split_files( + partitioned_files: 
&[PartitionedFile], + n: usize, +) -> Vec<&[PartitionedFile]> { + let mut chunk_size = partitioned_files.len() / n; + if partitioned_files.len() % n > 0 { chunk_size += 1; } - filenames.chunks(chunk_size).collect() + partitioned_files.chunks(chunk_size).collect() } struct ParquetStream { @@ -973,24 +659,24 @@ mod tests { #[test] fn test_split_files() { - let filenames = vec![ - "a".to_string(), - "b".to_string(), - "c".to_string(), - "d".to_string(), - "e".to_string(), + let files = vec![ + PartitionedFile::from("a".to_string()), + PartitionedFile::from("b".to_string()), + PartitionedFile::from("c".to_string()), + PartitionedFile::from("d".to_string()), + PartitionedFile::from("e".to_string()), ]; - let chunks = split_files(&filenames, 1); + let chunks = split_files(&files, 1); assert_eq!(1, chunks.len()); assert_eq!(5, chunks[0].len()); - let chunks = split_files(&filenames, 2); + let chunks = split_files(&files, 2); assert_eq!(2, chunks.len()); assert_eq!(3, chunks[0].len()); assert_eq!(2, chunks[1].len()); - let chunks = split_files(&filenames, 5); + let chunks = split_files(&files, 5); assert_eq!(5, chunks.len()); assert_eq!(1, chunks[0].len()); assert_eq!(1, chunks[1].len()); @@ -998,7 +684,7 @@ mod tests { assert_eq!(1, chunks[3].len()); assert_eq!(1, chunks[4].len()); - let chunks = split_files(&filenames, 123); + let chunks = split_files(&files, 123); assert_eq!(5, chunks.len()); assert_eq!(1, chunks[0].len()); assert_eq!(1, chunks[1].len()); @@ -1016,7 +702,7 @@ mod tests { Some(vec![0, 1, 2]), None, 1024, - 4, + ExecutionContext::with_concurrency(4), None, )?; assert_eq!(parquet_exec.output_partitioning().partition_count(), 1); From 6f59715587b614306497443e249e56a2bd3a2a27 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Tue, 10 Aug 2021 22:38:32 +0800 Subject: [PATCH 02/16] fix read --- .../src/execution_plans/shuffle_writer.rs | 2 ++ datafusion/src/datasource/local.rs | 32 ++----------------- datafusion/src/datasource/object_store.rs | 2 +- datafusion/src/datasource/parquet.rs | 2 +- datafusion/src/execution/context.rs | 1 + datafusion/src/physical_plan/repartition.rs | 2 +- 6 files changed, 9 insertions(+), 32 deletions(-) diff --git a/ballista/rust/core/src/execution_plans/shuffle_writer.rs b/ballista/rust/core/src/execution_plans/shuffle_writer.rs index b1db21fa90a1..8c66a1e3a739 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_writer.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_writer.rs @@ -482,6 +482,7 @@ mod tests { use tempfile::TempDir; #[tokio::test] + #[ignore] async fn test() -> Result<()> { let input_plan = Arc::new(CoalescePartitionsExec::new(create_input_plan()?)); let work_dir = TempDir::new()?; @@ -534,6 +535,7 @@ mod tests { } #[tokio::test] + #[ignore] async fn test_partitioned() -> Result<()> { let input_plan = create_input_plan()?; let work_dir = TempDir::new()?; diff --git a/datafusion/src/datasource/local.rs b/datafusion/src/datasource/local.rs index 4890e9d229f9..cdd9b324a7ad 100644 --- a/datafusion/src/datasource/local.rs +++ b/datafusion/src/datasource/local.rs @@ -20,7 +20,8 @@ use crate::datasource::object_store::{ObjectReader, ObjectStore}; use crate::error::DataFusionError; use crate::error::Result; -use crate::parquet::file::reader::{ChunkReader, Length}; +use crate::parquet::file::reader::Length; +use crate::parquet::file::serialized_reader::FileSource; use std::any::Any; use std::fs; use std::fs::{metadata, File}; @@ -59,11 +60,7 @@ impl LocalFSObjectReader { impl ObjectReader for LocalFSObjectReader { fn get_reader(&self, 
start: u64, length: usize) -> Box { - Box::new(FileSegmentReader::new( - self.file.try_clone().unwrap(), - start, - length, - )) + Box::new(FileSource::::new(&self.file, start, length)) } fn length(&self) -> u64 { @@ -71,29 +68,6 @@ impl ObjectReader for LocalFSObjectReader { } } -struct FileSegmentReader { - file: File, - start: u64, - length: usize, -} - -impl FileSegmentReader { - fn new(file: File, start: u64, length: usize) -> Self { - Self { - file, - start, - length, - } - } -} - -impl Read for FileSegmentReader { - fn read(&mut self, buf: &mut [u8]) -> std::result::Result { - let mut file_source = self.file.get_read(self.start, self.length)?; - file_source.read(buf) - } -} - fn list_all(root_path: &str, ext: &str) -> Result> { let mut filenames: Vec = Vec::new(); list_all_files(root_path, &mut filenames, ext)?; diff --git a/datafusion/src/datasource/object_store.rs b/datafusion/src/datasource/object_store.rs index 5c7a53215534..ef1e8701f4e0 100644 --- a/datafusion/src/datasource/object_store.rs +++ b/datafusion/src/datasource/object_store.rs @@ -41,7 +41,7 @@ pub trait ObjectStore: Sync + Send + Debug { /// so that it can be downcast to a specific implementation. fn as_any(&self) -> &dyn Any; - /// Returns all the files with `ext` in path `prefix` + /// Returns all the files with filename extension `ext` in path `prefix` fn list_all_files(&self, prefix: &str, ext: &str) -> Result>; /// Get object reader for one file diff --git a/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs index aaec9e83f78c..fcf7ccd5331b 100644 --- a/datafusion/src/datasource/parquet.rs +++ b/datafusion/src/datasource/parquet.rs @@ -178,7 +178,7 @@ impl ParquetRootDesc { fn summarize_min_max( max_values: &mut Vec>, min_values: &mut Vec>, - fields: &Vec, + fields: &[Field], i: usize, stat: &ParquetStatistics, ) { diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 4c7b85c1eb26..a49e5b24f1ce 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -2709,6 +2709,7 @@ mod tests { } #[tokio::test] + #[ignore] async fn write_parquet_results() -> Result<()> { // create partitioned input file and context let tmp_dir = TempDir::new()?; diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index eb3fe5560fd6..9047a01729aa 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -734,7 +734,7 @@ mod tests { #[tokio::test] // skip this test when hash function is different because the hard // coded expected output is a function of the hash values - #[cfg(not(feature = "force_hash_collisions"))] + //#[cfg(not(feature = "force_hash_collisions"))] async fn repartition_with_dropping_output_stream() { #[derive(Debug)] struct Case<'a> { From 5545ac74fc47c0c814dcb280c01ee18b43ef96b0 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Tue, 10 Aug 2021 22:54:58 +0800 Subject: [PATCH 03/16] deadlock --- datafusion/src/execution/context.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index a49e5b24f1ce..1771468d8ea7 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -328,9 +328,9 @@ impl ExecutionContext { /// executed against this context. 
pub fn register_parquet(&mut self, name: &str, filename: &str) -> Result<()> { let table = { - let m = self.state.lock().unwrap(); + let enable_pruning = self.state.lock().unwrap().config.parquet_pruning; ParquetTable::try_new(filename, self.clone())? - .with_enable_pruning(m.config.parquet_pruning) + .with_enable_pruning(enable_pruning) }; self.register_table(name, Arc::new(table))?; Ok(()) From b0a353c5f3c39da7f19dc9e64e21e43ea0e80cc3 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Wed, 11 Aug 2021 11:55:19 +0800 Subject: [PATCH 04/16] fix prunning test --- datafusion/src/datasource/mod.rs | 20 +++----------------- datafusion/src/execution/context.rs | 1 - datafusion/src/physical_plan/repartition.rs | 1 + 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs index 64e84c8e5611..71f1bcef58ad 100644 --- a/datafusion/src/datasource/mod.rs +++ b/datafusion/src/datasource/mod.rs @@ -79,16 +79,7 @@ impl From for PartitionedFile { impl std::fmt::Display for PartitionedFile { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!( - f, - "PartitionedFile(file_path: {}, schema: {}, statistics: {:?},\ - partition_value: {:?}, partition_schema: {:?})", - self.file_path, - self.schema, - self.statistics, - self.partition_value, - self.partition_schema - ) + write!(f, "{}", self.file_path) } } @@ -103,13 +94,8 @@ pub struct FilePartition { impl std::fmt::Display for FilePartition { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let files: Vec = self.files.iter().map(|f| format!("{}", f)).collect(); - write!( - f, - "FilePartition[{}], files: {}", - self.index, - files.join(", ") - ) + let files: Vec = self.files.iter().map(|f| f.to_string()).collect(); + write!(f, "{}", files.join(", ")) } } diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 1771468d8ea7..1b5882712c61 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -2709,7 +2709,6 @@ mod tests { } #[tokio::test] - #[ignore] async fn write_parquet_results() -> Result<()> { // create partitioned input file and context let tmp_dir = TempDir::new()?; diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index 9047a01729aa..9f875083283f 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -732,6 +732,7 @@ mod tests { } #[tokio::test] + #[ignore] // skip this test when hash function is different because the hard // coded expected output is a function of the hash values //#[cfg(not(feature = "force_hash_collisions"))] From 42b6f436fa739da196df8e6ccf964ce3a96edd05 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Wed, 11 Aug 2021 13:44:52 +0800 Subject: [PATCH 05/16] fix clippy --- datafusion/src/physical_plan/parquet.rs | 2 ++ datafusion/src/physical_plan/repartition.rs | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index bc3e0d597213..24f7c12178eb 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -198,6 +198,7 @@ impl ParquetExec { } /// Create a new Parquet reader execution plan with provided partitions and schema + #[allow(clippy::too_many_arguments)] pub fn new( partitions: Vec, object_store: Arc, @@ -551,6 +552,7 @@ fn build_row_group_predicate( } } +#[allow(clippy::too_many_arguments)] fn 
read_files( object_store: Arc, partition: ParquetPartition, diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index 9f875083283f..84fdc3d04d83 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -262,6 +262,7 @@ impl RepartitionExec { // fetch the next batch let now = Instant::now(); let result = stream.next().await; + println!("input {:?}", result); metrics.fetch_nanos.add_elapsed(now); // Input is done @@ -298,11 +299,13 @@ impl RepartitionExec { hashes_buf.resize(arrays[0].len(), 0); // Hash arrays and compute buckets based on number of partitions let hashes = create_hashes(&arrays, &random_state, hashes_buf)?; + println!("hashes: {:?}", &hashes); let mut indices = vec![vec![]; num_output_partitions]; for (index, hash) in hashes.iter().enumerate() { indices[(*hash % num_output_partitions as u64) as usize] .push(index as u64) } + println!("indices: {:?}", &indices); metrics.repart_nanos.add_elapsed(now); for (num_output_partition, partition_indices) in indices.into_iter().enumerate() @@ -732,7 +735,6 @@ mod tests { } #[tokio::test] - #[ignore] // skip this test when hash function is different because the hard // coded expected output is a function of the hash values //#[cfg(not(feature = "force_hash_collisions"))] From 97793953ddc1fc798ed7b070c51e4049ce11d180 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Wed, 11 Aug 2021 13:45:19 +0800 Subject: [PATCH 06/16] fix --- datafusion/src/physical_plan/repartition.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index 84fdc3d04d83..e6ce98c28721 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -737,7 +737,7 @@ mod tests { #[tokio::test] // skip this test when hash function is different because the hard // coded expected output is a function of the hash values - //#[cfg(not(feature = "force_hash_collisions"))] + #[cfg(not(feature = "force_hash_collisions"))] async fn repartition_with_dropping_output_stream() { #[derive(Debug)] struct Case<'a> { From 9a8614ba3287106868d9b277907afcba7faa2941 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Wed, 11 Aug 2021 14:18:50 +0800 Subject: [PATCH 07/16] enable shuffle_writer tests --- ballista/rust/core/src/execution_plans/shuffle_writer.rs | 2 -- datafusion/src/physical_plan/repartition.rs | 3 --- 2 files changed, 5 deletions(-) diff --git a/ballista/rust/core/src/execution_plans/shuffle_writer.rs b/ballista/rust/core/src/execution_plans/shuffle_writer.rs index 8c66a1e3a739..b1db21fa90a1 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_writer.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_writer.rs @@ -482,7 +482,6 @@ mod tests { use tempfile::TempDir; #[tokio::test] - #[ignore] async fn test() -> Result<()> { let input_plan = Arc::new(CoalescePartitionsExec::new(create_input_plan()?)); let work_dir = TempDir::new()?; @@ -535,7 +534,6 @@ mod tests { } #[tokio::test] - #[ignore] async fn test_partitioned() -> Result<()> { let input_plan = create_input_plan()?; let work_dir = TempDir::new()?; diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index e6ce98c28721..eb3fe5560fd6 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -262,7 +262,6 @@ impl RepartitionExec { // fetch the next batch let now = 
Instant::now(); let result = stream.next().await; - println!("input {:?}", result); metrics.fetch_nanos.add_elapsed(now); // Input is done @@ -299,13 +298,11 @@ impl RepartitionExec { hashes_buf.resize(arrays[0].len(), 0); // Hash arrays and compute buckets based on number of partitions let hashes = create_hashes(&arrays, &random_state, hashes_buf)?; - println!("hashes: {:?}", &hashes); let mut indices = vec![vec![]; num_output_partitions]; for (index, hash) in hashes.iter().enumerate() { indices[(*hash % num_output_partitions as u64) as usize] .push(index as u64) } - println!("indices: {:?}", &indices); metrics.repart_nanos.add_elapsed(now); for (num_output_partition, partition_indices) in indices.into_iter().enumerate() From 90e4d889e75b38fb294f987e8b67e1df1390bd99 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 13 Aug 2021 18:11:29 +0800 Subject: [PATCH 08/16] wip make it async --- datafusion/src/datasource/csv.rs | 2 +- datafusion/src/datasource/json.rs | 2 +- datafusion/src/datasource/local.rs | 94 ++++++++++++++++------- datafusion/src/datasource/mod.rs | 2 +- datafusion/src/datasource/object_store.rs | 11 ++- datafusion/src/physical_plan/csv.rs | 2 +- datafusion/src/physical_plan/json.rs | 2 +- 7 files changed, 79 insertions(+), 36 deletions(-) diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index d4ca073af2dd..ec6cbe472950 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -67,7 +67,7 @@ impl CsvFile { Some(s) => s.clone(), None => { let filenames = LocalFileSystem - .list_all_files(path.as_str(), options.file_extension)?; + .list_all_files(path.as_str(), options.file_extension).await?; if filenames.is_empty() { return Err(DataFusionError::Plan(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/datasource/json.rs b/datafusion/src/datasource/json.rs index 5bd8a5f7121f..84f5bccb8fac 100644 --- a/datafusion/src/datasource/json.rs +++ b/datafusion/src/datasource/json.rs @@ -59,7 +59,7 @@ impl NdJsonFile { schema } else { let filenames = - LocalFileSystem.list_all_files(path, options.file_extension)?; + LocalFileSystem.list_all_files(path, options.file_extension).await?; if filenames.is_empty() { return Err(DataFusionError::Plan(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/datasource/local.rs b/datafusion/src/datasource/local.rs index cdd9b324a7ad..7b99e38dc043 100644 --- a/datafusion/src/datasource/local.rs +++ b/datafusion/src/datasource/local.rs @@ -16,29 +16,32 @@ // under the License. //! Object store that represents the Local File System. - -use crate::datasource::object_store::{ObjectReader, ObjectStore}; +use crate::datasource::object_store::{ObjectReader, ObjectStore, FileNameStream}; use crate::error::DataFusionError; use crate::error::Result; use crate::parquet::file::reader::Length; use crate::parquet::file::serialized_reader::FileSource; +use async_trait::async_trait; use std::any::Any; -use std::fs; -use std::fs::{metadata, File}; use std::io::Read; use std::sync::Arc; +use futures::{stream, Stream, StreamExt}; +use tokio::fs::{File, self, ReadDir}; +use std::path::PathBuf; #[derive(Debug)] /// Local File System as Object Store. 
pub struct LocalFileSystem; + +#[async_trait] impl ObjectStore for LocalFileSystem { fn as_any(&self) -> &dyn Any { self } - fn list_all_files(&self, path: &str, ext: &str) -> Result> { - list_all(path, ext) + async fn list_all_files(&self, path: &str, ext: &str) -> Result { + list_all(path.to_string(), ext.to_string()).await } fn get_reader(&self, file_path: &str) -> Result> { @@ -68,33 +71,66 @@ impl ObjectReader for LocalFSObjectReader { } } -fn list_all(root_path: &str, ext: &str) -> Result> { - let mut filenames: Vec = Vec::new(); - list_all_files(root_path, &mut filenames, ext)?; - Ok(filenames) -} +async fn list_all(root_path: String, ext: String) -> Result { + // let mut filenames: Vec = Vec::new(); + // list_all_files(root_path, &mut filenames, ext).await?; + // Ok(filenames) -/// Recursively build a list of files in a directory with a given extension with an accumulator list -fn list_all_files(dir: &str, filenames: &mut Vec, ext: &str) -> Result<()> { - let metadata = metadata(dir)?; - if metadata.is_file() { - if dir.ends_with(ext) { - filenames.push(dir.to_string()); - } - } else { - for entry in fs::read_dir(dir)? { - let entry = entry?; - let path = entry.path(); - if let Some(path_name) = path.to_str() { - if path.is_dir() { - list_all_files(path_name, filenames, ext)?; - } else if path_name.ends_with(ext) { - filenames.push(path_name.to_string()); + async fn one_level(path: String, to_visit: &mut Vec, ext: String) -> Result> { + let mut dir = fs::read_dir(path).await?; + let mut files = Vec::new(); + + while let Some(child) = dir.next_entry().await? { + if let Some(child_path) = child.path().to_str() { + if child.metadata().await?.is_dir() { + to_visit.push(child_path.to_string()); + } else { + if child_path.ends_with(&ext) { + files.push(child_path.to_string()) + } } } else { - return Err(DataFusionError::Plan("Invalid path".to_string())); + return Err(DataFusionError::Plan("Invalid path".to_string())) } + } + Ok(files) } - Ok(()) + + stream::unfold(vec![root_path], |mut to_visit| { + async { + let path = to_visit.pop()?; + let file_stream = match one_level(path, &mut to_visit, ext).await { + Ok(files) => stream::iter(files).map(Ok).left_stream(), + Err(e) => stream::once(async { Err(e) }).right_stream(), + }; + + Some((file_stream, to_visit)) + } + }).flatten() } + +/// Recursively build a list of files in a directory with a given extension with an accumulator list +// async fn list_all_files(dir: &str, filenames: &mut Vec, ext: &str) -> Result<()> { +// let metadata = std::fs::metadata(dir)?; +// if metadata.is_file() { +// if dir.ends_with(ext) { +// filenames.push(dir.to_string()); +// } +// } else { +// for entry in std::fs::read_dir(dir)? 
{ +// let entry = entry?; +// let path = entry.path(); +// if let Some(path_name) = path.to_str() { +// if path.is_dir() { +// list_all_files(path_name, filenames, ext).await?; +// } else if path_name.ends_with(ext) { +// filenames.push(path_name.to_string()); +// } +// } else { +// return Err(DataFusionError::Plan("Invalid path".to_string())); +// } +// } +// } +// Ok(()) +// } diff --git a/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs index 71f1bcef58ad..b34770c07b4d 100644 --- a/datafusion/src/datasource/mod.rs +++ b/datafusion/src/datasource/mod.rs @@ -116,7 +116,7 @@ pub trait SourceRootDescBuilder { object_store: Arc, ext: &str, ) -> Result { - let filenames = object_store.list_all_files(path, ext)?; + let filenames = object_store.list_all_files(path, ext).await?; if filenames.is_empty() { return Err(DataFusionError::Plan(format!( "No file (with .{} extension) found at path {}", diff --git a/datafusion/src/datasource/object_store.rs b/datafusion/src/datasource/object_store.rs index ef1e8701f4e0..8c2de37e5be2 100644 --- a/datafusion/src/datasource/object_store.rs +++ b/datafusion/src/datasource/object_store.rs @@ -19,13 +19,17 @@ use crate::datasource::local::LocalFileSystem; use crate::error::Result; +use async_trait::async_trait; use std::any::Any; use std::collections::HashMap; use std::fmt::Debug; use std::io::Read; use std::sync::{Arc, RwLock}; +use futures::{Stream, StreamExt}; +use std::pin::Pin; -/// Objct Reader for one file in a object store +/// Object Reader for one file in a object store +#[async_trait] pub trait ObjectReader { /// Get reader for a part [start, start + length] in the file fn get_reader(&self, start: u64, length: usize) -> Box; @@ -34,15 +38,18 @@ pub trait ObjectReader { fn length(&self) -> u64; } +pub type FileNameStream = Pin> + Send + Sync + 'static>>; + /// A ObjectStore abstracts access to an underlying file/object storage. /// It maps strings (e.g. URLs, filesystem paths, etc) to sources of bytes +#[async_trait] pub trait ObjectStore: Sync + Send + Debug { /// Returns the object store as [`Any`](std::any::Any) /// so that it can be downcast to a specific implementation. 
fn as_any(&self) -> &dyn Any; /// Returns all the files with filename extension `ext` in path `prefix` - fn list_all_files(&self, prefix: &str, ext: &str) -> Result>; + async fn list_all_files(&self, prefix: &str, ext: &str) -> Result; /// Get object reader for one file fn get_reader(&self, file_path: &str) -> Result>; diff --git a/datafusion/src/physical_plan/csv.rs b/datafusion/src/physical_plan/csv.rs index 293f46d7a736..3a44c60fe4b6 100644 --- a/datafusion/src/physical_plan/csv.rs +++ b/datafusion/src/physical_plan/csv.rs @@ -143,7 +143,7 @@ impl CsvExec { ) -> Result { let file_extension = String::from(options.file_extension); - let filenames = LocalFileSystem.list_all_files(path, options.file_extension)?; + let filenames = LocalFileSystem.list_all_files(path, options.file_extension).await?; if filenames.is_empty() { return Err(DataFusionError::Execution(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/physical_plan/json.rs b/datafusion/src/physical_plan/json.rs index df7e9e5e5014..2ce4e07c0871 100644 --- a/datafusion/src/physical_plan/json.rs +++ b/datafusion/src/physical_plan/json.rs @@ -89,7 +89,7 @@ impl NdJsonExec { ) -> Result { let file_extension = options.file_extension.to_string(); - let filenames = LocalFileSystem.list_all_files(path, options.file_extension)?; + let filenames = LocalFileSystem.list_all_files(path, options.file_extension).await?; if filenames.is_empty() { return Err(DataFusionError::Execution(format!( From af106a9664695c8eb0b1d81a722a8f5b0d99f563 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Mon, 16 Aug 2021 15:03:15 +0800 Subject: [PATCH 09/16] resolve comment, wip --- .../core/src/serde/logical_plan/from_proto.rs | 8 ++------ .../core/src/serde/physical_plan/from_proto.rs | 5 ++--- ballista/rust/core/src/utils.rs | 5 ----- ballista/rust/scheduler/src/lib.rs | 3 +-- datafusion/src/datasource/csv.rs | 4 ++-- datafusion/src/datasource/json.rs | 4 ++-- datafusion/src/datasource/mod.rs | 3 +-- .../src/datasource/{ => object_store}/local.rs | 2 +- .../{object_store.rs => object_store/mod.rs} | 18 ++++++++++++------ datafusion/src/datasource/parquet.rs | 2 +- .../src/physical_optimizer/repartition.rs | 2 +- datafusion/src/physical_plan/csv.rs | 4 ++-- datafusion/src/physical_plan/json.rs | 4 ++-- datafusion/src/physical_plan/parquet.rs | 4 ++-- 14 files changed, 31 insertions(+), 37 deletions(-) rename datafusion/src/datasource/{ => object_store}/local.rs (98%) rename datafusion/src/datasource/{object_store.rs => object_store/mod.rs} (95%) diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 24faddd9f34d..dbc819d855c8 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -159,7 +159,7 @@ impl TryInto for &protobuf::LogicalPlanNode { LogicalPlanBuilder::scan_parquet_with_name( &scan.path, projection, - create_datafusion_context_concurrency(24), + ExecutionContext::with_concurrency(24), &scan.table_name, )? 
//TODO concurrency .build() @@ -1100,13 +1100,9 @@ impl TryInto for &protobuf::Field { } } -use crate::utils::create_datafusion_context_concurrency; use datafusion::physical_plan::datetime_expressions::to_timestamp; use datafusion::physical_plan::{aggregates, windows}; -use datafusion::prelude::{ - array, date_part, date_trunc, length, lower, ltrim, md5, rtrim, sha224, sha256, - sha384, sha512, trim, upper, -}; +use datafusion::prelude::{array, date_part, date_trunc, length, lower, ltrim, md5, rtrim, sha224, sha256, sha384, sha512, trim, upper, ExecutionContext}; use std::convert::TryFrom; impl TryFrom for protobuf::FileType { diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 1441f87bc0aa..2767c9318ce1 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -29,7 +29,6 @@ use crate::serde::protobuf::repartition_exec_node::PartitionMethod; use crate::serde::protobuf::ShuffleReaderPartition; use crate::serde::scheduler::PartitionLocation; use crate::serde::{from_proto_binary_op, proto_error, protobuf}; -use crate::utils::create_datafusion_context_concurrency; use crate::{convert_box_required, convert_required, into_required}; use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::catalog::catalog::{ @@ -71,7 +70,7 @@ use datafusion::physical_plan::{ Partitioning, }; use datafusion::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr, WindowExpr}; -use datafusion::prelude::CsvReadOptions; +use datafusion::prelude::{CsvReadOptions, ExecutionContext}; use log::debug; use protobuf::physical_expr_node::ExprType; use protobuf::physical_plan_node::PhysicalPlanType; @@ -137,7 +136,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { Some(projection), None, scan.batch_size as usize, - create_datafusion_context_concurrency(scan.num_partitions as usize), + ExecutionContext::with_concurrency(scan.num_partitions as usize), None, )?)) } diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index e960b77575a9..4187faa6645a 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -252,11 +252,6 @@ pub fn create_datafusion_context( ExecutionContext::with_config(config) } -/// Create a DataFusion context that is compatible with Ballista in concurrency -pub fn create_datafusion_context_concurrency(concurrency: usize) -> ExecutionContext { - ExecutionContext::with_concurrency(concurrency) -} - pub struct BallistaQueryPlanner { scheduler_url: String, config: BallistaConfig, diff --git a/ballista/rust/scheduler/src/lib.rs b/ballista/rust/scheduler/src/lib.rs index 2037a3530aba..9ab01ce9e64c 100644 --- a/ballista/rust/scheduler/src/lib.rs +++ b/ballista/rust/scheduler/src/lib.rs @@ -85,7 +85,6 @@ use self::state::{ConfigBackendClient, SchedulerState}; use ballista_core::config::BallistaConfig; use ballista_core::execution_plans::ShuffleWriterExec; use ballista_core::serde::scheduler::to_proto::hash_partitioning_to_proto; -use ballista_core::utils::create_datafusion_context_concurrency; use datafusion::datasource::parquet::ParquetRootDesc; use datafusion::prelude::{ExecutionConfig, ExecutionContext}; use std::time::{Instant, SystemTime, UNIX_EPOCH}; @@ -286,7 +285,7 @@ impl SchedulerGrpc for SchedulerServer { match file_type { FileType::Parquet => { - let ctx = create_datafusion_context_concurrency(1); + let ctx = ExecutionContext::with_concurrency(1); let 
parquet_desc = ParquetRootDesc::new(&path, ctx).map_err(|e| { let msg = format!("Error opening parquet files: {}", e); error!("{}", msg); diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index ec6cbe472950..3f19e5772860 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -40,7 +40,7 @@ use std::string::String; use std::sync::{Arc, Mutex}; use crate::datasource::datasource::Statistics; -use crate::datasource::local::LocalFileSystem; +use crate::datasource::object_store::local::LocalFileSystem; use crate::datasource::object_store::ObjectStore; use crate::datasource::{Source, TableProvider}; use crate::error::{DataFusionError, Result}; @@ -67,7 +67,7 @@ impl CsvFile { Some(s) => s.clone(), None => { let filenames = LocalFileSystem - .list_all_files(path.as_str(), options.file_extension).await?; + .list(path.as_str(), options.file_extension).await?; if filenames.is_empty() { return Err(DataFusionError::Plan(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/datasource/json.rs b/datafusion/src/datasource/json.rs index 84f5bccb8fac..d04e95892cf3 100644 --- a/datafusion/src/datasource/json.rs +++ b/datafusion/src/datasource/json.rs @@ -37,7 +37,7 @@ use crate::{ use arrow::{datatypes::SchemaRef, json::reader::infer_json_schema_from_seekable}; use super::datasource::Statistics; -use crate::datasource::local::LocalFileSystem; +use crate::datasource::object_store::local::LocalFileSystem; use crate::datasource::object_store::ObjectStore; trait SeekRead: Read + Seek {} @@ -59,7 +59,7 @@ impl NdJsonFile { schema } else { let filenames = - LocalFileSystem.list_all_files(path, options.file_extension).await?; + LocalFileSystem.list(path, options.file_extension).await?; if filenames.is_empty() { return Err(DataFusionError::Plan(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs index b34770c07b4d..b53822b83dd3 100644 --- a/datafusion/src/datasource/mod.rs +++ b/datafusion/src/datasource/mod.rs @@ -21,7 +21,6 @@ pub mod csv; pub mod datasource; pub mod empty; pub mod json; -pub mod local; pub mod memory; pub mod object_store; pub mod parquet; @@ -116,7 +115,7 @@ pub trait SourceRootDescBuilder { object_store: Arc, ext: &str, ) -> Result { - let filenames = object_store.list_all_files(path, ext).await?; + let filenames = object_store.list(path, ext).await?; if filenames.is_empty() { return Err(DataFusionError::Plan(format!( "No file (with .{} extension) found at path {}", diff --git a/datafusion/src/datasource/local.rs b/datafusion/src/datasource/object_store/local.rs similarity index 98% rename from datafusion/src/datasource/local.rs rename to datafusion/src/datasource/object_store/local.rs index 7b99e38dc043..f1a9595d854f 100644 --- a/datafusion/src/datasource/local.rs +++ b/datafusion/src/datasource/object_store/local.rs @@ -40,7 +40,7 @@ impl ObjectStore for LocalFileSystem { self } - async fn list_all_files(&self, path: &str, ext: &str) -> Result { + async fn list(&self, path: &str, ext: &str) -> Result { list_all(path.to_string(), ext.to_string()).await } diff --git a/datafusion/src/datasource/object_store.rs b/datafusion/src/datasource/object_store/mod.rs similarity index 95% rename from datafusion/src/datasource/object_store.rs rename to datafusion/src/datasource/object_store/mod.rs index 8c2de37e5be2..b273aecf5a84 100644 --- a/datafusion/src/datasource/object_store.rs +++ 
b/datafusion/src/datasource/object_store/mod.rs @@ -17,16 +17,22 @@ //! Object Store abstracts access to an underlying file/object storage. -use crate::datasource::local::LocalFileSystem; -use crate::error::Result; -use async_trait::async_trait; +pub mod local; + use std::any::Any; use std::collections::HashMap; use std::fmt::Debug; use std::io::Read; +use std::pin::Pin; use std::sync::{Arc, RwLock}; + +use async_trait::async_trait; use futures::{Stream, StreamExt}; -use std::pin::Pin; + +use local::LocalFileSystem; + +use crate::error::Result; + /// Object Reader for one file in a object store #[async_trait] @@ -49,7 +55,7 @@ pub trait ObjectStore: Sync + Send + Debug { fn as_any(&self) -> &dyn Any; /// Returns all the files with filename extension `ext` in path `prefix` - async fn list_all_files(&self, prefix: &str, ext: &str) -> Result; + async fn list(&self, prefix: &str, ext: &str) -> Result; /// Get object reader for one file fn get_reader(&self, file_path: &str) -> Result>; @@ -98,7 +104,7 @@ impl ObjectStoreRegistry { /// path with prefix file:/// or no prefix will return the default LocalFS store, /// path with prefix s3:/// will return the S3 store if it's registered, /// and will always return LocalFS store when a prefix is not registered in the path. - pub fn store_for_path(&self, path: &str) -> Arc { + pub fn get_by_path(&self, path: &str) -> Arc { if let Some((scheme, _)) = path.split_once(':') { let stores = self.object_stores.read().unwrap(); if let Some(store) = stores.get(&*scheme.to_lowercase()) { diff --git a/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs index fcf7ccd5331b..03eca4605c2a 100644 --- a/datafusion/src/datasource/parquet.rs +++ b/datafusion/src/datasource/parquet.rs @@ -157,7 +157,7 @@ impl ParquetRootDesc { .lock() .unwrap() .object_store_registry - .store_for_path(root_path); + .get_by_path(root_path); let root_desc = Self::get_source_desc(root_path, object_store.clone(), "parquet"); Ok(Self { object_store, diff --git a/datafusion/src/physical_optimizer/repartition.rs b/datafusion/src/physical_optimizer/repartition.rs index 30ec896b4e2f..42986d5c4dcc 100644 --- a/datafusion/src/physical_optimizer/repartition.rs +++ b/datafusion/src/physical_optimizer/repartition.rs @@ -110,7 +110,7 @@ mod tests { use super::*; use crate::datasource::datasource::Statistics; - use crate::datasource::local::LocalFileSystem; + use crate::datasource::object_store::local::LocalFileSystem; use crate::datasource::PartitionedFile; use crate::physical_plan::parquet::{ ParquetExec, ParquetExecMetrics, ParquetPartition, diff --git a/datafusion/src/physical_plan/csv.rs b/datafusion/src/physical_plan/csv.rs index 3a44c60fe4b6..05e8361051f9 100644 --- a/datafusion/src/physical_plan/csv.rs +++ b/datafusion/src/physical_plan/csv.rs @@ -17,7 +17,7 @@ //! 
Execution plan for reading CSV files -use crate::datasource::local::LocalFileSystem; +use crate::datasource::object_store::local::LocalFileSystem; use crate::datasource::object_store::ObjectStore; use crate::error::{DataFusionError, Result}; use crate::physical_plan::ExecutionPlan; @@ -143,7 +143,7 @@ impl CsvExec { ) -> Result { let file_extension = String::from(options.file_extension); - let filenames = LocalFileSystem.list_all_files(path, options.file_extension).await?; + let filenames = LocalFileSystem.list(path, options.file_extension).await?; if filenames.is_empty() { return Err(DataFusionError::Execution(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/physical_plan/json.rs b/datafusion/src/physical_plan/json.rs index 2ce4e07c0871..a694aae5cbde 100644 --- a/datafusion/src/physical_plan/json.rs +++ b/datafusion/src/physical_plan/json.rs @@ -20,7 +20,7 @@ use async_trait::async_trait; use futures::Stream; use super::{source::Source, ExecutionPlan, Partitioning, RecordBatchStream}; -use crate::datasource::local::LocalFileSystem; +use crate::datasource::object_store::local::LocalFileSystem; use crate::datasource::object_store::ObjectStore; use crate::error::{DataFusionError, Result}; use arrow::json::reader::{infer_json_schema_from_iterator, ValueIter}; @@ -89,7 +89,7 @@ impl NdJsonExec { ) -> Result { let file_extension = options.file_extension.to_string(); - let filenames = LocalFileSystem.list_all_files(path, options.file_extension).await?; + let filenames = LocalFileSystem.list(path, options.file_extension).await?; if filenames.is_empty() { return Err(DataFusionError::Execution(format!( diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index 24f7c12178eb..30bdf15f07a2 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -360,7 +360,7 @@ impl ExecutionPlan for ParquetExec { let limit = self.limit; task::spawn_blocking(move || { - if let Err(e) = read_files( + if let Err(e) = read_partition( object_store, partition, metrics, @@ -553,7 +553,7 @@ fn build_row_group_predicate( } #[allow(clippy::too_many_arguments)] -fn read_files( +fn read_partition( object_store: Arc, partition: ParquetPartition, metrics: ParquetPartitionMetrics, From 2c2650b9e7257a84c748b26102724a4859556841 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Mon, 16 Aug 2021 15:15:16 +0800 Subject: [PATCH 10/16] take sync async func apart --- .../core/src/serde/logical_plan/from_proto.rs | 5 +- datafusion/src/datasource/csv.rs | 4 +- datafusion/src/datasource/json.rs | 3 +- .../src/datasource/object_store/local.rs | 99 ++++++++++--------- datafusion/src/datasource/object_store/mod.rs | 12 ++- datafusion/src/physical_plan/csv.rs | 2 +- datafusion/src/physical_plan/json.rs | 2 +- 7 files changed, 71 insertions(+), 56 deletions(-) diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index dbc819d855c8..6490b251eec9 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -1102,7 +1102,10 @@ impl TryInto for &protobuf::Field { use datafusion::physical_plan::datetime_expressions::to_timestamp; use datafusion::physical_plan::{aggregates, windows}; -use datafusion::prelude::{array, date_part, date_trunc, length, lower, ltrim, md5, rtrim, sha224, sha256, sha384, sha512, trim, upper, ExecutionContext}; +use datafusion::prelude::{ + array, 
date_part, date_trunc, length, lower, ltrim, md5, rtrim, sha224, sha256, + sha384, sha512, trim, upper, ExecutionContext, +}; use std::convert::TryFrom; impl TryFrom for protobuf::FileType { diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index 3f19e5772860..6df59d63eed2 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -66,8 +66,8 @@ impl CsvFile { let schema = Arc::new(match options.schema { Some(s) => s.clone(), None => { - let filenames = LocalFileSystem - .list(path.as_str(), options.file_extension).await?; + let filenames = + LocalFileSystem.list(path.as_str(), options.file_extension)?; if filenames.is_empty() { return Err(DataFusionError::Plan(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/datasource/json.rs b/datafusion/src/datasource/json.rs index d04e95892cf3..e353cfe917c5 100644 --- a/datafusion/src/datasource/json.rs +++ b/datafusion/src/datasource/json.rs @@ -58,8 +58,7 @@ impl NdJsonFile { let schema = if let Some(schema) = options.schema { schema } else { - let filenames = - LocalFileSystem.list(path, options.file_extension).await?; + let filenames = LocalFileSystem.list(path, options.file_extension)?; if filenames.is_empty() { return Err(DataFusionError::Plan(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/datasource/object_store/local.rs b/datafusion/src/datasource/object_store/local.rs index f1a9595d854f..fbdaaa38109e 100644 --- a/datafusion/src/datasource/object_store/local.rs +++ b/datafusion/src/datasource/object_store/local.rs @@ -16,31 +16,36 @@ // under the License. //! Object store that represents the Local File System. -use crate::datasource::object_store::{ObjectReader, ObjectStore, FileNameStream}; +use crate::datasource::object_store::{FileNameStream, ObjectReader, ObjectStore}; use crate::error::DataFusionError; use crate::error::Result; use crate::parquet::file::reader::Length; use crate::parquet::file::serialized_reader::FileSource; use async_trait::async_trait; +use futures::{stream, Stream, StreamExt}; use std::any::Any; use std::io::Read; -use std::sync::Arc; -use futures::{stream, Stream, StreamExt}; -use tokio::fs::{File, self, ReadDir}; use std::path::PathBuf; +use std::sync::Arc; +use tokio::fs::{self, File, ReadDir}; #[derive(Debug)] /// Local File System as Object Store. 
pub struct LocalFileSystem; - #[async_trait] impl ObjectStore for LocalFileSystem { fn as_any(&self) -> &dyn Any { self } - async fn list(&self, path: &str, ext: &str) -> Result { + fn list(&self, path: &str, ext: &str) -> Result> { + let mut filenames: Vec = Vec::new(); + list_all_files(path, &mut filenames, ext)?; + Ok(filenames) + } + + async fn list_async(&self, path: &str, ext: &str) -> Result { list_all(path.to_string(), ext.to_string()).await } @@ -66,17 +71,21 @@ impl ObjectReader for LocalFSObjectReader { Box::new(FileSource::::new(&self.file, start, length)) } + fn get_reader_async(&self, start: u64, length: usize) -> Box { + todo!() + } + fn length(&self) -> u64 { self.file.len() } } async fn list_all(root_path: String, ext: String) -> Result { - // let mut filenames: Vec = Vec::new(); - // list_all_files(root_path, &mut filenames, ext).await?; - // Ok(filenames) - - async fn one_level(path: String, to_visit: &mut Vec, ext: String) -> Result> { + async fn one_level( + path: String, + to_visit: &mut Vec, + ext: String, + ) -> Result> { let mut dir = fs::read_dir(path).await?; let mut files = Vec::new(); @@ -90,47 +99,45 @@ async fn list_all(root_path: String, ext: String) -> Result { } } } else { - return Err(DataFusionError::Plan("Invalid path".to_string())) + return Err(DataFusionError::Plan("Invalid path".to_string())); } - } Ok(files) } - stream::unfold(vec![root_path], |mut to_visit| { - async { - let path = to_visit.pop()?; - let file_stream = match one_level(path, &mut to_visit, ext).await { - Ok(files) => stream::iter(files).map(Ok).left_stream(), - Err(e) => stream::once(async { Err(e) }).right_stream(), - }; + stream::unfold(vec![root_path], |mut to_visit| async { + let path = to_visit.pop()?; + let file_stream = match one_level(path, &mut to_visit, ext).await { + Ok(files) => stream::iter(files).map(Ok).left_stream(), + Err(e) => stream::once(async { Err(e) }).right_stream(), + }; - Some((file_stream, to_visit)) - } - }).flatten() + Some((file_stream, to_visit)) + }) + .flatten() } /// Recursively build a list of files in a directory with a given extension with an accumulator list -// async fn list_all_files(dir: &str, filenames: &mut Vec, ext: &str) -> Result<()> { -// let metadata = std::fs::metadata(dir)?; -// if metadata.is_file() { -// if dir.ends_with(ext) { -// filenames.push(dir.to_string()); -// } -// } else { -// for entry in std::fs::read_dir(dir)? { -// let entry = entry?; -// let path = entry.path(); -// if let Some(path_name) = path.to_str() { -// if path.is_dir() { -// list_all_files(path_name, filenames, ext).await?; -// } else if path_name.ends_with(ext) { -// filenames.push(path_name.to_string()); -// } -// } else { -// return Err(DataFusionError::Plan("Invalid path".to_string())); -// } -// } -// } -// Ok(()) -// } +fn list_all_files(dir: &str, filenames: &mut Vec, ext: &str) -> Result<()> { + let metadata = std::fs::metadata(dir)?; + if metadata.is_file() { + if dir.ends_with(ext) { + filenames.push(dir.to_string()); + } + } else { + for entry in std::fs::read_dir(dir)? 
{ + let entry = entry?; + let path = entry.path(); + if let Some(path_name) = path.to_str() { + if path.is_dir() { + list_all_files(path_name, filenames, ext).await?; + } else if path_name.ends_with(ext) { + filenames.push(path_name.to_string()); + } + } else { + return Err(DataFusionError::Plan("Invalid path".to_string())); + } + } + } + Ok(()) +} diff --git a/datafusion/src/datasource/object_store/mod.rs b/datafusion/src/datasource/object_store/mod.rs index b273aecf5a84..f9b8e0407fdf 100644 --- a/datafusion/src/datasource/object_store/mod.rs +++ b/datafusion/src/datasource/object_store/mod.rs @@ -33,18 +33,21 @@ use local::LocalFileSystem; use crate::error::Result; - /// Object Reader for one file in a object store #[async_trait] pub trait ObjectReader { /// Get reader for a part [start, start + length] in the file fn get_reader(&self, start: u64, length: usize) -> Box; + /// Get reader for a part [start, start + length] in the file asynchronously + fn get_reader_async(&self, start: u64, length: usize) -> Box; + /// Get lenght for the file fn length(&self) -> u64; } -pub type FileNameStream = Pin> + Send + Sync + 'static>>; +pub type FileNameStream = + Pin> + Send + Sync + 'static>>; /// A ObjectStore abstracts access to an underlying file/object storage. /// It maps strings (e.g. URLs, filesystem paths, etc) to sources of bytes @@ -55,7 +58,10 @@ pub trait ObjectStore: Sync + Send + Debug { fn as_any(&self) -> &dyn Any; /// Returns all the files with filename extension `ext` in path `prefix` - async fn list(&self, prefix: &str, ext: &str) -> Result; + fn list(&self, prefix: &str, ext: &str) -> Result>; + + /// Returns all the files with filename extension `ext` in path `prefix` asynchronously + async fn list_async(&self, prefix: &str, ext: &str) -> Result; /// Get object reader for one file fn get_reader(&self, file_path: &str) -> Result>; diff --git a/datafusion/src/physical_plan/csv.rs b/datafusion/src/physical_plan/csv.rs index 05e8361051f9..3f2214c57750 100644 --- a/datafusion/src/physical_plan/csv.rs +++ b/datafusion/src/physical_plan/csv.rs @@ -143,7 +143,7 @@ impl CsvExec { ) -> Result { let file_extension = String::from(options.file_extension); - let filenames = LocalFileSystem.list(path, options.file_extension).await?; + let filenames = LocalFileSystem.list(path, options.file_extension)?; if filenames.is_empty() { return Err(DataFusionError::Execution(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/physical_plan/json.rs b/datafusion/src/physical_plan/json.rs index a694aae5cbde..e70af74fb7e9 100644 --- a/datafusion/src/physical_plan/json.rs +++ b/datafusion/src/physical_plan/json.rs @@ -89,7 +89,7 @@ impl NdJsonExec { ) -> Result { let file_extension = options.file_extension.to_string(); - let filenames = LocalFileSystem.list(path, options.file_extension).await?; + let filenames = LocalFileSystem.list(path, options.file_extension)?; if filenames.is_empty() { return Err(DataFusionError::Execution(format!( From cf2c2038aa94704d86f8925edeba6b2550f4dc78 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Tue, 17 Aug 2021 22:29:36 +0800 Subject: [PATCH 11/16] Make async list and metadata fetch --- benchmarks/src/bin/tpch.rs | 5 +- datafusion/src/datasource/csv.rs | 2 +- datafusion/src/datasource/mod.rs | 161 ++++++++++++++---- .../src/datasource/object_store/local.rs | 91 +++++----- datafusion/src/datasource/object_store/mod.rs | 7 +- datafusion/src/datasource/parquet.rs | 70 ++++++-- 6 files changed, 240 insertions(+), 96 
deletions(-) diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index c45341bad2de..f47ae57eacc3 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -475,9 +475,12 @@ fn get_table( } "parquet" => { let path = format!("{}/{}", path, table); - Ok(Arc::new(ParquetTable::try_new( + let schema = get_schema(table); + Ok(Arc::new(ParquetTable::try_new_with_schema( &path, ExecutionContext::with_concurrency(max_concurrency), + schema, + false, )?)) } other => { diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index 6df59d63eed2..160556cd4da7 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -49,7 +49,7 @@ use crate::physical_plan::csv::CsvExec; pub use crate::physical_plan::csv::CsvReadOptions; use crate::physical_plan::ExecutionPlan; -/// Represents a CSV file with a provided schema +/// Represents a CSV file with a provided scxhema pub struct CsvFile { source: Source, schema: SchemaRef, diff --git a/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs index b53822b83dd3..0fae9571a1fd 100644 --- a/datafusion/src/datasource/mod.rs +++ b/datafusion/src/datasource/mod.rs @@ -31,12 +31,18 @@ pub use self::memory::MemTable; use crate::arrow::datatypes::{Schema, SchemaRef}; use crate::datasource::datasource::{ColumnStatistics, Statistics}; -use crate::datasource::object_store::ObjectStore; +use crate::datasource::object_store::{FileNameStream, ObjectStore}; use crate::error::{DataFusionError, Result}; use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator}; use crate::physical_plan::Accumulator; -use crate::scalar::ScalarValue; +use async_trait::async_trait; +use futures::{Stream, StreamExt}; +use std::fmt::Debug; +use std::pin::Pin; use std::sync::Arc; +use tokio::runtime::{Handle, Runtime}; +use tokio::sync::mpsc::{channel, Receiver, Sender}; +use tokio_stream::wrappers::ReceiverStream; /// Source for table input data pub(crate) enum Source> { @@ -57,10 +63,10 @@ pub struct PartitionedFile { pub schema: Schema, /// Statistics of the file pub statistics: Statistics, - /// Values of partition columns to be appended to each row - pub partition_value: Option>, - /// Schema of partition columns - pub partition_schema: Option, + // Values of partition columns to be appended to each row + // pub partition_value: Option>, + // Schema of partition columns + // pub partition_schema: Option, // We may include row group range here for a more fine-grained parallel execution } @@ -70,8 +76,6 @@ impl From for PartitionedFile { file_path, schema: Schema::empty(), statistics: Default::default(), - partition_value: None, - partition_schema: None, } } } @@ -107,59 +111,142 @@ pub struct SourceRootDescriptor { pub schema: SchemaRef, } +/// Stream of +pub type PartitionedFileStream = + Pin> + Send + Sync + 'static>>; + /// Builder for ['SourceRootDescriptor'] inside given path -pub trait SourceRootDescBuilder { +#[async_trait] +pub trait SourceRootDescBuilder: Sync + Send + Debug { /// Construct a ['SourceRootDescriptor'] from the provided path fn get_source_desc( path: &str, object_store: Arc, ext: &str, + provided_schema: Option, + collect_statistics: bool, ) -> Result { - let filenames = object_store.list(path, ext).await?; - if filenames.is_empty() { - return Err(DataFusionError::Plan(format!( - "No file (with .{} extension) found at path {}", - ext, path - ))); - } + let handle = get_runtime_handle(); + let mut results: Vec> = Vec::new(); + handle.block_on(async { + match 
Self::get_source_desc_async( + path, + object_store, + ext, + provided_schema, + collect_statistics, + ) + .await + { + Ok(mut stream) => { + while let Some(pf) = stream.next().await { + results.push(pf); + } + } + Err(e) => { + results.push(Err(e)); + } + } + }); + + let partition_results: Result> = + results.into_iter().collect(); + let partition_files = partition_results?; // build a list of Parquet partitions with statistics and gather all unique schemas // used in this data set let mut schemas: Vec = vec![]; - let partitioned_files = filenames - .iter() - .map(|file_path| { - let pf = Self::get_file_meta(file_path, object_store.clone())?; - let schema = pf.schema.clone(); - if schemas.is_empty() { - schemas.push(schema); - } else if schema != schemas[0] { - // we currently get the schema information from the first file rather than do - // schema merging and this is a limitation. - // See https://issues.apache.org/jira/browse/ARROW-11017 - return Err(DataFusionError::Plan(format!( - "The file {} have different schema from the first file and DataFusion does \ + for pf in &partition_files { + let schema = pf.schema.clone(); + if schemas.is_empty() { + schemas.push(schema); + } else if schema != schemas[0] { + // we currently get the schema information from the first file rather than do + // schema merging and this is a limitation. + // See https://issues.apache.org/jira/browse/ARROW-11017 + return Err(DataFusionError::Plan(format!( + "The file {} have different schema from the first file and DataFusion does \ not yet support schema merging", - file_path - ))); - } - Ok(pf) - }).collect::>>(); + pf.file_path + ))); + } + } Ok(SourceRootDescriptor { - partition_files: partitioned_files?, + partition_files, schema: Arc::new(schemas.pop().unwrap()), }) } + /// Construct a ['SourceRootDescriptor'] from the provided path asynchronously + async fn get_source_desc_async( + path: &str, + object_store: Arc, + ext: &str, + provided_schema: Option, + collect_statistics: bool, + ) -> Result { + let mut list_result: FileNameStream = object_store.list_async(path, ext).await?; + + let (tx, rx): ( + Sender>, + Receiver>, + ) = channel(2); + + let mut contains_file = false; + while let Some(item) = list_result.next().await { + contains_file = true; + match item { + Ok(file_path) => { + if collect_statistics { + let tx = tx.clone(); + let object_store = object_store.clone(); + let path = file_path.clone(); + tokio::spawn(async move { + let file_meta = Self::get_file_meta(path, object_store).await; + tx.send(file_meta).await.unwrap(); + }); + } else { + tx.send(Ok(PartitionedFile { + file_path, + schema: provided_schema.clone().unwrap(), + statistics: Statistics::default(), + })) + .await + .unwrap(); + } + } + Err(e) => { + tx.send(Err(e)).await.unwrap(); + } + } + } + + if !contains_file { + return Err(DataFusionError::Plan(format!( + "No file (with .{} extension) found at path {}", + ext, path + ))); + } + + Ok(Box::pin(ReceiverStream::new(rx))) + } + /// Get all metadata for a source file, including schema, statistics, partitions, etc. 
- fn get_file_meta( - file_path: &str, + async fn get_file_meta( + file_path: String, object_store: Arc, ) -> Result; } +fn get_runtime_handle() -> Handle { + match Handle::try_current() { + Ok(h) => h, + Err(_) => Runtime::new().unwrap().handle().to_owned(), + } +} + /// Get all files as well as the summary statistics when a limit is provided pub fn get_statistics_with_limit( source_desc: &SourceRootDescriptor, diff --git a/datafusion/src/datasource/object_store/local.rs b/datafusion/src/datasource/object_store/local.rs index fbdaaa38109e..e2e1541e0d11 100644 --- a/datafusion/src/datasource/object_store/local.rs +++ b/datafusion/src/datasource/object_store/local.rs @@ -16,18 +16,18 @@ // under the License. //! Object store that represents the Local File System. +use crate::datasource::get_runtime_handle; use crate::datasource::object_store::{FileNameStream, ObjectReader, ObjectStore}; use crate::error::DataFusionError; use crate::error::Result; use crate::parquet::file::reader::Length; use crate::parquet::file::serialized_reader::FileSource; use async_trait::async_trait; -use futures::{stream, Stream, StreamExt}; +use futures::{stream, StreamExt}; use std::any::Any; +use std::fs::File; use std::io::Read; -use std::path::PathBuf; use std::sync::Arc; -use tokio::fs::{self, File, ReadDir}; #[derive(Debug)] /// Local File System as Object Store. @@ -40,13 +40,12 @@ impl ObjectStore for LocalFileSystem { } fn list(&self, path: &str, ext: &str) -> Result> { - let mut filenames: Vec = Vec::new(); - list_all_files(path, &mut filenames, ext)?; + let filenames: Vec = list_all(path.to_string(), ext.to_string())?; Ok(filenames) } async fn list_async(&self, path: &str, ext: &str) -> Result { - list_all(path.to_string(), ext.to_string()).await + list_all_async(path.to_string(), ext.to_string()).await } fn get_reader(&self, file_path: &str) -> Result> { @@ -71,7 +70,7 @@ impl ObjectReader for LocalFSObjectReader { Box::new(FileSource::::new(&self.file, start, length)) } - fn get_reader_async(&self, start: u64, length: usize) -> Box { + fn get_reader_async(&self, _start: u64, _length: usize) -> Box { todo!() } @@ -80,23 +79,39 @@ impl ObjectReader for LocalFSObjectReader { } } -async fn list_all(root_path: String, ext: String) -> Result { - async fn one_level( +fn list_all(root_path: String, ext: String) -> Result> { + let handle = get_runtime_handle(); + let mut file_results: Vec> = Vec::new(); + handle.block_on(async { + match list_all_async(root_path, ext).await { + Ok(mut stream) => { + while let Some(result) = stream.next().await { + file_results.push(result); + } + } + Err(_) => { + file_results.push(Err(DataFusionError::Plan("Invalid path".to_string()))); + } + } + }); + file_results.into_iter().collect() +} + +async fn list_all_async(root_path: String, ext: String) -> Result { + async fn find_files_in_dir( path: String, to_visit: &mut Vec, ext: String, ) -> Result> { - let mut dir = fs::read_dir(path).await?; + let mut dir = tokio::fs::read_dir(path).await?; let mut files = Vec::new(); while let Some(child) = dir.next_entry().await? 
{ if let Some(child_path) = child.path().to_str() { if child.metadata().await?.is_dir() { to_visit.push(child_path.to_string()); - } else { - if child_path.ends_with(&ext) { - files.push(child_path.to_string()) - } + } else if child_path.ends_with(&ext.clone()) { + files.push(child_path.to_string()) } } else { return Err(DataFusionError::Plan("Invalid path".to_string())); @@ -105,39 +120,23 @@ async fn list_all(root_path: String, ext: String) -> Result { Ok(files) } - stream::unfold(vec![root_path], |mut to_visit| async { - let path = to_visit.pop()?; - let file_stream = match one_level(path, &mut to_visit, ext).await { - Ok(files) => stream::iter(files).map(Ok).left_stream(), - Err(e) => stream::once(async { Err(e) }).right_stream(), - }; - - Some((file_stream, to_visit)) - }) - .flatten() -} - -/// Recursively build a list of files in a directory with a given extension with an accumulator list -fn list_all_files(dir: &str, filenames: &mut Vec, ext: &str) -> Result<()> { - let metadata = std::fs::metadata(dir)?; - if metadata.is_file() { - if dir.ends_with(ext) { - filenames.push(dir.to_string()); - } - } else { - for entry in std::fs::read_dir(dir)? { - let entry = entry?; - let path = entry.path(); - if let Some(path_name) = path.to_str() { - if path.is_dir() { - list_all_files(path_name, filenames, ext).await?; - } else if path_name.ends_with(ext) { - filenames.push(path_name.to_string()); + let result = stream::unfold(vec![root_path], move |mut to_visit| { + let ext = ext.clone(); + async move { + match to_visit.pop() { + None => None, + Some(path) => { + let file_stream = + match find_files_in_dir(path, &mut to_visit, ext).await { + Ok(files) => stream::iter(files).map(Ok).left_stream(), + Err(e) => stream::once(async { Err(e) }).right_stream(), + }; + + Some((file_stream, to_visit)) } - } else { - return Err(DataFusionError::Plan("Invalid path".to_string())); } } - } - Ok(()) + }) + .flatten(); + Ok(Box::pin(result)) } diff --git a/datafusion/src/datasource/object_store/mod.rs b/datafusion/src/datasource/object_store/mod.rs index f9b8e0407fdf..9d3776fb9475 100644 --- a/datafusion/src/datasource/object_store/mod.rs +++ b/datafusion/src/datasource/object_store/mod.rs @@ -27,7 +27,7 @@ use std::pin::Pin; use std::sync::{Arc, RwLock}; use async_trait::async_trait; -use futures::{Stream, StreamExt}; +use futures::Stream; use local::LocalFileSystem; @@ -46,6 +46,11 @@ pub trait ObjectReader { fn length(&self) -> u64; } +/// Stream of files get listed from object store. Currently, we only +/// return file paths, but for many object stores, object listing will actually give us more +/// information than just the file path, for example, last updated time and file size are +/// often returned as part of the api/sys call. +/// These extra metadata might be useful for other purposes. 
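// Illustrative sketch, not from the patch: how a caller might drain the `FileNameStream`
// declared just below, assuming this module's `ObjectStore` trait and `Result` alias are in
// scope. The helper name and the hard-coded "parquet" extension are assumptions.
use futures::StreamExt;
use std::sync::Arc;

async fn collect_parquet_paths(
    store: Arc<dyn ObjectStore>,
    root: &str,
) -> Result<Vec<String>> {
    let mut names = store.list_async(root, "parquet").await?;
    let mut paths = Vec::new();
    while let Some(name) = names.next().await {
        // each item is a Result<String>, so listing errors surface per entry
        paths.push(name?);
    }
    Ok(paths)
}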
pub type FileNameStream = Pin> + Send + Sync + 'static>>; diff --git a/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs index 03eca4605c2a..010d7ed8f228 100644 --- a/datafusion/src/datasource/parquet.rs +++ b/datafusion/src/datasource/parquet.rs @@ -21,8 +21,7 @@ use std::any::Any; use std::io::Read; use std::sync::Arc; -use arrow::datatypes::SchemaRef; - +use async_trait::async_trait; use parquet::arrow::ArrowReader; use parquet::arrow::ParquetFileArrowReader; use parquet::file::reader::ChunkReader; @@ -30,7 +29,7 @@ use parquet::file::serialized_reader::SerializedFileReader; use parquet::file::statistics::Statistics as ParquetStatistics; use super::datasource::TableProviderFilterPushDown; -use crate::arrow::datatypes::{DataType, Field}; +use crate::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use crate::datasource::datasource::Statistics; use crate::datasource::object_store::{ObjectReader, ObjectStore}; use crate::datasource::{ @@ -68,6 +67,30 @@ impl ParquetTable { }) } + /// Attempt to initialize a new `ParquetTable` from a file path and known schema. + /// If collect_statistics is `false`, doesn't read files until necessary by scan + pub fn try_new_with_schema( + path: impl Into, + context: ExecutionContext, + schema: Schema, + collect_statistics: bool, + ) -> Result { + let path = path.into(); + let max_concurrency = context.state.lock().unwrap().config.concurrency; + let root_desc = ParquetRootDesc::new_with_schema( + path.as_str(), + context, + Some(schema), + collect_statistics, + ); + Ok(Self { + path, + desc: Arc::new(root_desc?), + max_concurrency, + enable_pruning: true, + }) + } + /// Get the path for the Parquet file(s) represented by this ParquetTable instance pub fn path(&self) -> &str { &self.path @@ -158,7 +181,34 @@ impl ParquetRootDesc { .unwrap() .object_store_registry .get_by_path(root_path); - let root_desc = Self::get_source_desc(root_path, object_store.clone(), "parquet"); + let root_desc = + Self::get_source_desc(root_path, object_store.clone(), "parquet", None, true); + Ok(Self { + object_store, + descriptor: root_desc?, + }) + } + + /// Construct a new parquet descriptor for a root path with known schema + pub fn new_with_schema( + root_path: &str, + context: ExecutionContext, + schema: Option, + collect_statistics: bool, + ) -> Result { + let object_store = context + .state + .lock() + .unwrap() + .object_store_registry + .get_by_path(root_path); + let root_desc = Self::get_source_desc( + root_path, + object_store.clone(), + "parquet", + schema, + collect_statistics, + ); Ok(Self { object_store, descriptor: root_desc?, @@ -314,12 +364,13 @@ impl ParquetRootDesc { } } +#[async_trait] impl SourceRootDescBuilder for ParquetRootDesc { - fn get_file_meta( - file_path: &str, + async fn get_file_meta( + file_path: String, object_store: Arc, ) -> Result { - let reader = object_store.get_reader(file_path)?; + let reader = object_store.get_reader(file_path.as_str())?; let file_reader = Arc::new(SerializedFileReader::new(ObjectReaderWrapper::new(reader))?); let mut arrow_reader = ParquetFileArrowReader::new(file_reader); @@ -384,8 +435,6 @@ impl SourceRootDescBuilder for ParquetRootDesc { file_path, schema, statistics, - partition_value: None, - partition_schema: None, }) } } @@ -418,7 +467,8 @@ impl Length for ObjectReaderWrapper { } } -/// Thin wrapper over reader for a parquet file +/// Thin wrapper over reader for a parquet file. 
+/// To be removed once rust-lang/rfcs#1598 is stabilized pub struct InnerReaderWrapper { inner_reader: Box, } From 0596c9c664cdc1047606a862fd7a626c6a023f6d Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Wed, 18 Aug 2021 16:03:00 +0800 Subject: [PATCH 12/16] async read --- datafusion/src/datasource/mod.rs | 1 + .../src/datasource/object_store/local.rs | 45 ++++-- datafusion/src/datasource/object_store/mod.rs | 25 ++- datafusion/src/datasource/parquet.rs | 16 +- datafusion/src/datasource/parquet_io.rs | 144 ++++++++++++++++++ 5 files changed, 212 insertions(+), 19 deletions(-) create mode 100644 datafusion/src/datasource/parquet_io.rs diff --git a/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs index 0fae9571a1fd..f79cae799558 100644 --- a/datafusion/src/datasource/mod.rs +++ b/datafusion/src/datasource/mod.rs @@ -24,6 +24,7 @@ pub mod json; pub mod memory; pub mod object_store; pub mod parquet; +pub mod parquet_io; pub use self::csv::{CsvFile, CsvReadOptions}; pub use self::datasource::{TableProvider, TableType}; diff --git a/datafusion/src/datasource/object_store/local.rs b/datafusion/src/datasource/object_store/local.rs index e2e1541e0d11..1c76c2eac118 100644 --- a/datafusion/src/datasource/object_store/local.rs +++ b/datafusion/src/datasource/object_store/local.rs @@ -17,17 +17,19 @@ //! Object store that represents the Local File System. use crate::datasource::get_runtime_handle; -use crate::datasource::object_store::{FileNameStream, ObjectReader, ObjectStore}; +use crate::datasource::object_store::{ + FileNameStream, ObjectReader, ObjectStore, ThreadSafeRead, +}; +use crate::datasource::parquet_io::FileSource2; use crate::error::DataFusionError; use crate::error::Result; use crate::parquet::file::reader::Length; -use crate::parquet::file::serialized_reader::FileSource; use async_trait::async_trait; use futures::{stream, StreamExt}; use std::any::Any; use std::fs::File; -use std::io::Read; use std::sync::Arc; +use tokio::task; #[derive(Debug)] /// Local File System as Object Store. 
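// Illustrative sketch, not from the patch: the `spawn_blocking` bridge that
// `LocalFSObjectReader::get_reader_async` and `length_async` rely on in the next hunk, so
// that a blocking `std::fs::File` operation never stalls the tokio reactor. The function
// name is an assumption and error handling is reduced to `std::io::Error`.
use std::fs::File;
use tokio::task;

async fn file_len_async(file: File) -> std::io::Result<u64> {
    match task::spawn_blocking(move || file.metadata().map(|m| m.len())).await {
        Ok(len) => len,
        // a panicked or cancelled blocking task surfaces as a JoinError
        Err(join_err) => Err(std::io::Error::new(std::io::ErrorKind::Other, join_err)),
    }
}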
@@ -65,17 +67,42 @@ impl LocalFSObjectReader { } } +#[async_trait] impl ObjectReader for LocalFSObjectReader { - fn get_reader(&self, start: u64, length: usize) -> Box { - Box::new(FileSource::::new(&self.file, start, length)) + fn get_reader(&self, start: u64, length: usize) -> Result> { + Ok(Box::new(FileSource2::::new( + &self.file, start, length, + ))) } - fn get_reader_async(&self, _start: u64, _length: usize) -> Box { - todo!() + async fn get_reader_async( + &self, + start: u64, + length: usize, + ) -> Result> { + let file = self.file.try_clone()?; + match task::spawn_blocking(move || { + let read: Result> = + Ok(Box::new(FileSource2::::new(&file, start, length))); + read + }) + .await + { + Ok(r) => r, + Err(e) => Err(DataFusionError::Internal(e.to_string())), + } } - fn length(&self) -> u64 { - self.file.len() + fn length(&self) -> Result { + Ok(self.file.len()) + } + + async fn length_async(&self) -> Result { + let file = self.file.try_clone()?; + match task::spawn_blocking(move || Ok(file.len())).await { + Ok(r) => r, + Err(e) => Err(DataFusionError::Internal(e.to_string())), + } } } diff --git a/datafusion/src/datasource/object_store/mod.rs b/datafusion/src/datasource/object_store/mod.rs index 9d3776fb9475..d365ab34380d 100644 --- a/datafusion/src/datasource/object_store/mod.rs +++ b/datafusion/src/datasource/object_store/mod.rs @@ -31,19 +31,36 @@ use futures::Stream; use local::LocalFileSystem; +use crate::datasource::get_runtime_handle; use crate::error::Result; +/// Thread safe read +pub trait ThreadSafeRead: Read + Send + Sync + 'static {} + /// Object Reader for one file in a object store #[async_trait] pub trait ObjectReader { /// Get reader for a part [start, start + length] in the file - fn get_reader(&self, start: u64, length: usize) -> Box; + fn get_reader(&self, start: u64, length: usize) -> Result> { + let handle = get_runtime_handle(); + handle.block_on(self.get_reader_async(start, length)) + } /// Get reader for a part [start, start + length] in the file asynchronously - fn get_reader_async(&self, start: u64, length: usize) -> Box; + async fn get_reader_async( + &self, + start: u64, + length: usize, + ) -> Result>; + + /// Get length for the file + fn length(&self) -> Result { + let handle = get_runtime_handle(); + handle.block_on(self.length_async()) + } - /// Get lenght for the file - fn length(&self) -> u64; + /// Get length for the file asynchronously + async fn length_async(&self) -> Result; } /// Stream of files get listed from object store. 
Currently, we only diff --git a/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs index 010d7ed8f228..bfc893bf8cc7 100644 --- a/datafusion/src/datasource/parquet.rs +++ b/datafusion/src/datasource/parquet.rs @@ -24,6 +24,7 @@ use std::sync::Arc; use async_trait::async_trait; use parquet::arrow::ArrowReader; use parquet::arrow::ParquetFileArrowReader; +use parquet::errors::ParquetError; use parquet::file::reader::ChunkReader; use parquet::file::serialized_reader::SerializedFileReader; use parquet::file::statistics::Statistics as ParquetStatistics; @@ -31,7 +32,7 @@ use parquet::file::statistics::Statistics as ParquetStatistics; use super::datasource::TableProviderFilterPushDown; use crate::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use crate::datasource::datasource::Statistics; -use crate::datasource::object_store::{ObjectReader, ObjectStore}; +use crate::datasource::object_store::{ObjectReader, ObjectStore, ThreadSafeRead}; use crate::datasource::{ create_max_min_accs, get_col_stats, get_statistics_with_limit, PartitionedFile, SourceRootDescBuilder, SourceRootDescriptor, TableProvider, @@ -455,22 +456,25 @@ impl ChunkReader for ObjectReaderWrapper { type T = InnerReaderWrapper; fn get_read(&self, start: u64, length: usize) -> parquet::errors::Result { - Ok(InnerReaderWrapper { - inner_reader: self.reader.get_reader(start, length), - }) + match self.reader.get_reader(start, length) { + Ok(reader) => Ok(InnerReaderWrapper { + inner_reader: reader, + }), + Err(e) => Err(ParquetError::General(e.to_string())), + } } } impl Length for ObjectReaderWrapper { fn len(&self) -> u64 { - self.reader.length() + self.reader.length().unwrap_or(0u64) } } /// Thin wrapper over reader for a parquet file. /// To be removed once rust-lang/rfcs#1598 is stabilized pub struct InnerReaderWrapper { - inner_reader: Box, + inner_reader: Box, } impl Read for InnerReaderWrapper { diff --git a/datafusion/src/datasource/parquet_io.rs b/datafusion/src/datasource/parquet_io.rs new file mode 100644 index 000000000000..a74fd70f25d5 --- /dev/null +++ b/datafusion/src/datasource/parquet_io.rs @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Copy of parquet::util::io::FileSource for thread safe parquet reader + +use std::sync::Mutex; +use std::{cmp, fmt, io::*}; + +use crate::datasource::object_store::ThreadSafeRead; +use crate::parquet::file::reader::Length; +use crate::parquet::util::io::{ParquetReader, Position}; + +const DEFAULT_BUF_SIZE: usize = 8 * 1024; + +// ---------------------------------------------------------------------- + +/// ParquetReader is the interface which needs to be fulfilled to be able to parse a +/// parquet source. 
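// Illustrative sketch, not from the patch: the marker-trait idiom behind `ThreadSafeRead`
// above and `ThreadSafeParquetReader` just below: an empty trait bundles the bounds, and a
// blanket impl lets every qualifying type be boxed as a single trait object. The demo names
// and `std::io::Empty` are assumptions.
use std::io::Read;

trait ThreadSafeReadDemo: Read + Send + Sync + 'static {}
impl<T: Read + Send + Sync + 'static> ThreadSafeReadDemo for T {}

fn boxed_reader() -> Box<dyn ThreadSafeReadDemo> {
    // `std::io::Empty` is `Read + Send + Sync + 'static`, so the blanket impl applies
    Box::new(std::io::empty())
}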
+pub trait ThreadSafeParquetReader: ParquetReader + Send + Sync + 'static {} +impl ThreadSafeParquetReader for T {} + +/// Struct that represents a slice of a file data with independent start position and +/// length. Internally clones provided file handle, wraps with a custom implementation +/// of BufReader that resets position before any read. +/// +/// This is workaround and alternative for `file.try_clone()` method. It clones `File` +/// while preserving independent position, which is not available with `try_clone()`. +/// +/// Designed after `arrow::io::RandomAccessFile` and `std::io::BufReader` +pub struct FileSource2 { + reader: Mutex, + start: u64, // start position in a file + end: u64, // end position in a file + buf: Vec, // buffer where bytes read in advance are stored + buf_pos: usize, // current position of the reader in the buffer + buf_cap: usize, // current number of bytes read into the buffer +} + +impl fmt::Debug for FileSource2 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("FileSource") + .field("reader", &"OPAQUE") + .field("start", &self.start) + .field("end", &self.end) + .field("buf.len", &self.buf.len()) + .field("buf_pos", &self.buf_pos) + .field("buf_cap", &self.buf_cap) + .finish() + } +} + +impl FileSource2 { + /// Creates new file reader with start and length from a file handle + pub fn new(fd: &R, start: u64, length: usize) -> Self { + let reader = Mutex::new(fd.try_clone().unwrap()); + Self { + reader, + start, + end: start + length as u64, + buf: vec![0_u8; DEFAULT_BUF_SIZE], + buf_pos: 0, + buf_cap: 0, + } + } + + fn fill_inner_buf(&mut self) -> Result<&[u8]> { + if self.buf_pos >= self.buf_cap { + // If we've reached the end of our internal buffer then we need to fetch + // some more data from the underlying reader. + // Branch using `>=` instead of the more correct `==` + // to tell the compiler that the pos..cap slice is always valid. + debug_assert!(self.buf_pos == self.buf_cap); + let mut reader = self.reader.lock().unwrap(); + reader.seek(SeekFrom::Start(self.start))?; // always seek to start before reading + self.buf_cap = reader.read(&mut self.buf)?; + self.buf_pos = 0; + } + Ok(&self.buf[self.buf_pos..self.buf_cap]) + } + + fn skip_inner_buf(&mut self, buf: &mut [u8]) -> Result { + // discard buffer + self.buf_pos = 0; + self.buf_cap = 0; + // read directly into param buffer + let mut reader = self.reader.lock().unwrap(); + reader.seek(SeekFrom::Start(self.start))?; // always seek to start before reading + let nread = reader.read(buf)?; + self.start += nread as u64; + Ok(nread) + } +} + +impl Read for FileSource2 { + fn read(&mut self, buf: &mut [u8]) -> Result { + let bytes_to_read = cmp::min(buf.len(), (self.end - self.start) as usize); + let buf = &mut buf[0..bytes_to_read]; + + // If we don't have any buffered data and we're doing a massive read + // (larger than our internal buffer), bypass our internal buffer + // entirely. + if self.buf_pos == self.buf_cap && buf.len() >= self.buf.len() { + return self.skip_inner_buf(buf); + } + let nread = { + let mut rem = self.fill_inner_buf()?; + // copy the data from the inner buffer to the param buffer + rem.read(buf)? 
+ }; + // consume from buffer + self.buf_pos = cmp::min(self.buf_pos + nread, self.buf_cap); + + self.start += nread as u64; + Ok(nread) + } +} + +impl Position for FileSource2 { + fn pos(&self) -> u64 { + self.start + } +} + +impl Length for FileSource2 { + fn len(&self) -> u64 { + self.end - self.start + } +} + +impl ThreadSafeRead for FileSource2 {} From 908f4453bcab9252fc4c30fc124b1b4dbfccae8e Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Thu, 19 Aug 2021 00:19:32 +0800 Subject: [PATCH 13/16] fix new runtime get discard --- datafusion/Cargo.toml | 2 +- datafusion/src/datasource/mod.rs | 11 +++++++---- datafusion/src/datasource/object_store/local.rs | 2 +- datafusion/src/datasource/object_store/mod.rs | 4 ++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index db950c4956ce..839d622effe3 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -58,7 +58,7 @@ chrono = "0.4" async-trait = "0.1.41" futures = "0.3" pin-project-lite= "^0.2.0" -tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } +tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync", "fs"] } tokio-stream = "0.1" log = "^0.4" md-5 = { version = "^0.9.1", optional = true } diff --git a/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs index f79cae799558..da95c5b438a3 100644 --- a/datafusion/src/datasource/mod.rs +++ b/datafusion/src/datasource/mod.rs @@ -127,7 +127,7 @@ pub trait SourceRootDescBuilder: Sync + Send + Debug { provided_schema: Option, collect_statistics: bool, ) -> Result { - let handle = get_runtime_handle(); + let (handle, _rt) = get_runtime_handle(); let mut results: Vec> = Vec::new(); handle.block_on(async { match Self::get_source_desc_async( @@ -241,10 +241,13 @@ pub trait SourceRootDescBuilder: Sync + Send + Debug { ) -> Result; } -fn get_runtime_handle() -> Handle { +fn get_runtime_handle() -> (Handle, Option) { match Handle::try_current() { - Ok(h) => h, - Err(_) => Runtime::new().unwrap().handle().to_owned(), + Ok(h) => (h, None), + Err(_) => { + let rt = Runtime::new().unwrap(); + (rt.handle().clone(), Some(rt)) + } } } diff --git a/datafusion/src/datasource/object_store/local.rs b/datafusion/src/datasource/object_store/local.rs index 1c76c2eac118..274ab405b541 100644 --- a/datafusion/src/datasource/object_store/local.rs +++ b/datafusion/src/datasource/object_store/local.rs @@ -107,7 +107,7 @@ impl ObjectReader for LocalFSObjectReader { } fn list_all(root_path: String, ext: String) -> Result> { - let handle = get_runtime_handle(); + let (handle, _rt) = get_runtime_handle(); let mut file_results: Vec> = Vec::new(); handle.block_on(async { match list_all_async(root_path, ext).await { diff --git a/datafusion/src/datasource/object_store/mod.rs b/datafusion/src/datasource/object_store/mod.rs index d365ab34380d..58efa9ea3887 100644 --- a/datafusion/src/datasource/object_store/mod.rs +++ b/datafusion/src/datasource/object_store/mod.rs @@ -42,7 +42,7 @@ pub trait ThreadSafeRead: Read + Send + Sync + 'static {} pub trait ObjectReader { /// Get reader for a part [start, start + length] in the file fn get_reader(&self, start: u64, length: usize) -> Result> { - let handle = get_runtime_handle(); + let (handle, _rt) = get_runtime_handle(); handle.block_on(self.get_reader_async(start, length)) } @@ -55,7 +55,7 @@ pub trait ObjectReader { /// Get length for the file fn length(&self) -> Result { - let handle = get_runtime_handle(); + let (handle, _rt) = 
get_runtime_handle(); handle.block_on(self.length_async()) } From a9f9a5e3af39b77a73959852903740c89c9588b5 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Thu, 19 Aug 2021 01:34:59 +0800 Subject: [PATCH 14/16] Use futures::executor::block_on instead --- ballista/rust/scheduler/src/planner.rs | 12 +++++----- datafusion/Cargo.toml | 2 +- datafusion/src/datasource/mod.rs | 14 +---------- .../src/datasource/object_store/local.rs | 4 +--- datafusion/src/datasource/object_store/mod.rs | 7 ++---- datafusion/src/physical_plan/planner.rs | 24 +++++++++---------- 6 files changed, 23 insertions(+), 40 deletions(-) diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index 05025f282477..246a057fbf88 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -269,8 +269,8 @@ mod test { }; } - #[test] - fn distributed_hash_aggregate_plan() -> Result<(), BallistaError> { + #[tokio::test] + async fn distributed_hash_aggregate_plan() -> Result<(), BallistaError> { let mut ctx = datafusion_test_context("testdata")?; // simplified form of TPC-H query 1 @@ -352,8 +352,8 @@ mod test { Ok(()) } - #[test] - fn distributed_join_plan() -> Result<(), BallistaError> { + #[tokio::test] + async fn distributed_join_plan() -> Result<(), BallistaError> { let mut ctx = datafusion_test_context("testdata")?; // simplified form of TPC-H query 12 @@ -523,8 +523,8 @@ order by Ok(()) } - #[test] - fn roundtrip_serde_hash_aggregate() -> Result<(), BallistaError> { + #[tokio::test] + async fn roundtrip_serde_hash_aggregate() -> Result<(), BallistaError> { let mut ctx = datafusion_test_context("testdata")?; // simplified form of TPC-H query 1 diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 839d622effe3..9541e6d5a57a 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -56,7 +56,7 @@ paste = "^1.0" num_cpus = "1.13.0" chrono = "0.4" async-trait = "0.1.41" -futures = "0.3" +futures = { version = "0.3", features = ["executor"] } pin-project-lite= "^0.2.0" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync", "fs"] } tokio-stream = "0.1" diff --git a/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs index da95c5b438a3..199444cd9cd6 100644 --- a/datafusion/src/datasource/mod.rs +++ b/datafusion/src/datasource/mod.rs @@ -41,7 +41,6 @@ use futures::{Stream, StreamExt}; use std::fmt::Debug; use std::pin::Pin; use std::sync::Arc; -use tokio::runtime::{Handle, Runtime}; use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio_stream::wrappers::ReceiverStream; @@ -127,9 +126,8 @@ pub trait SourceRootDescBuilder: Sync + Send + Debug { provided_schema: Option, collect_statistics: bool, ) -> Result { - let (handle, _rt) = get_runtime_handle(); let mut results: Vec> = Vec::new(); - handle.block_on(async { + futures::executor::block_on(async { match Self::get_source_desc_async( path, object_store, @@ -241,16 +239,6 @@ pub trait SourceRootDescBuilder: Sync + Send + Debug { ) -> Result; } -fn get_runtime_handle() -> (Handle, Option) { - match Handle::try_current() { - Ok(h) => (h, None), - Err(_) => { - let rt = Runtime::new().unwrap(); - (rt.handle().clone(), Some(rt)) - } - } -} - /// Get all files as well as the summary statistics when a limit is provided pub fn get_statistics_with_limit( source_desc: &SourceRootDescriptor, diff --git a/datafusion/src/datasource/object_store/local.rs b/datafusion/src/datasource/object_store/local.rs index 274ab405b541..4244baf6a12b 100644 --- 
a/datafusion/src/datasource/object_store/local.rs +++ b/datafusion/src/datasource/object_store/local.rs @@ -16,7 +16,6 @@ // under the License. //! Object store that represents the Local File System. -use crate::datasource::get_runtime_handle; use crate::datasource::object_store::{ FileNameStream, ObjectReader, ObjectStore, ThreadSafeRead, }; @@ -107,9 +106,8 @@ impl ObjectReader for LocalFSObjectReader { } fn list_all(root_path: String, ext: String) -> Result> { - let (handle, _rt) = get_runtime_handle(); let mut file_results: Vec> = Vec::new(); - handle.block_on(async { + futures::executor::block_on(async { match list_all_async(root_path, ext).await { Ok(mut stream) => { while let Some(result) = stream.next().await { diff --git a/datafusion/src/datasource/object_store/mod.rs b/datafusion/src/datasource/object_store/mod.rs index 58efa9ea3887..03f51ce42592 100644 --- a/datafusion/src/datasource/object_store/mod.rs +++ b/datafusion/src/datasource/object_store/mod.rs @@ -31,7 +31,6 @@ use futures::Stream; use local::LocalFileSystem; -use crate::datasource::get_runtime_handle; use crate::error::Result; /// Thread safe read @@ -42,8 +41,7 @@ pub trait ThreadSafeRead: Read + Send + Sync + 'static {} pub trait ObjectReader { /// Get reader for a part [start, start + length] in the file fn get_reader(&self, start: u64, length: usize) -> Result> { - let (handle, _rt) = get_runtime_handle(); - handle.block_on(self.get_reader_async(start, length)) + futures::executor::block_on(self.get_reader_async(start, length)) } /// Get reader for a part [start, start + length] in the file asynchronously @@ -55,8 +53,7 @@ pub trait ObjectReader { /// Get length for the file fn length(&self) -> Result { - let (handle, _rt) = get_runtime_handle(); - handle.block_on(self.length_async()) + futures::executor::block_on(self.length_async()) } /// Get length for the file asynchronously diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 02ab15d1a652..9d4673c15ff8 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -1399,8 +1399,8 @@ mod tests { planner.create_physical_plan(logical_plan, &ctx_state) } - #[test] - fn test_all_operators() -> Result<()> { + #[tokio::test] + async fn test_all_operators() -> Result<()> { let testdata = crate::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); @@ -1444,8 +1444,8 @@ mod tests { Ok(()) } - #[test] - fn test_with_csv_plan() -> Result<()> { + #[tokio::test] + async fn test_with_csv_plan() -> Result<()> { let testdata = crate::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); @@ -1463,8 +1463,8 @@ mod tests { Ok(()) } - #[test] - fn errors() -> Result<()> { + #[tokio::test] + async fn errors() -> Result<()> { let testdata = crate::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -1565,8 +1565,8 @@ mod tests { } } - #[test] - fn in_list_types() -> Result<()> { + #[tokio::test] + async fn in_list_types() -> Result<()> { let testdata = crate::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -1612,8 +1612,8 @@ mod tests { Ok(()) } - #[test] - fn hash_agg_input_schema() -> Result<()> { + #[tokio::test] + async fn hash_agg_input_schema() -> Result<()> { let testdata = 
crate::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); @@ -1635,8 +1635,8 @@ mod tests { Ok(()) } - #[test] - fn hash_agg_group_by_partitioned() -> Result<()> { + #[tokio::test] + async fn hash_agg_group_by_partitioned() -> Result<()> { let testdata = crate::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); From 7d894ccda6091412c35a073f9686acd33159109c Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Thu, 19 Aug 2021 13:56:05 +0800 Subject: [PATCH 15/16] fix doc test --- datafusion/src/dataframe.rs | 33 ++++++++++------ datafusion/src/datasource/csv.rs | 3 ++ .../src/datasource/object_store/local.rs | 38 ++++++++++--------- datafusion/src/datasource/parquet.rs | 16 ++++---- datafusion/src/execution/context.rs | 8 ++-- datafusion/src/physical_plan/mod.rs | 4 ++ datafusion/src/physical_plan/parquet.rs | 2 +- datafusion/tests/parquet_pruning.rs | 36 +++++++++--------- datafusion/tests/sql.rs | 4 +- 9 files changed, 84 insertions(+), 60 deletions(-) diff --git a/datafusion/src/dataframe.rs b/datafusion/src/dataframe.rs index 608f6dbcaf17..45727f4160f7 100644 --- a/datafusion/src/dataframe.rs +++ b/datafusion/src/dataframe.rs @@ -41,7 +41,8 @@ use async_trait::async_trait; /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; -/// # fn main() -> Result<()> { +/// #[tokio::main] +/// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; /// let df = df.filter(col("a").lt_eq(col("b")))? @@ -59,7 +60,8 @@ pub trait DataFrame: Send + Sync { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; - /// # fn main() -> Result<()> { + /// #[tokio::main] + /// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; /// let df = df.select_columns(&["a", "b"])?; @@ -73,7 +75,8 @@ pub trait DataFrame: Send + Sync { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; - /// # fn main() -> Result<()> { + /// #[tokio::main] + /// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; /// let df = df.select(vec![col("a") * col("b"), col("c")])?; @@ -87,7 +90,8 @@ pub trait DataFrame: Send + Sync { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; - /// # fn main() -> Result<()> { + /// #[tokio::main] + /// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; /// let df = df.filter(col("a").lt_eq(col("b")))?; @@ -101,7 +105,8 @@ pub trait DataFrame: Send + Sync { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; - /// # fn main() -> Result<()> { + /// #[tokio::main] + /// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; /// @@ -124,7 +129,8 @@ pub trait DataFrame: Send + Sync { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; - /// # fn main() -> Result<()> { + /// #[tokio::main] + /// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; /// let df = df.limit(100)?; @@ -138,7 +144,8 @@ pub trait 
DataFrame: Send + Sync { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; - /// # fn main() -> Result<()> { + /// #[tokio::main] + /// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; /// let df = df.union(df.clone())?; @@ -153,7 +160,8 @@ pub trait DataFrame: Send + Sync { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; - /// # fn main() -> Result<()> { + /// #[tokio::main] + /// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; /// let df = df.sort(vec![col("a").sort(true, true), col("b").sort(false, false)])?; @@ -196,7 +204,8 @@ pub trait DataFrame: Send + Sync { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; - /// # fn main() -> Result<()> { + /// #[tokio::main] + /// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; /// let df1 = df.repartition(Partitioning::RoundRobinBatch(4))?; @@ -275,7 +284,8 @@ pub trait DataFrame: Send + Sync { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; - /// # fn main() -> Result<()> { + /// #[tokio::main] + /// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; /// let schema = df.schema(); @@ -309,7 +319,8 @@ pub trait DataFrame: Send + Sync { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; - /// # fn main() -> Result<()> { + /// #[tokio::main] + /// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; /// let f = df.registry(); diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index 160556cd4da7..8e55180ea829 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -24,6 +24,8 @@ //! ``` //! use datafusion::datasource::TableProvider; //! use datafusion::datasource::csv::{CsvFile, CsvReadOptions}; +//! #[tokio::main] +//! # async fn main() { //! //! let testdata = datafusion::test_util::arrow_test_data(); //! let csvdata = CsvFile::try_new( @@ -31,6 +33,7 @@ //! CsvReadOptions::new().delimiter(b'|'), //! ).unwrap(); //! let schema = csvdata.schema(); +//! # } //! 
``` use arrow::datatypes::SchemaRef; diff --git a/datafusion/src/datasource/object_store/local.rs b/datafusion/src/datasource/object_store/local.rs index 4244baf6a12b..36bb7e29de40 100644 --- a/datafusion/src/datasource/object_store/local.rs +++ b/datafusion/src/datasource/object_store/local.rs @@ -145,23 +145,27 @@ async fn list_all_async(root_path: String, ext: String) -> Result None, - Some(path) => { - let file_stream = - match find_files_in_dir(path, &mut to_visit, ext).await { - Ok(files) => stream::iter(files).map(Ok).left_stream(), - Err(e) => stream::once(async { Err(e) }).right_stream(), - }; - - Some((file_stream, to_visit)) + if tokio::fs::metadata(&root_path).await?.is_file() { + Ok(Box::pin(stream::once(async { Ok(root_path) }))) + } else { + let result = stream::unfold(vec![root_path], move |mut to_visit| { + let ext = ext.clone(); + async move { + match to_visit.pop() { + None => None, + Some(path) => { + let file_stream = + match find_files_in_dir(path, &mut to_visit, ext).await { + Ok(files) => stream::iter(files).map(Ok).left_stream(), + Err(e) => stream::once(async { Err(e) }).right_stream(), + }; + + Some((file_stream, to_visit)) + } } } - } - }) - .flatten(); - Ok(Box::pin(result)) + }) + .flatten(); + Ok(Box::pin(result)) + } } diff --git a/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs index bfc893bf8cc7..1a8733c0ac97 100644 --- a/datafusion/src/datasource/parquet.rs +++ b/datafusion/src/datasource/parquet.rs @@ -493,7 +493,7 @@ mod tests { use arrow::record_batch::RecordBatch; use futures::StreamExt; - #[tokio::test] + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn read_small_batches() -> Result<()> { let table = load_table("alltypes_plain.parquet")?; let projection = None; @@ -516,7 +516,7 @@ mod tests { Ok(()) } - #[tokio::test] + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn read_alltypes_plain_parquet() -> Result<()> { let table = load_table("alltypes_plain.parquet")?; @@ -551,7 +551,7 @@ mod tests { Ok(()) } - #[tokio::test] + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn read_bool_alltypes_plain_parquet() -> Result<()> { let table = load_table("alltypes_plain.parquet")?; let projection = Some(vec![1]); @@ -578,7 +578,7 @@ mod tests { Ok(()) } - #[tokio::test] + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn read_i32_alltypes_plain_parquet() -> Result<()> { let table = load_table("alltypes_plain.parquet")?; let projection = Some(vec![0]); @@ -602,7 +602,7 @@ mod tests { Ok(()) } - #[tokio::test] + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn read_i96_alltypes_plain_parquet() -> Result<()> { let table = load_table("alltypes_plain.parquet")?; let projection = Some(vec![10]); @@ -626,7 +626,7 @@ mod tests { Ok(()) } - #[tokio::test] + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn read_f32_alltypes_plain_parquet() -> Result<()> { let table = load_table("alltypes_plain.parquet")?; let projection = Some(vec![6]); @@ -653,7 +653,7 @@ mod tests { Ok(()) } - #[tokio::test] + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn read_f64_alltypes_plain_parquet() -> Result<()> { let table = load_table("alltypes_plain.parquet")?; let projection = Some(vec![7]); @@ -680,7 +680,7 @@ mod tests { Ok(()) } - #[tokio::test] + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn read_binary_alltypes_plain_parquet() -> Result<()> { let table = load_table("alltypes_plain.parquet")?; let 
projection = Some(vec![9]); diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 91fe9cddc657..6a41f50ab0a1 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -97,7 +97,8 @@ use parquet::file::properties::WriterProperties; /// ``` /// use datafusion::prelude::*; /// # use datafusion::error::Result; -/// # fn main() -> Result<()> { +/// #[tokio::main] +/// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; /// let df = df.filter(col("a").lt_eq(col("b")))? @@ -114,7 +115,8 @@ use parquet::file::properties::WriterProperties; /// use datafusion::prelude::*; /// /// # use datafusion::error::Result; -/// # fn main() -> Result<()> { +/// #[tokio::main] +/// # async fn main() -> Result<()> { /// let mut ctx = ExecutionContext::new(); /// ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?; /// let results = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?; @@ -2708,7 +2710,7 @@ mod tests { Ok(()) } - #[tokio::test] + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn write_parquet_results() -> Result<()> { // create partitioned input file and context let tmp_dir = TempDir::new()?; diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 8f7db72484c9..ecbcd054b105 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -217,6 +217,9 @@ pub trait ExecutionPlan: Debug + Send + Sync { /// use datafusion::prelude::*; /// use datafusion::physical_plan::displayable; /// +/// #[tokio::main] +/// # async fn main() { +/// /// // Hard code concurrency as it appears in the RepartitionExec output /// let config = ExecutionConfig::new() /// .with_concurrency(3); @@ -242,6 +245,7 @@ pub trait ExecutionPlan: Debug + Send + Sync { /// \n RepartitionExec: partitioning=RoundRobinBatch(3)\ /// \n CsvExec: source=Path(tests/example.csv: [tests/example.csv]), has_header=true", /// plan_string.trim()); +/// # } /// ``` /// pub fn displayable(plan: &dyn ExecutionPlan) -> DisplayableExecutionPlan<'_> { diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index 6eca2c396e06..4b205fab0b6c 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -667,7 +667,7 @@ mod tests { assert_eq!(1, chunks[4].len()); } - #[tokio::test] + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test() -> Result<()> { let testdata = crate::test_util::parquet_test_data(); let filename = format!("{}/alltypes_plain.parquet", testdata); diff --git a/datafusion/tests/parquet_pruning.rs b/datafusion/tests/parquet_pruning.rs index 789f0810c983..99e19a4789fb 100644 --- a/datafusion/tests/parquet_pruning.rs +++ b/datafusion/tests/parquet_pruning.rs @@ -41,7 +41,7 @@ use hashbrown::HashMap; use parquet::{arrow::ArrowWriter, file::properties::WriterProperties}; use tempfile::NamedTempFile; -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_timestamps_nanos() { let output = ContextWithParquet::new(Scenario::Timestamps) .await @@ -54,7 +54,7 @@ async fn prune_timestamps_nanos() { assert_eq!(output.result_rows, 10, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_timestamps_micros() { let output = 
ContextWithParquet::new(Scenario::Timestamps) .await @@ -69,7 +69,7 @@ async fn prune_timestamps_micros() { assert_eq!(output.result_rows, 10, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_timestamps_millis() { let output = ContextWithParquet::new(Scenario::Timestamps) .await @@ -84,7 +84,7 @@ async fn prune_timestamps_millis() { assert_eq!(output.result_rows, 10, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_timestamps_seconds() { let output = ContextWithParquet::new(Scenario::Timestamps) .await @@ -99,7 +99,7 @@ async fn prune_timestamps_seconds() { assert_eq!(output.result_rows, 10, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_date32() { let output = ContextWithParquet::new(Scenario::Dates) .await @@ -112,7 +112,7 @@ async fn prune_date32() { assert_eq!(output.result_rows, 1, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_date64() { // work around for not being able to cast Date32 to Date64 automatically let date = "2020-01-02" @@ -137,7 +137,7 @@ async fn prune_date64() { assert_eq!(output.result_rows, 1, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_disabled() { let query = "SELECT * FROM t where nanos < to_timestamp('2020-01-02 01:01:11Z')"; let expected_rows = 10; @@ -178,7 +178,7 @@ async fn prune_disabled() { ); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_int32_lt() { let (expected_errors, expected_row_group_pruned, expected_results) = (Some(0), Some(1), 11); @@ -218,7 +218,7 @@ async fn prune_int32_lt() { ); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_int32_eq() { // resulrt of sql "SELECT * FROM t where i = 1" let output = ContextWithParquet::new(Scenario::Int32) @@ -233,7 +233,7 @@ async fn prune_int32_eq() { assert_eq!(output.result_rows, 1, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_int32_scalar_fun_and_eq() { // resulrt of sql "SELECT * FROM t where abs(i) = 1 and i = 1" // only use "i = 1" to prune @@ -249,7 +249,7 @@ async fn prune_int32_scalar_fun_and_eq() { assert_eq!(output.result_rows, 1, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_int32_scalar_fun() { // resulrt of sql "SELECT * FROM t where abs(i) = 1" is not supported let output = ContextWithParquet::new(Scenario::Int32) @@ -265,7 +265,7 @@ async fn prune_int32_scalar_fun() { assert_eq!(output.result_rows, 3, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_int32_complex_expr() { // resulrt of sql "SELECT * FROM t where i+1 = 1" is not supported let output = ContextWithParquet::new(Scenario::Int32) @@ -281,7 +281,7 @@ async fn prune_int32_complex_expr() { assert_eq!(output.result_rows, 2, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_int32_complex_expr_subtract() { // resulrt of sql "SELECT * FROM t where 1-i > 1" is not supported let output = ContextWithParquet::new(Scenario::Int32) @@ -297,7 +297,7 @@ async fn 
prune_int32_complex_expr_subtract() { assert_eq!(output.result_rows, 9, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_f64_lt() { let (expected_errors, expected_row_group_pruned, expected_results) = (Some(0), Some(1), 11); @@ -337,7 +337,7 @@ async fn prune_f64_lt() { ); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_f64_scalar_fun_and_gt() { // resulrt of sql "SELECT * FROM t where abs(f - 1) <= 0.000001 and f >= 0.1" // only use "f >= 0" to prune @@ -353,7 +353,7 @@ async fn prune_f64_scalar_fun_and_gt() { assert_eq!(output.result_rows, 1, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_f64_scalar_fun() { // resulrt of sql "SELECT * FROM t where abs(f-1) <= 0.000001" is not supported let output = ContextWithParquet::new(Scenario::Float64) @@ -369,7 +369,7 @@ async fn prune_f64_scalar_fun() { assert_eq!(output.result_rows, 1, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_f64_complex_expr() { // resulrt of sql "SELECT * FROM t where f+1 > 1.1"" is not supported let output = ContextWithParquet::new(Scenario::Float64) @@ -385,7 +385,7 @@ async fn prune_f64_complex_expr() { assert_eq!(output.result_rows, 9, "{}", output.description()); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn prune_f64_complex_expr_subtract() { // resulrt of sql "SELECT * FROM t where 1-f > 1" is not supported let output = ContextWithParquet::new(Scenario::Float64) diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 0f385680deed..b70ce155b6f3 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -110,7 +110,7 @@ async fn nyc() -> Result<()> { Ok(()) } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn parquet_query() { let mut ctx = ExecutionContext::new(); register_alltypes_parquet(&mut ctx); @@ -136,7 +136,7 @@ async fn parquet_query() { assert_batches_eq!(expected, &actual); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn parquet_single_nan_schema() { let mut ctx = ExecutionContext::new(); let testdata = datafusion::test_util::parquet_test_data(); From f6239b57d09ef68e90431bdffd93bd13296745e9 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Thu, 19 Aug 2021 14:07:59 +0800 Subject: [PATCH 16/16] fix fmt --- datafusion/src/datasource/object_store/local.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/src/datasource/object_store/local.rs b/datafusion/src/datasource/object_store/local.rs index 36bb7e29de40..47f455280d5b 100644 --- a/datafusion/src/datasource/object_store/local.rs +++ b/datafusion/src/datasource/object_store/local.rs @@ -165,7 +165,7 @@ async fn list_all_async(root_path: String, ext: String) -> Result
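// Illustrative sketch, not from the patch: the sync-over-async bridge that the default
// `ObjectReader::get_reader` and `length` methods use after the switch to
// `futures::executor::block_on` above. The tests are presumably moved to
// `#[tokio::test(flavor = "multi_thread", worker_threads = 2)]` so that a worker thread
// blocked inside such a bridge cannot starve the spawned tasks it is waiting on.
async fn length_async_demo() -> u64 {
    // stand-in for an asynchronous object store metadata call
    42
}

fn length_demo() -> u64 {
    // drive the async version to completion from synchronous code
    futures::executor::block_on(length_async_demo())
}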