From a5bee79a635b2cc58ec3993391002c320cf54058 Mon Sep 17 00:00:00 2001
From: Kent Yao <yao@apache.org>
Date: Fri, 3 Dec 2021 15:23:31 +0800
Subject: [PATCH] [SPARK-37530][Core] Spark reads many paths very slow though
 newAPIHadoopFile

---
 core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
index 7fdbe7b971ac..c6959a5a4daf 100644
--- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
@@ -123,6 +123,10 @@ class NewHadoopRDD[K, V](
   override def getPartitions: Array[Partition] = {
     val inputFormat = inputFormatClass.getConstructor().newInstance()
+    // setMinPartitions below will call FileInputFormat.listStatus(), which can be quite slow when
+    // traversing a large number of directories and files. Parallelize it.
+    _conf.setIfUnset(FileInputFormat.LIST_STATUS_NUM_THREADS,
+      Runtime.getRuntime.availableProcessors().toString)
     inputFormat match {
       case configurable: Configurable =>
         configurable.setConf(_conf)