project/MimaExcludes.scala (6 additions, 0 deletions)

@@ -111,6 +111,12 @@ object MimaExcludes {
           "org.apache.spark.sql.parquet.ParquetRelation2$PartitionValues"),
         ProblemFilters.exclude[MissingClassProblem](
           "org.apache.spark.sql.parquet.ParquetRelation2$PartitionValues$"),
+        ProblemFilters.exclude[MissingClassProblem](
+          "org.apache.spark.sql.parquet.ParquetRelation2"),
+        ProblemFilters.exclude[MissingClassProblem](
+          "org.apache.spark.sql.parquet.ParquetRelation2$"),
+        ProblemFilters.exclude[MissingClassProblem](
+          "org.apache.spark.sql.parquet.ParquetRelation2$MetadataCache"),
         // These test support classes were moved out of src/main and into src/test:
         ProblemFilters.exclude[MissingClassProblem](
           "org.apache.spark.sql.parquet.ParquetTestData"),
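For context, these entries feed the MiMa binary-compatibility check that runs against the previous Spark release; each filter suppresses one expected report. A minimal sketch of the mechanism (the val name is illustrative, and the assumption is that ParquetRelation2 was internal to Spark SQL, which is why excluding its removal is safe):

    import com.typesafe.tools.mima.core.{MissingClassProblem, ProblemFilters}

    // Each exclude names a problem type and a fully-qualified class name;
    // MiMa then skips that report when comparing the two artifact versions.
    val parquetExcludes = Seq(
      ProblemFilters.exclude[MissingClassProblem](
        "org.apache.spark.sql.parquet.ParquetRelation2"),
      ProblemFilters.exclude[MissingClassProblem](
        "org.apache.spark.sql.parquet.ParquetRelation2$"))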
sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala

@@ -27,9 +27,11 @@ import scala.reflect.runtime.universe.TypeTag
 import scala.util.control.NonFatal
 
 import com.google.common.reflect.TypeToken
+import org.apache.hadoop.fs.Path
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._

@@ -42,6 +44,7 @@ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection, e
 import org.apache.spark.sql.execution.{Filter, _}
 import org.apache.spark.sql.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation}
 import org.apache.spark.sql.json._
+import org.apache.spark.sql.parquet.FSBasedParquetRelation
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -641,7 +644,10 @@ class SQLContext(@transient val sparkContext: SparkContext)
     if (paths.isEmpty) {
       emptyDataFrame
     } else if (conf.parquetUseDataSourceApi) {
-      baseRelationToDataFrame(parquet.ParquetRelation2(paths, Map.empty)(this))
+      val globbedPaths = paths.map(new Path(_)).flatMap(SparkHadoopUtil.get.globPath).toArray
+      baseRelationToDataFrame(
+        new FSBasedParquetRelation(
+          globbedPaths.map(_.toString), None, None, Map.empty[String, String])(this))
     } else {
       DataFrame(this, parquet.ParquetRelation(
         paths.mkString(","), Some(sparkContext.hadoopConfiguration), this))
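The new code expands each user-supplied path through Hadoop's glob machinery before handing the concrete paths to FSBasedParquetRelation. A minimal sketch of what that expansion does (the /data paths are hypothetical):

    import org.apache.hadoop.fs.Path
    import org.apache.spark.deploy.SparkHadoopUtil

    // One literal path and one glob pattern; globPath expands a pattern via
    // FileSystem.globStatus, so a literal path that exists comes back as a
    // single-element Seq and a pattern with no matches comes back empty.
    val paths = Seq("/data/users.parquet", "/data/logs/part-*.parquet")
    val globbedPaths = paths.map(new Path(_)).flatMap(SparkHadoopUtil.get.globPath)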
sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala (167 additions, 111 deletions)

Large diffs are not rendered by default.
@@ -674,7 +674,7 @@ private[parquet] object FileSystemHelper {
   def findMaxTaskId(pathStr: String, conf: Configuration): Int = {
     val files = FileSystemHelper.listFiles(pathStr, conf)
     // filename pattern is part-r-<int>.parquet
-    val nameP = new scala.util.matching.Regex("""part-r-(\d{1,}).parquet""", "taskid")
+    val nameP = new scala.util.matching.Regex("""part-.-(\d{1,}).*""", "taskid")
Review comment (Contributor Author): The original code doesn't handle file names like part-r-00001.gz.parquet.
     val hiddenFileP = new scala.util.matching.Regex("_.*")
     files.map(_.getName).map {
       case nameP(taskid) => taskid.toInt
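To illustrate the review comment above, a quick check of the old and new patterns against some representative file names (the names and the maxTaskId helper are illustrative):

    val oldP = new scala.util.matching.Regex("""part-r-(\d{1,}).parquet""", "taskid")
    val newP = new scala.util.matching.Regex("""part-.-(\d{1,}).*""", "taskid")

    // Mirror the lookup in findMaxTaskId: non-matching names contribute 0.
    def maxTaskId(p: scala.util.matching.Regex, names: Seq[String]): Int =
      names.map {
        case p(taskid) => taskid.toInt
        case _ => 0
      }.max

    val names = Seq("part-r-00001.parquet", "part-r-00002.gz.parquet", "_SUCCESS")
    maxTaskId(oldP, names)  // 1 -- the .gz.parquet file falls through to 0
    maxTaskId(newP, names)  // 2 -- both data files match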