@@ -22,7 +22,7 @@ import java.util.{List => JList}
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable
-import scala.util.Try
+import scala.util.{Failure, Try}
 
 import com.google.common.base.Objects
 import org.apache.hadoop.fs.{FileStatus, Path}
@@ -31,12 +31,11 @@ import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 import org.apache.parquet.filter2.predicate.FilterApi
 import org.apache.parquet.hadoop._
-import org.apache.parquet.hadoop.metadata.{FileMetaData, CompressionCodecName}
+import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.apache.parquet.hadoop.util.ContextUtil
 import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.broadcast.Broadcast
-import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD
 import org.apache.spark.rdd.RDD._
 import org.apache.spark.sql._
@@ -278,19 +277,13 @@ private[sql] class ParquetRelation2(
     // Create the function to set input paths at the driver side.
     val setInputPaths = ParquetRelation2.initializeDriverSideJobFunc(inputFiles) _
 
-    val footers = inputFiles.map(f => metadataCache.footers(f.getPath))
-
     Utils.withDummyCallSite(sqlContext.sparkContext) {
-      // TODO Stop using `FilteringParquetRowInputFormat` and overriding `getPartition`.
-      // After upgrading to Parquet 1.6.0, we should be able to stop caching `FileStatus` objects
-      // and footers. Especially when a global arbitrative schema (either from metastore or data
-      // source DDL) is available.
       new SqlNewHadoopRDD(
         sc = sqlContext.sparkContext,
         broadcastedConf = broadcastedConf,
         initDriverSideJobFuncOpt = Some(setInputPaths),
         initLocalJobFuncOpt = Some(initLocalJobFuncOpt),
-        inputFormatClass = classOf[FilteringParquetRowInputFormat],
+        inputFormatClass = classOf[ParquetInputFormat[InternalRow]],
         keyClass = classOf[Void],
         valueClass = classOf[InternalRow]) {
 
@@ -306,12 +299,6 @@ private[sql] class ParquetRelation2(
             f.getAccessTime, f.getPermission, f.getOwner, f.getGroup, pathWithEscapedAuthority)
         }.toSeq
 
-        @transient val cachedFooters = footers.map { f =>
-          // In order to encode the authority of a Path containing special characters such as /,
-          // we need to use the string returned by the URI of the path to create a new Path.
-          new Footer(escapePathUserInfo(f.getFile), f.getParquetMetadata)
-        }.toSeq
-
         private def escapePathUserInfo(path: Path): Path = {
           val uri = path.toUri
           new Path(new URI(
@@ -321,13 +308,10 @@ private[sql] class ParquetRelation2(
 
         // Overridden so we can inject our own cached file statuses.
         override def getPartitions: Array[SparkPartition] = {
-          val inputFormat = if (cacheMetadata) {
-            new FilteringParquetRowInputFormat {
-              override def listStatus(jobContext: JobContext): JList[FileStatus] = cachedStatuses
-              override def getFooters(jobContext: JobContext): JList[Footer] = cachedFooters
+          val inputFormat = new ParquetInputFormat[InternalRow] {
+            override def listStatus(jobContext: JobContext): JList[FileStatus] = {
+              if (cacheMetadata) cachedStatuses else super.listStatus(jobContext)
             }
-          } else {
-            new FilteringParquetRowInputFormat
           }
 
           val jobContext = newJobContext(getConf(isDriverSide = true), jobId)
@@ -348,9 +332,6 @@ private[sql] class ParquetRelation2(
     // `FileStatus` objects of all "_common_metadata" files.
     private var commonMetadataStatuses: Array[FileStatus] = _
 
-    // Parquet footer cache.
-    var footers: Map[Path, Footer] = _
-
     // `FileStatus` objects of all data files (Parquet part-files).
     var dataStatuses: Array[FileStatus] = _
 
@@ -376,20 +357,6 @@ private[sql] class ParquetRelation2(
       commonMetadataStatuses =
         leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)
 
-      footers = {
-        val conf = SparkHadoopUtil.get.conf
-        val taskSideMetaData = conf.getBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true)
-        val rawFooters = if (shouldMergeSchemas) {
-          ParquetFileReader.readAllFootersInParallel(
-            conf, seqAsJavaList(leaves), taskSideMetaData)
-        } else {
-          ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(
-            conf, seqAsJavaList(leaves), taskSideMetaData)
-        }
-
-        rawFooters.map(footer => footer.getFile -> footer).toMap
-      }
-
       // If we already have the schema, we don't need to re-compute it, since schema merging is
       // time-consuming.
       if (dataSchema == null) {
@@ -422,7 +389,7 @@ private[sql] class ParquetRelation2(
       // Always tries the summary files first if users don't require a merged schema. In this case,
       // "_common_metadata" is preferable to "_metadata" because it doesn't contain row
       // group information, and could be much smaller for large Parquet files with lots of row
-      // groups.
+      // groups. If no summary file is available, falls back to an arbitrary part-file.
       //
       // NOTE: Metadata stored in the summary files is merged from all part-files. However, for
       // user-defined key-value metadata (in which we store the Spark SQL schema), Parquet doesn't know
@@ -457,10 +424,10 @@ private[sql] class ParquetRelation2(
 
       assert(
         filesToTouch.nonEmpty || maybeDataSchema.isDefined || maybeMetastoreSchema.isDefined,
-        "No schema defined, " +
-          s"and no Parquet data file or summary file found under ${paths.mkString(", ")}.")
+        "No predefined schema found, " +
+          s"and no Parquet data files or summary files found under ${paths.mkString(", ")}.")
 
-      ParquetRelation2.readSchema(filesToTouch.map(f => footers.apply(f.getPath)), sqlContext)
+      ParquetRelation2.mergeSchemasInParallel(filesToTouch, sqlContext)
     }
   }
 }
@@ -519,6 +486,7 @@ private[sql] object ParquetRelation2 extends Logging {
   private[parquet] def initializeDriverSideJobFunc(
       inputFiles: Array[FileStatus])(job: Job): Unit = {
     // We set the input paths at the driver side.
+    logInfo(s"Reading Parquet file(s) from ${inputFiles.map(_.getPath).mkString(", ")}")
     if (inputFiles.nonEmpty) {
       FileInputFormat.setInputPaths(job, inputFiles.map(_.getPath): _*)
     }
@@ -543,7 +511,7 @@ private[sql] object ParquetRelation2 extends Logging {
         .getKeyValueMetaData
         .toMap
         .get(RowReadSupport.SPARK_METADATA_KEY)
-      if (serializedSchema == None) {
+      if (serializedSchema.isEmpty) {
         // Falls back to Parquet schema if no Spark SQL schema found.
         Some(parseParquetSchema(metadata.getSchema))
       } else if (!seen.contains(serializedSchema.get)) {
@@ -646,4 +614,106 @@ private[sql] object ParquetRelation2 extends Logging {
       .filter(_.nullable)
     StructType(parquetSchema ++ missingFields)
   }
+
+  /**
+   * Figures out a merged Parquet schema with a distributed Spark job.
+   *
+   * Note that locality is not taken into consideration here because:
+   *
+   *  1. For a single Parquet part-file, in most cases the footer only resides in the last block of
+   *     that file. Thus we only need to retrieve the location of the last block. However, Hadoop
+   *     `FileSystem` only provides an API to retrieve locations of all blocks, which can be
+   *     potentially expensive.
+   *
+   *  2. This optimization is mainly useful for S3, where file metadata operations can be pretty
+   *     slow. Besides, locality is basically not available when using S3 (you can't run
+   *     computation on S3 nodes).
+   */
+  def mergeSchemasInParallel(
+      filesToTouch: Seq[FileStatus], sqlContext: SQLContext): Option[StructType] = {
+    val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString
+    val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp
+    val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec
+    val serializedConf = new SerializableConfiguration(sqlContext.sparkContext.hadoopConfiguration)
+
+    // HACK ALERT:
+    //
+    // Parquet requires `FileStatus`es to read footers. Here we try to send cached `FileStatus`es
+    // to the executor side to avoid fetching them again. However, `FileStatus` is not
+    // `Serializable` but only `Writable`. What makes it worse, for some reason `FileStatus`
+    // doesn't play well with `SerializableWritable[T]` and always causes a weird
+    // `IllegalStateException`. These facts virtually prevent us from serializing `FileStatus`es.
+    //
+    // Since Parquet only relies on the path and length information of those `FileStatus`es to
+    // read footers, here we just extract them (which can be easily serialized), send them to the
+    // executor side, and reconstruct fake `FileStatus`es there.
+    val partialFileStatusInfo = filesToTouch.map(f => (f.getPath.toString, f.getLen))
+
+    // Issues a Spark job to read Parquet schemas in parallel.
+    val partiallyMergedSchemas =
+      sqlContext
+        .sparkContext
+        .parallelize(partialFileStatusInfo)
+        .mapPartitions { iterator =>
+          // Reconstructs fake `FileStatus`es from the serialized path and length information.
+          val fakeFileStatuses = iterator.map { case (path, length) =>
+            new FileStatus(length, false, 0, 0, 0, 0, null, null, null, new Path(path))
+          }.toSeq
+
+          // Skips row group information since we only need the schema
+          val skipRowGroups = true
+
+          // Reads footers in a multi-threaded manner within each task
+          val footers =
+            ParquetFileReader.readAllFootersInParallel(
+              serializedConf.value, fakeFileStatuses, skipRowGroups)
+
+          // Converter used to convert Parquet `MessageType` to Spark SQL `StructType`
+          val converter =
+            new CatalystSchemaConverter(
+              assumeBinaryIsString = assumeBinaryIsString,
+              assumeInt96IsTimestamp = assumeInt96IsTimestamp,
+              followParquetFormatSpec = followParquetFormatSpec)
+
+          footers.map { footer =>
+            ParquetRelation2.readSchemaFromFooter(footer, converter)
+          }.reduceOption(_ merge _).iterator
+        }.collect()
+
+    partiallyMergedSchemas.reduceOption(_ merge _)
+  }
+
+  /**
+   * Reads the Spark SQL schema from a Parquet footer. If a valid serialized Spark SQL schema
+   * string can be found in the file metadata, returns the deserialized [[StructType]]; otherwise,
+   * returns a [[StructType]] converted from the [[MessageType]] stored in this footer.
+   */
+  def readSchemaFromFooter(
+      footer: Footer, converter: CatalystSchemaConverter): StructType = {
+    val fileMetaData = footer.getParquetMetadata.getFileMetaData
+    fileMetaData
+      .getKeyValueMetaData
+      .toMap
+      .get(RowReadSupport.SPARK_METADATA_KEY)
+      .flatMap(deserializeSchemaString)
+      .getOrElse(converter.convert(fileMetaData.getSchema))
+  }
+
+  private def deserializeSchemaString(schemaString: String): Option[StructType] = {
+    // Tries to deserialize the schema string as JSON first, then falls back to the case class
+    // string parser (data generated by older versions of Spark SQL uses this format).
+    Try(DataType.fromJson(schemaString).asInstanceOf[StructType]).recover {
+      case _: Throwable =>
+        logInfo(
+          "Serialized Spark schema in Parquet key-value metadata is not in JSON format, " +
+            "falling back to the deprecated DataType.fromCaseClassString parser.")
+        DataType.fromCaseClassString(schemaString).asInstanceOf[StructType]
+    }.recoverWith {
+      case cause: Throwable =>
+        logWarning(
+          "Failed to parse serialized Spark schema in Parquet key-value metadata; " +
+            s"ignoring it:\n\t$schemaString", cause)
+        Failure(cause)
+    }.toOption
+  }
 }
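
The mergeSchemasInParallel method added above merges schemas in two stages: each task reduces the schemas read from the footers in its partition, and the driver then reduces the per-partition results. The stand-alone sketch below shows the same mapPartitions-then-reduce shape; it is only an illustration (not the committed code), and it models schemas as plain field-name sets so the example stays self-contained.

import org.apache.spark.{SparkConf, SparkContext}

// Illustrative sketch of the two-level merge used by mergeSchemasInParallel:
// partial results are combined inside each partition on the executors, and the
// driver combines the per-partition results. Schemas are modeled here as simple
// field-name sets; the real code merges Spark SQL StructTypes read from footers.
object TwoLevelMergeSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("two-level-merge-sketch").setMaster("local[2]"))

    // Stand-ins for the per-file schemas that executors would read from Parquet footers.
    val perFileFields: Seq[Set[String]] = Seq(Set("id"), Set("id", "name"), Set("id", "ts"))

    val merged =
      sc.parallelize(perFileFields)
        .mapPartitions(iter => iter.reduceOption(_ union _).iterator) // executor-side partial merge
        .collect()
        .reduceOption(_ union _)                                      // driver-side final merge

    println(merged.getOrElse(Set.empty[String]))
    sc.stop()
  }
}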
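
The deserializeSchemaString helper added above chains Try, recover, recoverWith, and toOption so that a second parser is attempted before giving up, and a final failure is logged and collapsed into None. Below is a small sketch of the same pattern using only the Scala standard library; parsePort and the legacy "port=NNNN" format are hypothetical stand-ins, not anything from this commit.

import scala.util.{Failure, Try}

object FallbackParsingSketch {
  // Try the primary parser, recover with a fallback parser, and if both fail,
  // log the cause and keep the Failure so that toOption yields None.
  def parsePort(s: String): Option[Int] =
    Try(s.trim.toInt)                                  // primary parser
      .recover { case _: NumberFormatException =>
        s.trim.stripPrefix("port=").toInt              // fallback parser for a hypothetical legacy format
      }
      .recoverWith { case cause =>
        Console.err.println(s"Could not parse '$s': ${cause.getMessage}")
        Failure(cause)
      }
      .toOption

  def main(args: Array[String]): Unit = {
    println(parsePort("8080"))      // Some(8080)
    println(parsePort("port=8080")) // Some(8080)
    println(parsePort("oops"))      // None
  }
}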