@@ -413,10 +413,85 @@ case class DataSource(
     relation
   }
 
-  /** Writes the given [[DataFrame]] out to this [[DataSource]]. */
-  def write(
-      mode: SaveMode,
-      data: DataFrame): BaseRelation = {
+  /**
+   * Writes the given [[DataFrame]] out in this [[FileFormat]].
+   */
+  private def writeInFileFormat(format: FileFormat, mode: SaveMode, data: DataFrame): Unit = {
+    // Don't glob path for the write path. The contracts here are:
+    // 1. Only one output path can be specified on the write path;
+    // 2. Output path must be a legal HDFS style file system path;
+    // 3. It's OK that the output path doesn't exist yet;
+    val allPaths = paths ++ caseInsensitiveOptions.get("path")
+    val outputPath = if (allPaths.length == 1) {
+      val path = new Path(allPaths.head)
+      val fs = path.getFileSystem(sparkSession.sessionState.newHadoopConf())
+      path.makeQualified(fs.getUri, fs.getWorkingDirectory)
+    } else {
+      throw new IllegalArgumentException("Expected exactly one path to be specified, but " +
+        s"got: ${allPaths.mkString(", ")}")
+    }
+
+    val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis
+    PartitioningUtils.validatePartitionColumn(data.schema, partitionColumns, caseSensitive)
+
+    // If we are appending to a table that already exists, make sure the partitioning matches
+    // up. If we fail to load the table for whatever reason, ignore the check.
+    if (mode == SaveMode.Append) {
+      val existingPartitionColumns = Try {
+        getOrInferFileFormatSchema(format, justPartitioning = true)._2.fieldNames.toList
+      }.getOrElse(Seq.empty[String])
+      // TODO: Case sensitivity.
+      val sameColumns =
+        existingPartitionColumns.map(_.toLowerCase()) == partitionColumns.map(_.toLowerCase())
+      if (existingPartitionColumns.nonEmpty && !sameColumns) {
+        throw new AnalysisException(
+          s"""Requested partitioning does not match existing partitioning.
+             |Existing partitioning columns:
+             |  ${existingPartitionColumns.mkString(", ")}
+             |Requested partitioning columns:
+             |  ${partitionColumns.mkString(", ")}
+             |""".stripMargin)
+      }
+    }
+
+    // SPARK-17230: Resolve the partition columns so InsertIntoHadoopFsRelationCommand does
+    // not need to have the query as child, to avoid to analyze an optimized query,
+    // because InsertIntoHadoopFsRelationCommand will be optimized first.
+    val partitionAttributes = partitionColumns.map { name =>
+      val plan = data.logicalPlan
+      plan.resolve(name :: Nil, data.sparkSession.sessionState.analyzer.resolver).getOrElse {
+        throw new AnalysisException(
+          s"Unable to resolve $name given [${plan.output.map(_.name).mkString(", ")}]")
+      }.asInstanceOf[Attribute]
+    }
+    val fileIndex = catalogTable.map(_.identifier).map { tableIdent =>
+      sparkSession.table(tableIdent).queryExecution.analyzed.collect {
+        case LogicalRelation(t: HadoopFsRelation, _, _) => t.location
+      }.head
+    }
+    // For partitioned relation r, r.schema's column ordering can be different from the column
+    // ordering of data.logicalPlan (partition columns are all moved after data column). This
+    // will be adjusted within InsertIntoHadoopFsRelation.
+    val plan =
+      InsertIntoHadoopFsRelationCommand(
+        outputPath = outputPath,
+        staticPartitions = Map.empty,
+        partitionColumns = partitionAttributes,
+        bucketSpec = bucketSpec,
+        fileFormat = format,
+        options = options,
+        query = data.logicalPlan,
+        mode = mode,
+        catalogTable = catalogTable,
+        fileIndex = fileIndex)
+    sparkSession.sessionState.executePlan(plan).toRdd
+  }
+
+  /**
+   * Writes the given [[DataFrame]] out to this [[DataSource]] and returns a [[BaseRelation]] for
+   * the following reading.
+   */
+  def writeAndRead(mode: SaveMode, data: DataFrame): BaseRelation = {
     if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) {
       throw new AnalysisException("Cannot save interval data type into external storage.")
     }
@@ -425,78 +500,27 @@ case class DataSource(
       case dataSource: CreatableRelationProvider =>
         dataSource.createRelation(sparkSession.sqlContext, mode, caseInsensitiveOptions, data)
       case format: FileFormat =>
-        // Don't glob path for the write path. The contracts here are:
-        // 1. Only one output path can be specified on the write path;
-        // 2. Output path must be a legal HDFS style file system path;
-        // 3. It's OK that the output path doesn't exist yet;
-        val allPaths = paths ++ caseInsensitiveOptions.get("path")
-        val outputPath = if (allPaths.length == 1) {
-          val path = new Path(allPaths.head)
-          val fs = path.getFileSystem(sparkSession.sessionState.newHadoopConf())
-          path.makeQualified(fs.getUri, fs.getWorkingDirectory)
-        } else {
-          throw new IllegalArgumentException("Expected exactly one path to be specified, but " +
-            s"got: ${allPaths.mkString(", ")}")
-        }
-
-        val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis
-        PartitioningUtils.validatePartitionColumn(
-          data.schema, partitionColumns, caseSensitive)
-
-        // If we are appending to a table that already exists, make sure the partitioning matches
-        // up. If we fail to load the table for whatever reason, ignore the check.
-        if (mode == SaveMode.Append) {
-          val existingPartitionColumns = Try {
-            getOrInferFileFormatSchema(format, justPartitioning = true)._2.fieldNames.toList
-          }.getOrElse(Seq.empty[String])
-          // TODO: Case sensitivity.
-          val sameColumns =
-            existingPartitionColumns.map(_.toLowerCase()) == partitionColumns.map(_.toLowerCase())
-          if (existingPartitionColumns.nonEmpty && !sameColumns) {
-            throw new AnalysisException(
-              s"""Requested partitioning does not match existing partitioning.
-                 |Existing partitioning columns:
-                 |  ${existingPartitionColumns.mkString(", ")}
-                 |Requested partitioning columns:
-                 |  ${partitionColumns.mkString(", ")}
-                 |""".stripMargin)
-          }
-        }
-
-        // SPARK-17230: Resolve the partition columns so InsertIntoHadoopFsRelationCommand does
-        // not need to have the query as child, to avoid to analyze an optimized query,
-        // because InsertIntoHadoopFsRelationCommand will be optimized first.
-        val partitionAttributes = partitionColumns.map { name =>
-          val plan = data.logicalPlan
-          plan.resolve(name :: Nil, data.sparkSession.sessionState.analyzer.resolver).getOrElse {
-            throw new AnalysisException(
-              s"Unable to resolve $name given [${plan.output.map(_.name).mkString(", ")}]")
-          }.asInstanceOf[Attribute]
-        }
-        val fileIndex = catalogTable.map(_.identifier).map { tableIdent =>
-          sparkSession.table(tableIdent).queryExecution.analyzed.collect {
-            case LogicalRelation(t: HadoopFsRelation, _, _) => t.location
-          }.head
-        }
-        // For partitioned relation r, r.schema's column ordering can be different from the column
-        // ordering of data.logicalPlan (partition columns are all moved after data column). This
-        // will be adjusted within InsertIntoHadoopFsRelation.
-        val plan =
-          InsertIntoHadoopFsRelationCommand(
-            outputPath = outputPath,
-            staticPartitions = Map.empty,
-            partitionColumns = partitionAttributes,
-            bucketSpec = bucketSpec,
-            fileFormat = format,
-            options = options,
-            query = data.logicalPlan,
-            mode = mode,
-            catalogTable = catalogTable,
-            fileIndex = fileIndex)
-        sparkSession.sessionState.executePlan(plan).toRdd
-        // Replace the schema with that of the DataFrame we just wrote out to avoid re-inferring it.
+        writeInFileFormat(format, mode, data)
+        // Replace the schema with that of the DataFrame we just wrote out to avoid re-inferring
         copy(userSpecifiedSchema = Some(data.schema.asNullable)).resolveRelation()
+      case _ =>
+        sys.error(s"${providingClass.getCanonicalName} does not allow create table as select.")
+    }
+  }
 
+  /**
+   * Writes the given [[DataFrame]] out to this [[DataSource]].
+   */
+  def write(mode: SaveMode, data: DataFrame): Unit = {
+    if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) {
+      throw new AnalysisException("Cannot save interval data type into external storage.")
+    }
+
+    providingClass.newInstance() match {
+      case dataSource: CreatableRelationProvider =>
+        dataSource.createRelation(sparkSession.sqlContext, mode, caseInsensitiveOptions, data)
+      case format: FileFormat =>
+        writeInFileFormat(format, mode, data)
       case _ =>
         sys.error(s"${providingClass.getCanonicalName} does not allow create table as select.")
     }
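
For orientation only, not part of the commit: a minimal sketch of how these two entry points appear to be reached from the public API after this change, under the assumption that a path-based save ends up in the Unit-returning write (and thus writeInFileFormat), while saveAsTable-style CREATE TABLE AS SELECT uses writeAndRead to get a BaseRelation back for the new table. The object name, output path, and table name below are made up for illustration.

// Illustrative sketch, assuming a local SparkSession and the hypothetical
// path /tmp/write_path_sketch and table name write_path_sketch_tbl.
import org.apache.spark.sql.{SaveMode, SparkSession}

object WritePathSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("write-path-sketch")
      .getOrCreate()
    import spark.implicits._

    val df = Seq((1, "2017-01-01"), (2, "2017-01-02")).toDF("id", "day")

    // Path-based save: no relation needs to be read back, so (as I read this
    // patch) it goes through DataSource.write(mode, data).
    df.write.mode(SaveMode.Overwrite).partitionBy("day").parquet("/tmp/write_path_sketch")

    // saveAsTable is a CTAS-style operation: the written data is immediately
    // resolved again, which is what writeAndRead(mode, data) returns.
    df.write.mode(SaveMode.ErrorIfExists).saveAsTable("write_path_sketch_tbl")

    spark.stop()
  }
}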