From 3f1e4efbd8f67a3bb1a9e33be91e2a07e0aa01b0 Mon Sep 17 00:00:00 2001
From: Davies Liu
Date: Thu, 2 Jul 2015 15:58:42 -0700
Subject: [PATCH 01/35] fix string order for non-ASCII characters

---
 .../main/scala/org/apache/spark/sql/types/UTF8String.scala | 3 ++-
 .../scala/org/apache/spark/sql/types/UTF8StringSuite.scala | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala
index f5d8fcced362b..e17743a9f0cf4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala
@@ -156,7 +156,8 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
     var i: Int = 0
     val b = other.getBytes
     while (i < bytes.length && i < b.length) {
-      val res = bytes(i).compareTo(b(i))
+      // UTF-8 bytes should be compared as unsigned integers
+      val res = (bytes(i) & 0xFF) - (b(i) & 0xFF)
       if (res != 0) return res
       i += 1
     }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala
index 81d7ab010f394..02feee4313cdb 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/UTF8StringSuite.scala
@@ -40,6 +40,11 @@ class UTF8StringSuite extends SparkFunSuite {
     check("世 界", 3)
   }
 
+  test("compare") {
+    assert(UTF8String("abc") > UTF8String("ABC"))
+    assert(UTF8String("世 界") > UTF8String("abc"))
+  }
+
   test("contains") {
     assert(UTF8String("hello").contains(UTF8String("ello")))
     assert(!UTF8String("hello").contains(UTF8String("vello")))

From de0802499a188a249bd5faf2f411b2e8ee522c33 Mon Sep 17 00:00:00 2001
From: Cheng Lian
Date: Thu, 2 Jul 2015 21:30:13 -0700
Subject: [PATCH 02/35] [SPARK-8501] [SQL] Avoids reading schema from empty
 ORC files (backport to 1.4)

This PR backports #7199 to branch-1.4

Author: Cheng Lian

Closes #7200 from liancheng/spark-8501-for-1.4 and squashes the following
commits:

725e9e3 [Cheng Lian] Addresses comments
0fa25af [Cheng Lian] Avoids reading schema from empty ORC files

---
 .../spark/sql/hive/orc/OrcFileOperator.scala | 62 +++++++++++++++----
 .../spark/sql/hive/orc/OrcRelation.scala     | 44 +++++++------
 .../spark/sql/hive/orc/OrcQuerySuite.scala   | 56 ++++++++++++++---
 .../spark/sql/hive/orc/OrcSourceSuite.scala  | 22 +++----
 4 files changed, 136 insertions(+), 48 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala
index e5e92e6caf2cd..bc5cc55b7b5c5 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala
@@ -24,30 +24,70 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector
 
 import org.apache.spark.Logging
 import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.hive.HiveMetastoreTypes
 import org.apache.spark.sql.types.StructType
 
-private[orc] object OrcFileOperator extends Logging{
-  def getFileReader(pathStr: String, config: Option[Configuration] = None ): Reader = {
+private[orc] object OrcFileOperator extends Logging {
+  /**
+   * Retrieves an ORC file reader from a given path. The path can point to either a directory or
+   * a single ORC file. If it points to a directory, it picks any non-empty ORC file within that
+   * directory.
+   *
+   * The reader returned by this method is mainly used for two purposes:
+   *
+   * 1. Retrieving file metadata (schema, compression codecs, etc.)
+   * 2. Reading the actual file content (in this case, the given path should point to the target
+   *    file)
+   *
+   * @note As recorded by SPARK-8501, ORC writes an empty schema (struct<>) to an
+   *       ORC file if the file contains zero rows. This is OK for Hive since the schema of the
+   *       table is managed by the metastore. But this becomes a problem when reading ORC files
+   *       directly from HDFS via Spark SQL, because we have to discover the schema from raw ORC
+   *       files. So this method always tries to find an ORC file whose schema is non-empty, and
+   *       creates the result reader from that file. If no such file is found, it returns `None`.
+   *
+   * @todo Needs to consider all files when schema evolution is taken into account.
+   */
+  def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = {
+    def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = {
+      reader.getObjectInspector match {
+        case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 =>
+          logInfo(
+            s"ORC file $path has empty schema, it probably contains no rows. " +
+              "Trying to read another ORC file to figure out the schema.")
+          false
+        case _ => true
+      }
+    }
+
     val conf = config.getOrElse(new Configuration)
-    val fspath = new Path(pathStr)
-    val fs = fspath.getFileSystem(conf)
-    val orcFiles = listOrcFiles(pathStr, conf)
-    logDebug(s"Creating ORC Reader from ${orcFiles.head}")
-    // TODO Need to consider all files when schema evolution is taken into account.
-    OrcFile.createReader(fs, orcFiles.head)
+    val fs = {
+      val hdfsPath = new Path(basePath)
+      hdfsPath.getFileSystem(conf)
+    }
+
+    listOrcFiles(basePath, conf).iterator.map { path =>
+      path -> OrcFile.createReader(fs, path)
+    }.collectFirst {
+      case (path, reader) if isWithNonEmptySchema(path, reader) => reader
+    }
   }
 
   def readSchema(path: String, conf: Option[Configuration]): StructType = {
-    val reader = getFileReader(path, conf)
+    val reader = getFileReader(path, conf).getOrElse {
+      throw new AnalysisException(
+        s"Failed to discover schema from ORC files stored in $path. " +
+          "Probably there are either no ORC files or only empty ORC files.")
+    }
     val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector]
     val schema = readerInspector.getTypeName
     logDebug(s"Reading schema from file $path, got Hive schema string: $schema")
     HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType]
   }
 
-  def getObjectInspector(path: String, conf: Option[Configuration]): StructObjectInspector = {
-    getFileReader(path, conf).getObjectInspector.asInstanceOf[StructObjectInspector]
+  def getObjectInspector(
+      path: String, conf: Option[Configuration]): Option[StructObjectInspector] = {
+    getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector])
   }
 
   def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index 3713f6fd94f70..204bbe03af111 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -241,26 +241,34 @@ private[orc] case class OrcTableScan(
       nonPartitionKeyAttrs: Seq[(Attribute, Int)],
       mutableRow: MutableRow): Iterator[Row] = {
     val deserializer = new OrcSerde
-    val soi = OrcFileOperator.getObjectInspector(path, Some(conf))
-    val (fieldRefs, fieldOrdinals) = nonPartitionKeyAttrs.map {
-      case (attr, ordinal) =>
-        soi.getStructFieldRef(attr.name.toLowerCase) -> ordinal
-    }.unzip
-    val unwrappers = fieldRefs.map(unwrapperFor)
-    // Map each tuple to a row object
-    iterator.map { value =>
-      val raw = deserializer.deserialize(value)
-      var i = 0
-      while (i < fieldRefs.length) {
-        val fieldValue = soi.getStructFieldData(raw, fieldRefs(i))
-        if (fieldValue == null) {
-          mutableRow.setNullAt(fieldOrdinals(i))
-        } else {
-          unwrappers(i)(fieldValue, mutableRow, fieldOrdinals(i))
+    val maybeStructOI = OrcFileOperator.getObjectInspector(path, Some(conf))
+
+    // SPARK-8501: ORC writes an empty schema ("struct<>") to an ORC file if the file contains
+    // zero rows, and thus can't provide a proper ObjectInspector. In this case we just return
+    // an empty partition since we know that this file is empty.
+    maybeStructOI.map { soi =>
+      val (fieldRefs, fieldOrdinals) = nonPartitionKeyAttrs.map {
+        case (attr, ordinal) =>
+          soi.getStructFieldRef(attr.name.toLowerCase) -> ordinal
+      }.unzip
+      val unwrappers = fieldRefs.map(unwrapperFor)
+      // Map each tuple to a row object
+      iterator.map { value =>
+        val raw = deserializer.deserialize(value)
+        var i = 0
+        while (i < fieldRefs.length) {
+          val fieldValue = soi.getStructFieldData(raw, fieldRefs(i))
+          if (fieldValue == null) {
+            mutableRow.setNullAt(fieldOrdinals(i))
+          } else {
+            unwrappers(i)(fieldValue, mutableRow, fieldOrdinals(i))
+          }
+          i += 1
         }
-        i += 1
+        mutableRow: Row
       }
-      mutableRow: Row
+    }.getOrElse {
+      Iterator.empty
     }
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index 57c23fe77f8b5..dd63a8e1bd184 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -23,9 +23,7 @@ import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.apache.hadoop.hive.ql.io.orc.CompressionKind
 import org.scalatest.BeforeAndAfterAll
 
-import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
@@ -54,8 +52,6 @@ case class Person(name: String, age: Int, contacts: Seq[Contact])
 class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
   override val sqlContext = TestHive
 
-  import TestHive.read
-
   def getTempFilePath(prefix: String, suffix: String = ""): File = {
     val tempFile = File.createTempFile(prefix, suffix)
     tempFile.delete()
@@ -173,7 +169,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
   test("Default compression options for writing to an ORC file") {
     withOrcFile((1 to 100).map(i => (i, s"val_$i"))) { file =>
       assertResult(CompressionKind.ZLIB) {
-        OrcFileOperator.getFileReader(file).getCompression
+        OrcFileOperator.getFileReader(file).get.getCompression
       }
     }
   }
@@ -186,21 +182,21 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
     conf.set(ConfVars.HIVE_ORC_DEFAULT_COMPRESS.varname, "SNAPPY")
     withOrcFile(data) { file =>
       assertResult(CompressionKind.SNAPPY) {
-        OrcFileOperator.getFileReader(file).getCompression
+        OrcFileOperator.getFileReader(file).get.getCompression
       }
     }
 
     conf.set(ConfVars.HIVE_ORC_DEFAULT_COMPRESS.varname, "NONE")
     withOrcFile(data) { file =>
       assertResult(CompressionKind.NONE) {
-        OrcFileOperator.getFileReader(file).getCompression
+        OrcFileOperator.getFileReader(file).get.getCompression
       }
     }
 
     conf.set(ConfVars.HIVE_ORC_DEFAULT_COMPRESS.varname, "LZO")
     withOrcFile(data) { file =>
       assertResult(CompressionKind.LZO) {
-        OrcFileOperator.getFileReader(file).getCompression
+        OrcFileOperator.getFileReader(file).get.getCompression
      }
     }
   }
@@ -292,4 +288,48 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
       List(Row("same", "run_5", 100)))
     }
   }
+
+  test("SPARK-8501: Avoids discovering schema from empty ORC files") {
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+
+      withTable("empty_orc") {
+        withTempTable("empty", "single") {
+          sqlContext.sql(
+            s"""CREATE TABLE empty_orc(key INT, value STRING)
+               |STORED AS ORC
+               |LOCATION '$path'
+             """.stripMargin)
+
+          val emptyDF = Seq.empty[(Int, String)].toDF("key", "value").coalesce(1)
+          emptyDF.registerTempTable("empty")
+
+          // This creates one empty ORC file with the Hive ORC SerDe. We are using this trick
+          // because the Spark SQL ORC data source always avoids writing empty ORC files.
+          sqlContext.sql(
+            s"""INSERT INTO TABLE empty_orc
+               |SELECT key, value FROM empty
+             """.stripMargin)
+
+          val errorMessage = intercept[AnalysisException] {
+            sqlContext.read.format("orc").load(path)
+          }.getMessage
+
+          assert(errorMessage.contains("Failed to discover schema from ORC files"))
+
+          val singleRowDF = Seq((0, "foo")).toDF("key", "value").coalesce(1)
+          singleRowDF.registerTempTable("single")
+
+          sqlContext.sql(
+            s"""INSERT INTO TABLE empty_orc
+               |SELECT key, value FROM single
+             """.stripMargin)
+
+          val df = sqlContext.read.format("orc").load(path)
+          assert(df.schema === singleRowDF.schema.asNullable)
+          checkAnswer(df, singleRowDF)
+        }
+      }
+    }
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
index 433ea9b8534ef..82e08caf46457 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
@@ -44,7 +44,7 @@ abstract class OrcSuite extends QueryTest with BeforeAndAfterAll {
     import org.apache.spark.sql.hive.test.TestHive.implicits._
 
     sparkContext
-      .makeRDD(1 to 100)
+      .makeRDD(1 to 10)
       .map(i => OrcData(i, s"part-$i"))
       .toDF()
       .registerTempTable(s"orc_temp_table")
@@ -70,35 +70,35 @@ abstract class OrcSuite extends QueryTest with BeforeAndAfterAll {
   }
 
   test("create temporary orc table") {
-    checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(100))
+    checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(10))
 
     checkAnswer(
       sql("SELECT * FROM normal_orc_source"),
-      (1 to 100).map(i => Row(i, s"part-$i")))
+      (1 to 10).map(i => Row(i, s"part-$i")))
 
     checkAnswer(
       sql("SELECT * FROM normal_orc_source where intField > 5"),
-      (6 to 100).map(i => Row(i, s"part-$i")))
+      (6 to 10).map(i => Row(i, s"part-$i")))
 
     checkAnswer(
       sql("SELECT COUNT(intField), stringField FROM normal_orc_source GROUP BY stringField"),
-      (1 to 100).map(i => Row(1, s"part-$i")))
+      (1 to 10).map(i => Row(1, s"part-$i")))
   }
 
   test("create temporary orc table as") {
-    checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_as_source"), Row(100))
+    checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_as_source"), Row(10))
 
     checkAnswer(
       sql("SELECT * FROM normal_orc_source"),
-      (1 to 100).map(i => Row(i, s"part-$i")))
+      (1 to 10).map(i => Row(i, s"part-$i")))
 
     checkAnswer(
       sql("SELECT * FROM normal_orc_source WHERE intField > 5"),
-      (6 to 100).map(i => Row(i, s"part-$i")))
+      (6 to 10).map(i => Row(i, s"part-$i")))
 
     checkAnswer(
       sql("SELECT COUNT(intField), stringField FROM normal_orc_source GROUP BY stringField"),
-      (1 to 100).map(i => Row(1, s"part-$i")))
+      (1 to 10).map(i => Row(1, s"part-$i")))
   }
 
   test("appending insert") {
@@ -106,7 +106,7 @@
     checkAnswer(
       sql("SELECT * FROM normal_orc_source"),
-      (1 to 5).map(i => Row(i, s"part-$i")) ++ (6 to 100).flatMap { i =>
+      (1 to 5).map(i => Row(i, s"part-$i")) ++ (6 to 10).flatMap { i =>
         Seq.fill(2)(Row(i, s"part-$i"))
       })
   }
@@ -119,7 +119,7 @@
     checkAnswer(
       sql("SELECT * FROM normal_orc_as_source"),
-      (6 to 100).map(i => Row(i, s"part-$i")))
+      (6 to 10).map(i => Row(i, s"part-$i")))
   }
 }

From 
f142867ecee59a635df91aee888351bee5f29c0e Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Thu, 2 Jul 2015 22:09:07 -0700 Subject: [PATCH 03/35] [SPARK-8776] Increase the default MaxPermSize I am increasing the perm gen size to 256m. https://issues.apache.org/jira/browse/SPARK-8776 Author: Yin Huai Closes #7196 from yhuai/SPARK-8776 and squashes the following commits: 60901b4 [Yin Huai] Fix test. d44b713 [Yin Huai] Make sparkShell and hiveConsole use 256m PermGen size. 30aaf8e [Yin Huai] Increase the default PermGen size to 256m. (cherry picked from commit f743c79abe5a2fb66be32a896ea47e858569b0c7) Signed-off-by: Yin Huai --- .../org/apache/spark/launcher/AbstractCommandBuilder.java | 2 +- .../apache/spark/launcher/SparkSubmitCommandBuilderSuite.java | 2 +- project/SparkBuild.scala | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java index b8f02b961113d..b2591c2ec4af6 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java +++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java @@ -133,7 +133,7 @@ void addPermGenSizeOpt(List cmd) { } } - cmd.add("-XX:MaxPermSize=128m"); + cmd.add("-XX:MaxPermSize=256m"); } void addOptionString(List cmd, String options) { diff --git a/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java b/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java index 97043a76cc612..7329ac9f7fb8c 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java @@ -194,7 +194,7 @@ private void testCmdBuilder(boolean isDriver) throws Exception { if (isDriver) { assertEquals("-XX:MaxPermSize=256m", arg); } else { - assertEquals("-XX:MaxPermSize=128m", arg); + assertEquals("-XX:MaxPermSize=256m", arg); } } } diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index aa59d74cf422b..4b00a66f3fd5a 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -206,7 +206,7 @@ object SparkBuild extends PomBuild { fork := true, outputStrategy in run := Some (StdoutOutput), - javaOptions ++= Seq("-Xmx2G", "-XX:MaxPermSize=1g"), + javaOptions ++= Seq("-Xmx2G", "-XX:MaxPermSize=256m"), sparkShell := { (runMain in Compile).toTask(" org.apache.spark.repl.Main -usejavacp").value @@ -307,7 +307,7 @@ object SQL { object Hive { lazy val settings = Seq( - javaOptions += "-XX:MaxPermSize=1g", + javaOptions += "-XX:MaxPermSize=256m", // Specially disable assertions since some Hive tests fail them javaOptions in Test := (javaOptions in Test).value.filterNot(_ == "-ea"), // Multiple queries rely on the TestHive singleton. See comments there for more details. From ff76b33b67cec0614143e60f8dc88435521cf3aa Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Thu, 2 Jul 2015 22:10:24 -0700 Subject: [PATCH 04/35] [SPARK-8803] handle special characters in elements in crosstab cc rxin Having back ticks or null as elements causes problems. Since elements become column names, we have to drop them from the element as back ticks are special characters. Having null throws exceptions, we could replace them with empty strings. 
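To make the failure mode concrete, here is a minimal sketch (hypothetical data, mirroring the new `DataFrameStatSuite` test added below; it assumes a `sqlContext` with its implicits in scope):

```scala
import sqlContext.implicits._

// Distinct values of the second column become the column names of the crosstab
// result, and values of the first column become its row labels.
val df = Seq(
  ("`ha`", 1.0),  // back ticks are special characters in column names
  (null, 2.0)     // null cannot be turned into a column name or row label as-is
).toDF("a", "b")

df.stat.crosstab("a", "b")  // null as a row label used to throw a NullPointerException
df.stat.crosstab("b", "a")  // null/back-tick elements used to produce broken column names
```

With this change, null elements are rendered as the string "null" and back ticks are dropped (see `cleanElement` and `cleanColumnName` in the diff below), so both calls succeed.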
Handling back ticks should be improved for 1.5 Author: Burak Yavuz Closes #7201 from brkyvz/weird-ct-elements and squashes the following commits: e06b840 [Burak Yavuz] fix scalastyle 93a0d3f [Burak Yavuz] added tests for NaN and Infinity 9dba6ce [Burak Yavuz] address cr1 db71dbd [Burak Yavuz] handle special characters in elements in crosstab (cherry picked from commit 9b23e92c727881ff9038b4fe9643c49b96914159) Signed-off-by: Reynold Xin Conflicts: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- .../spark/sql/DataFrameNaFunctions.scala | 2 +- .../spark/sql/DataFrameStatFunctions.scala | 3 ++ .../sql/execution/stat/StatFunctions.scala | 20 ++++++++++--- .../apache/spark/sql/DataFrameStatSuite.scala | 30 +++++++++++++++++++ 4 files changed, 50 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala index b4c2daa055868..8681a56c82f1e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala @@ -391,7 +391,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { * Returns a [[Column]] expression that replaces null value in `col` with `replacement`. */ private def fillCol[T](col: StructField, replacement: T): Column = { - coalesce(df.col(col.name), lit(replacement).cast(col.dataType)).as(col.name) + coalesce(df.col("`" + col.name + "`"), lit(replacement).cast(col.dataType)).as(col.name) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index edb9ed7bba56a..587869e57f96e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -78,6 +78,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * The first column of each row will be the distinct values of `col1` and the column names will * be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts * will be returned as `Long`s. Pairs that have no occurrences will have `null` as their counts. + * Null elements will be replaced by "null", and back ticks will be dropped from elements if they + * exist. + * * * @param col1 The name of the first column. Distinct items will make the first item of * each row. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index e4a525e37690b..5a0c9a66b858a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -109,8 +109,12 @@ private[sql] object StatFunctions extends Logging { logWarning("The maximum limit of 1e6 pairs have been collected, which may not be all of " + "the pairs. 
Please try reducing the amount of distinct items in your columns.")
     }
+    def cleanElement(element: Any): String = {
+      if (element == null) "null" else element.toString
+    }
     // get the distinct values of column 2, so that we can make them the column names
-    val distinctCol2: Map[Any, Int] = counts.map(_.get(1)).distinct.zipWithIndex.toMap
+    val distinctCol2: Map[Any, Int] =
+      counts.map(e => cleanElement(e.get(1))).distinct.zipWithIndex.toMap
     val columnSize = distinctCol2.size
     require(columnSize < 1e4, s"The number of distinct values for $col2, can't " +
       s"exceed 1e4. Currently $columnSize")
@@ -120,15 +124,23 @@
         // row.get(0) is column 1
         // row.get(1) is column 2
         // row.get(2) is the frequency
-        countsRow.setLong(distinctCol2.get(row.get(1)).get + 1, row.getLong(2))
+        val columnIndex = distinctCol2.get(cleanElement(row.get(1))).get
+        countsRow.setLong(columnIndex + 1, row.getLong(2))
       }
       // the value of col1 is the first value, the rest are the counts
-      countsRow.setString(0, col1Item.toString)
+      countsRow.setString(0, cleanElement(col1Item))
       countsRow
     }.toSeq
+    // Back ticks can't exist in DataFrame column names, therefore drop them. To be able to accept
+    // special keywords and `.`, wrap the column names in ``.
+    def cleanColumnName(name: String): String = {
+      name.replace("`", "")
+    }
     // In the map, the column names (._1) are not ordered by the index (._2). This was the bug in
     // SPARK-8681. We need to explicitly sort by the column index and assign the column names.
-    val headerNames = distinctCol2.toSeq.sortBy(_._2).map(r => StructField(r._1.toString, LongType))
+    val headerNames = distinctCol2.toSeq.sortBy(_._2).map { r =>
+      StructField(cleanColumnName(r._1.toString), LongType)
+    }
     val schema = StructType(StructField(tableName, StringType) +: headerNames)
 
     new DataFrame(df.sqlContext, LocalRelation(schema.toAttributes, table)).na.fill(0.0)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
index 3e87ebad08bc7..7e2605bc68916 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
@@ -85,6 +85,36 @@
     }
   }
 
+  test("special crosstab elements (., '', null, ``)") {
+    val data = Seq(
+      ("a", Double.NaN, "ho"),
+      (null, 2.0, "ho"),
+      ("a.b", Double.NegativeInfinity, ""),
+      ("b", Double.PositiveInfinity, "`ha`"),
+      ("a", 1.0, null)
+    )
+    val df = data.toDF("1", "2", "3")
+    val ct1 = df.stat.crosstab("1", "2")
+    // column fields should be 1 + distinct elements of second column
+    assert(ct1.schema.fields.length === 6)
+    assert(ct1.collect().length === 4)
+    val ct2 = df.stat.crosstab("1", "3")
+    assert(ct2.schema.fields.length === 5)
+    assert(ct2.schema.fieldNames.contains("ha"))
+    assert(ct2.collect().length === 4)
+    val ct3 = df.stat.crosstab("3", "2")
+    assert(ct3.schema.fields.length === 6)
+    assert(ct3.schema.fieldNames.contains("NaN"))
+    assert(ct3.schema.fieldNames.contains("Infinity"))
+    assert(ct3.schema.fieldNames.contains("-Infinity"))
+    assert(ct3.collect().length === 4)
+    val ct4 = df.stat.crosstab("3", "1")
+    assert(ct4.schema.fields.length === 5)
+    assert(ct4.schema.fieldNames.contains("null"))
+    assert(ct4.schema.fieldNames.contains("a.b"))
+    assert(ct4.collect().length === 4)
+  }
+
   test("Frequent Items") {
     val rows = Seq.tabulate(1000) { i =>
       if (i % 3 == 0) (1, toLetter(1), 
-1.0) else (i, toLetter(i), i * -1.0) From 07b95c7adf88f0662b7ab1c47e302ff5e6859606 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Thu, 2 Jul 2015 23:18:48 -0700 Subject: [PATCH 05/35] Preparing Spark release v1.4.1-rc2 --- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml | 2 +- 30 files changed, 30 insertions(+), 30 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 228db597a0264..ba233e70a29f5 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../pom.xml diff --git a/bagel/pom.xml b/bagel/pom.xml index ce791a689fa68..c5e9183ddc945 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 176ea9bfa9ff3..f0d236df406c1 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../pom.xml diff --git a/examples/pom.xml b/examples/pom.xml index 877c2fbd7b5d6..e9a9cc2ebe276 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index ad431fa3d8023..7eae7a7a846a1 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 9789435368037..b3ad09adae008 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index 18b1d8690f82e..c05bd1bad0d82 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index 2cfee582368c8..c8f844084ce1f 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 48c7ba6c14070..908bd6378ed30 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index f717e702c89ae..68e67b8a7c3b6 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml 
diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index db33d56bb31f5..e12a87370c3f9 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml index e249b5630e150..f57dadb367d1e 100644 --- a/extras/java8-tests/pom.xml +++ b/extras/java8-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index 7ab64da0f4bb1..7b542c388fcc4 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/extras/spark-ganglia-lgpl/pom.xml b/extras/spark-ganglia-lgpl/pom.xml index 5269209d6028e..a9f1fad415fb8 100644 --- a/extras/spark-ganglia-lgpl/pom.xml +++ b/extras/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index d994f1af20104..901ec740f6b68 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index ba9579880ed2d..9804ca700c811 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index c1b2e8cb1d99e..911c8b4d8f0d7 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../pom.xml diff --git a/network/common/pom.xml b/network/common/pom.xml index 784ade449dfe8..7ee504b929c37 100644 --- a/network/common/pom.xml +++ b/network/common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/network/shuffle/pom.xml b/network/shuffle/pom.xml index 2751eb91da834..48837df6e6205 100644 --- a/network/shuffle/pom.xml +++ b/network/shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/network/yarn/pom.xml b/network/yarn/pom.xml index b2aceb4ceab23..3847a34162a4a 100644 --- a/network/yarn/pom.xml +++ b/network/yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/pom.xml b/pom.xml index 75b27f95b5b6a..721bf145c9d84 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 pom Spark Project Parent POM http://spark.apache.org/ diff --git a/repl/pom.xml b/repl/pom.xml index 783cf25526022..e9b833bae88a7 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index e31627d83d60e..d33b9eeb5a7fa 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 0cb85bff39200..b5121059a8167 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index d13afc5df16a6..81aa1376734ac 100644 --- 
a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index ddc85b9f83658..a2ee048ec564c 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index d24fd799e329e..974f5a52c77bc 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 5fd85a6a1a364..eb14e4fe28322 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../pom.xml diff --git a/unsafe/pom.xml b/unsafe/pom.xml index fa5085e1a9d96..606a0c8005e97 100644 --- a/unsafe/pom.xml +++ b/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../pom.xml diff --git a/yarn/pom.xml b/yarn/pom.xml index 56366b852d6bd..09f9f0f66dc36 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.4.2-SNAPSHOT + 1.4.1 ../pom.xml From e990561ce0133402a362c7ba7f828ea5a19fcf58 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Thu, 2 Jul 2015 23:18:53 -0700 Subject: [PATCH 06/35] Preparing development version 1.4.2-SNAPSHOT --- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml | 2 +- 30 files changed, 30 insertions(+), 30 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index ba233e70a29f5..228db597a0264 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../pom.xml diff --git a/bagel/pom.xml b/bagel/pom.xml index c5e9183ddc945..ce791a689fa68 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../pom.xml diff --git a/core/pom.xml b/core/pom.xml index f0d236df406c1..176ea9bfa9ff3 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../pom.xml diff --git a/examples/pom.xml b/examples/pom.xml index e9a9cc2ebe276..877c2fbd7b5d6 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 7eae7a7a846a1..ad431fa3d8023 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 
b3ad09adae008..9789435368037 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index c05bd1bad0d82..18b1d8690f82e 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index c8f844084ce1f..2cfee582368c8 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 908bd6378ed30..48c7ba6c14070 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index 68e67b8a7c3b6..f717e702c89ae 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index e12a87370c3f9..db33d56bb31f5 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml index f57dadb367d1e..e249b5630e150 100644 --- a/extras/java8-tests/pom.xml +++ b/extras/java8-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index 7b542c388fcc4..7ab64da0f4bb1 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/extras/spark-ganglia-lgpl/pom.xml b/extras/spark-ganglia-lgpl/pom.xml index a9f1fad415fb8..5269209d6028e 100644 --- a/extras/spark-ganglia-lgpl/pom.xml +++ b/extras/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 901ec740f6b68..d994f1af20104 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 9804ca700c811..ba9579880ed2d 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 911c8b4d8f0d7..c1b2e8cb1d99e 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../pom.xml diff --git a/network/common/pom.xml b/network/common/pom.xml index 7ee504b929c37..784ade449dfe8 100644 --- a/network/common/pom.xml +++ b/network/common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/network/shuffle/pom.xml b/network/shuffle/pom.xml index 48837df6e6205..2751eb91da834 100644 --- a/network/shuffle/pom.xml +++ b/network/shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/network/yarn/pom.xml b/network/yarn/pom.xml 
index 3847a34162a4a..b2aceb4ceab23 100644 --- a/network/yarn/pom.xml +++ b/network/yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/pom.xml b/pom.xml index 721bf145c9d84..75b27f95b5b6a 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/repl/pom.xml b/repl/pom.xml index e9b833bae88a7..783cf25526022 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index d33b9eeb5a7fa..e31627d83d60e 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index b5121059a8167..0cb85bff39200 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 81aa1376734ac..d13afc5df16a6 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index a2ee048ec564c..ddc85b9f83658 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 974f5a52c77bc..d24fd799e329e 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index eb14e4fe28322..5fd85a6a1a364 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../pom.xml diff --git a/unsafe/pom.xml b/unsafe/pom.xml index 606a0c8005e97..fa5085e1a9d96 100644 --- a/unsafe/pom.xml +++ b/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../pom.xml diff --git a/yarn/pom.xml b/yarn/pom.xml index 09f9f0f66dc36..56366b852d6bd 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.4.1 + 1.4.2-SNAPSHOT ../pom.xml From 4d813833df57a8a75df58aadabe061acd114431d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 6 Jul 2015 17:16:44 -0700 Subject: [PATCH 07/35] [SPARK-8463][SQL] Use DriverRegistry to load jdbc driver at writing path JIRA: https://issues.apache.org/jira/browse/SPARK-8463 Currently, at the reading path, `DriverRegistry` is used to load needed jdbc driver at executors. However, at the writing path, we also need `DriverRegistry` to load jdbc driver. Author: Liang-Chi Hsieh Closes #6900 from viirya/jdbc_write_driver and squashes the following commits: 16cd04b [Liang-Chi Hsieh] Use DriverRegistry to load jdbc driver at writing path. 
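For context, a condensed before/after sketch of the write path (the real change is in the diff below):

```scala
// Before: connections were opened via DriverManager directly. On an executor,
// the JDBC driver jar may be on the classpath yet never registered with
// java.sql.DriverManager, which can fail with a "No suitable driver" error:
val conn = DriverManager.getConnection(url, properties)

// After: resolve the driver class name up front, then hand executors a
// connection factory that goes through DriverRegistry, exactly as the
// reading path already does:
val driver: String = DriverRegistry.getDriverClassName(url)
val getConnection: () => Connection = JDBCRDD.getConnector(driver, url, properties)
```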
(cherry picked from commit d4d6d31db5cc5c69ac369f754b7489f444c9ba2f) Signed-off-by: Reynold Xin --- .../main/scala/org/apache/spark/sql/jdbc/jdbc.scala | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala index dd8aaf6474895..f7ea852fe7f58 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala @@ -58,13 +58,12 @@ package object jdbc { * are used. */ def savePartition( - url: String, + getConnection: () => Connection, table: String, iterator: Iterator[Row], rddSchema: StructType, - nullTypes: Array[Int], - properties: Properties): Iterator[Byte] = { - val conn = DriverManager.getConnection(url, properties) + nullTypes: Array[Int]): Iterator[Byte] = { + val conn = getConnection() var committed = false try { conn.setAutoCommit(false) // Everything in the same db transaction. @@ -185,8 +184,10 @@ package object jdbc { } val rddSchema = df.schema + val driver: String = DriverRegistry.getDriverClassName(url) + val getConnection: () => Connection = JDBCRDD.getConnector(driver, url, properties) df.foreachPartition { iterator => - JDBCWriteDetails.savePartition(url, table, iterator, rddSchema, nullTypes, properties) + JDBCWriteDetails.savePartition(getConnection, table, iterator, rddSchema, nullTypes) } } From 947b845988a48dfd1fbdda74dd7fe5a537e12f8c Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 6 Jul 2015 19:22:30 -0700 Subject: [PATCH 08/35] [SPARK-8819] Fix build for maven 3.3.x This is a workaround for MSHADE-148, which leads to an infinite loop when building Spark with maven 3.3.x. This was originally caused by #6441, which added a bunch of test dependencies on the spark-core test module. Recently, it was revealed by #7193. This patch adds a `-Prelease` profile. If present, it will set `createDependencyReducedPom` to true. The consequences are: - If you are releasing Spark with this profile, you are fine as long as you use maven 3.2.x or before. - If you are releasing Spark without this profile, you will run into SPARK-8781. - If you are not releasing Spark but you are using this profile, you may run into SPARK-8819. - If you are not releasing Spark and you did not include this profile, you are fine. This is all documented in `pom.xml` and tested locally with both versions of maven. Author: Andrew Or Closes #7219 from andrewor14/fix-maven-build and squashes the following commits: 1d37e87 [Andrew Or] Merge branch 'master' of github.com:apache/spark into fix-maven-build 3574ae4 [Andrew Or] Review comments f39199c [Andrew Or] Create a -Prelease profile that flags `createDependencyReducedPom` (cherry picked from commit 9eae5fa642317dd11fc783d832d4cbb7e62db471) Signed-off-by: Andrew Or --- dev/create-release/create-release.sh | 4 ++-- pom.xml | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh index 54274a83f6d66..cfe2cd4752b3f 100755 --- a/dev/create-release/create-release.sh +++ b/dev/create-release/create-release.sh @@ -118,13 +118,13 @@ if [[ ! 
"$@" =~ --skip-publish ]]; then rm -rf $SPARK_REPO - build/mvn -DskipTests -Pyarn -Phive \ + build/mvn -DskipTests -Pyarn -Phive -Prelease-profile\ -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \ clean install ./dev/change-version-to-2.11.sh - build/mvn -DskipTests -Pyarn -Phive \ + build/mvn -DskipTests -Pyarn -Phive -Prelease-profile\ -Dscala-2.11 -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \ clean install diff --git a/pom.xml b/pom.xml index 75b27f95b5b6a..e1b3e879b8241 100644 --- a/pom.xml +++ b/pom.xml @@ -162,6 +162,8 @@ 2.4.4 1.1.1.7 1.1.2 + + false ${java.home} @@ -1428,6 +1430,8 @@ 2.3 false + + ${create.dependency.reduced.pom} @@ -1812,6 +1816,26 @@ + + + release-profile + + + true + + + - release-profile + release org.apache.maven.plugins maven-surefire-plugin 2.18.1 ... true ... ``` but is absent in the config ScalaTest. Author: Kousuke Saruta Closes #7308 from sarutak/add-setting-for-memory-leak and squashes the following commits: 95644e7 [Kousuke Saruta] Added a setting for memory leak (cherry picked from commit aba5784dab24c03ddad89f7a1b5d3d0dc8d109be) Signed-off-by: Kousuke Saruta --- pom.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/pom.xml b/pom.xml index 9953ad03aaed4..512bc87f2f907 100644 --- a/pom.xml +++ b/pom.xml @@ -1294,6 +1294,7 @@ false false true + true From dfc99715e3f581ae96682df04805dc9ce271a0b7 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Thu, 9 Jul 2015 13:22:17 -0700 Subject: [PATCH 30/35] [SPARK-7419] [STREAMING] [TESTS] Fix CheckpointSuite.recovery with file input stream Fix this failure: https://amplab.cs.berkeley.edu/jenkins/job/Spark-Master-SBT/2886/AMPLAB_JENKINS_BUILD_PROFILE=hadoop2.3,label=centos/testReport/junit/org.apache.spark.streaming/CheckpointSuite/recovery_with_file_input_stream/ To reproduce this failure, you can add `Thread.sleep(2000)` before this line https://github.com/apache/spark/blob/a9c4e29950a14e32acaac547e9a0e8879fd37fc9/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala#L477 Author: zsxwing Closes #7323 from zsxwing/SPARK-7419 and squashes the following commits: b3caf58 [zsxwing] Fix CheckpointSuite.recovery with file input stream --- .../spark/streaming/CheckpointSuite.scala | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index 6b0a3f91d4d06..6a94928076236 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -424,11 +424,11 @@ class CheckpointSuite extends TestSuiteBase { } } } - clock.advance(batchDuration.milliseconds) eventually(eventuallyTimeout) { // Wait until all files have been recorded and all batches have started assert(recordedFiles(ssc) === Seq(1, 2, 3) && batchCounter.getNumStartedBatches === 3) } + clock.advance(batchDuration.milliseconds) // Wait for a checkpoint to be written eventually(eventuallyTimeout) { assert(Checkpoint.getCheckpointFiles(checkpointDir).size === 6) @@ -454,9 +454,12 @@ class CheckpointSuite extends TestSuiteBase { // recorded before failure were saved and successfully recovered logInfo("*********** RESTARTING ************") withStreamingContext(new StreamingContext(checkpointDir)) { ssc => - // So that the restarted StreamingContext's clock has gone forward in time since failure - ssc.conf.set("spark.streaming.manualClock.jump", 
(batchDuration * 3).milliseconds.toString)
-      val oldClockTime = clock.getTimeMillis()
+      // "batchDuration.milliseconds * 3" worth of time has passed before the StreamingContext
+      // restarts. Because the recovery time is read from the checkpoint time but the original
+      // clock doesn't align with the batch time, we need to add the offset
+      // "batchDuration.milliseconds / 2".
+      ssc.conf.set("spark.streaming.manualClock.jump",
+        (batchDuration.milliseconds / 2 + batchDuration.milliseconds * 3).toString)
+      val oldClockTime = clock.getTimeMillis() // 15000ms
       clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
       val batchCounter = new BatchCounter(ssc)
       val outputStream = ssc.graph.getOutputStreams().head.asInstanceOf[TestOutputStream[Int]]
@@ -467,10 +470,10 @@
       ssc.start()
       // Verify that the clock has traveled forward to the expected time
       eventually(eventuallyTimeout) {
-        clock.getTimeMillis() === oldClockTime
+        assert(clock.getTimeMillis() === oldClockTime)
       }
-      // Wait for pre-failure batch to be recomputed (3 while SSC was down plus last batch)
-      val numBatchesAfterRestart = 4
+      // There are 5 batches between 6000ms and 15000ms (inclusive).
+      val numBatchesAfterRestart = 5
       eventually(eventuallyTimeout) {
         assert(batchCounter.getNumCompletedBatches === numBatchesAfterRestart)
       }
@@ -483,7 +486,6 @@
           assert(batchCounter.getNumCompletedBatches === index + numBatchesAfterRestart + 1)
         }
       }
-      clock.advance(batchDuration.milliseconds)
       logInfo("Output after restart = " + outputStream.output.mkString("[", ", ", "]"))
       assert(outputStream.output.size > 0, "No files processed after restart")
       ssc.stop()

From 990f434e6eea5a78c1cff51595197c7986545167 Mon Sep 17 00:00:00 2001
From: Andrew Or
Date: Thu, 9 Jul 2015 13:25:11 -0700
Subject: [PATCH 31/35] [SPARK-2017] [UI] Stage page hangs with many tasks

(This reopens a patch that was closed in the past: #6248)

When you view the stage page while running the following:
```
sc.parallelize(1 to X, 10000).count()
```
The page never loads, the job is stalled, and you end up running into an OOM:
```
HTTP ERROR 500

Problem accessing /stages/stage/. Reason:
    Server Error
Caused by:
java.lang.OutOfMemoryError: Java heap space
    at java.util.Arrays.copyOf(Arrays.java:2367)
    at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:130)
```
This patch compresses Jetty responses with gzip. The correct long-term fix is to add pagination.
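The essence of the fix, condensed from the `JettyUtils` diff below: wrap each context handler in a Jetty `GzipHandler` before registering it, so that large pages (such as a stage page listing tens of thousands of tasks) are compressed on the wire.

```scala
val gzipHandlers = handlers.map { h =>
  val gzipHandler = new GzipHandler
  gzipHandler.setHandler(h)
  gzipHandler
}
collection.setHandlers(gzipHandlers.toArray)
```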
Author: Andrew Or Closes #7296 from andrewor14/gzip-jetty and squashes the following commits: a051c64 [Andrew Or] Use GZIP to compress Jetty responses (cherry picked from commit ebdf58538058e57381c04b6725d4be0c37847ed3) Signed-off-by: Andrew Or --- .../main/scala/org/apache/spark/ui/JettyUtils.scala | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index 06e616220c706..f413c1d37fbb6 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -210,10 +210,16 @@ private[spark] object JettyUtils extends Logging { conf: SparkConf, serverName: String = ""): ServerInfo = { - val collection = new ContextHandlerCollection - collection.setHandlers(handlers.toArray) addFilters(handlers, conf) + val collection = new ContextHandlerCollection + val gzipHandlers = handlers.map { h => + val gzipHandler = new GzipHandler + gzipHandler.setHandler(h) + gzipHandler + } + collection.setHandlers(gzipHandlers.toArray) + // Bind to the given port, or throw a java.net.BindException if the port is occupied def connect(currentPort: Int): (Server, Int) = { val server = new Server(new InetSocketAddress(hostName, currentPort)) From 2f2f9da4b71ec9aeb67044cf0065ce951a3ac985 Mon Sep 17 00:00:00 2001 From: guowei2 Date: Thu, 9 Jul 2015 15:01:53 -0700 Subject: [PATCH 32/35] [SPARK-8865] [STREAMING] FIX BUG: check key in kafka params Author: guowei2 Closes #7254 from guowei2/spark-8865 and squashes the following commits: 48ca17a [guowei2] fix contains key (cherry picked from commit 897700369f3aedf1a8fdb0984dd3d6d8e498e3af) Signed-off-by: Tathagata Das --- .../scala/org/apache/spark/streaming/kafka/KafkaCluster.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala index 65d51d87f8486..c88b5db3f491a 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala @@ -402,7 +402,7 @@ object KafkaCluster { } Seq("zookeeper.connect", "group.id").foreach { s => - if (!props.contains(s)) { + if (!props.containsKey(s)) { props.setProperty(s, "") } } From bef05915088aaba42678fc74b2f359017e218d27 Mon Sep 17 00:00:00 2001 From: Michael Vogiatzis Date: Thu, 9 Jul 2015 19:53:23 -0700 Subject: [PATCH 33/35] [DOCS] Added important updateStateByKey details Runs for *all* existing keys and returning "None" will remove the key-value pair. Author: Michael Vogiatzis Closes #7229 from mvogiatzis/patch-1 and squashes the following commits: e7a2946 [Michael Vogiatzis] Updated updateStateByKey text 00283ed [Michael Vogiatzis] Removed space c2656f9 [Michael Vogiatzis] Moved description farther up 0a42551 [Michael Vogiatzis] Added important updateStateByKey details (cherry picked from commit d538919cc4fd3ab940d478c62dce1bae0270cfeb) Signed-off-by: Tathagata Das --- docs/streaming-programming-guide.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 1eb3b30332e4f..33b279cf3492f 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -854,6 +854,8 @@ it with new information. To use this, you will have to do two steps. 1. 
Define the state update function - Specify with a function how to update the state using the previous state and the new values from an input stream. +In every batch, Spark will apply the state update function for all existing keys, regardless of whether they have new data in a batch or not. If the update function returns `None` then the key-value pair will be eliminated. + Let's illustrate this with an example. Say you want to maintain a running count of each word seen in a text data stream. Here, the running count is the state and it is an integer. We define the update function as: From 898e5f76f56ef0c1c6563b0cb59f38a7dd95dae5 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sat, 11 Jul 2015 01:05:40 -0700 Subject: [PATCH 34/35] [SPARK-8990] [SQL] SPARK-8990 DataFrameReader.parquet() should respect user specified options (for branch-1.4) Backports PR #7347 (SPARK-8990) to branch-1.4. Author: Cheng Lian Closes #7351 from liancheng/spark-8990-for-1.4 and squashes the following commits: ffb5a73 [Cheng Lian] Backports PR #7347 (SPARK-8990) to branch-1.4 --- .../apache/spark/sql/DataFrameReader.scala | 2 +- .../spark/sql/parquet/ParquetQuerySuite.scala | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index b44d4c86ac5d3..472172ef56041 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -263,7 +263,7 @@ class DataFrameReader private[sql](sqlContext: SQLContext) { val globbedPaths = paths.map(new Path(_)).flatMap(SparkHadoopUtil.get.globPath).toArray sqlContext.baseRelationToDataFrame( new ParquetRelation2( - globbedPaths.map(_.toString), None, None, Map.empty[String, String])(sqlContext)) + globbedPaths.map(_.toString), None, None, extraOptions.toMap)(sqlContext)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index 304936fb2be8e..520edfc5b1087 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.parquet +import org.apache.hadoop.fs.Path import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.types._ @@ -124,6 +125,24 @@ class ParquetQuerySuiteBase extends QueryTest with ParquetTest { checkAnswer(df2, df.collect().toSeq) } } + + test("SPARK-8990 DataFrameReader.parquet() should respect user specified options") { + withTempPath { dir => + val basePath = dir.getCanonicalPath + sqlContext.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) + sqlContext.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=a").toString) + + assertResult(2) { + // Disables schema merging via data source option + sqlContext.read.option("mergeSchema", "false").parquet(basePath).columns.length + } + + assertResult(3) { + // Enables schema merging via data source option + sqlContext.read.option("mergeSchema", "true").parquet(basePath).columns.length + } + } + } } class ParquetDataSourceOnQuerySuite extends ParquetQuerySuiteBase with BeforeAndAfterAll { From 5819266295dd8f31379c0f353c39b4acd4633697 Mon Sep 17 00:00:00 2001 From: Mark Hamstra Date: Mon, 13 Jul 2015 00:12:16 -0700 Subject: [PATCH 35/35] bumped version for 
1.4.1 --- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml | 2 +- 27 files changed, 27 insertions(+), 27 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 884447a7b4943..ba2350b5c17f5 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../pom.xml diff --git a/bagel/pom.xml b/bagel/pom.xml index d19118c418fcb..fbde3e22bb85f 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 4e62d294e07a2..436ac62b06152 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../pom.xml diff --git a/examples/pom.xml b/examples/pom.xml index 4c4a40bed4d98..25c0e4a8f56f0 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 052aeba88cf97..5feb8460fb585 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index ea5f07766cba0..d5ec8112d189c 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index 2cde6966a7372..a9a96bd87a033 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index 9b107766ad061..f97511ebbc4ba 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 143b2a21f9b0b..00f0b2cae129d 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index d8d624479a4ad..e55ec4b9349ec 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index aac75e0d6cb53..acdd5e447b25c 100644 --- 
a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 7a5e0d92059cf..986612d7f31c0 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 973f7cf021619..757dcc94a915d 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 7f365a04de9ea..0fe3ac62f79c5 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../pom.xml diff --git a/network/common/pom.xml b/network/common/pom.xml index 0fc0a536ec0c9..90a0c3d6ce7ac 100644 --- a/network/common/pom.xml +++ b/network/common/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/network/shuffle/pom.xml b/network/shuffle/pom.xml index 612c32e58db8f..563d8c0bf014f 100644 --- a/network/shuffle/pom.xml +++ b/network/shuffle/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/network/yarn/pom.xml b/network/yarn/pom.xml index a9ba04ec5b759..afbb5e84326be 100644 --- a/network/yarn/pom.xml +++ b/network/yarn/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/pom.xml b/pom.xml index 209e5a33b14ae..29c0e5640b7a0 100644 --- a/pom.xml +++ b/pom.xml @@ -25,7 +25,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/repl/pom.xml b/repl/pom.xml index a0df097bce290..c2a0fbb621dde 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index b7e4bc9f80f41..2d4b1ddb920f3 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index a3e0c5df02d64..fac059e14c0e5 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 8286d6c47193e..8fab04e76d855 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index f157b88725c71..1967c2f9ace25 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 271a8f59794e6..f031dd24be7ca 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml 
index a7b0cb4bd4737..9f1b1f86979ef 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../pom.xml diff --git a/unsafe/pom.xml b/unsafe/pom.xml index 046448ed4238a..7d8b5532e5fd5 100644 --- a/unsafe/pom.xml +++ b/unsafe/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../pom.xml diff --git a/yarn/pom.xml b/yarn/pom.xml index 8311958397110..65386d8eeb41a 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.4.0-csd-5-SNAPSHOT + 1.4.1-csd-1-SNAPSHOT ../pom.xml
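The GzipHandler change in PATCH 31 above wraps each Spark UI context handler so that Jetty compresses responses for clients that send Accept-Encoding: gzip. A minimal standalone sketch of the same wrapping pattern, assuming the Jetty 8.x API that Spark 1.4 builds against; the port and the servlet handler are illustrative and not taken from the patch:

import org.eclipse.jetty.server.Server
import org.eclipse.jetty.server.handler.{ContextHandlerCollection, GzipHandler}
import org.eclipse.jetty.servlet.{DefaultServlet, ServletContextHandler}

object GzipJettyExample {
  def main(args: Array[String]): Unit = {
    // A stand-in for the UI's context handlers.
    val root = new ServletContextHandler
    root.setContextPath("/")
    root.addServlet(classOf[DefaultServlet], "/")
    val handlers = Seq(root)

    // Same pattern as the patch: wrap each handler in a GzipHandler
    // before registering it with the collection.
    val gzipHandlers = handlers.map { h =>
      val gzipHandler = new GzipHandler
      gzipHandler.setHandler(h)
      gzipHandler
    }

    val collection = new ContextHandlerCollection
    collection.setHandlers(gzipHandlers.toArray)

    val server = new Server(8080) // illustrative port
    server.setHandler(collection)
    server.start()
    server.join()
  }
}

Wrapping at the handler level compresses every page behind the collection without touching individual servlets, which is why the patch only reorders setup in JettyUtils instead of changing any UI code.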
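The one-word fix in PATCH 32 (contains to containsKey) matters because the kafka params end up in a java.util.Properties, which inherits from java.util.Hashtable: contains(x) tests whether x occurs as a value in the table, while containsKey(x) tests for a key. A small sketch of the difference, with illustrative values:

import java.util.Properties

val props = new Properties()
props.setProperty("zookeeper.connect", "localhost:2181")

// Hashtable#contains searches values, so this is false even though
// "zookeeper.connect" is present as a key:
assert(!props.contains("zookeeper.connect"))
assert(props.contains("localhost:2181"))

// containsKey is the check the patch intends:
assert(props.containsKey("zookeeper.connect"))

With the buggy version, the defaulting loop saw contains("zookeeper.connect") as false for a caller who had actually set that key, because the key name is almost never also a value, and so it clobbered the user-supplied setting with an empty string.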
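The semantics documented in PATCH 33, namely that the update function runs for every tracked key in each batch and that returning None drops the key, are easiest to see in code. A minimal sketch in the spirit of the guide's running word count; the master, source, checkpoint directory, and eviction rule are all illustrative:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf().setMaster("local[2]").setAppName("StatefulWordCount")
val ssc = new StreamingContext(conf, Seconds(1))
ssc.checkpoint("checkpoint") // updateStateByKey requires a checkpoint directory

// Invoked once per batch for *every* key with state, even when newValues
// is empty because the key received no data in that batch.
def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
  val newCount = runningCount.getOrElse(0) + newValues.sum
  // Returning None eliminates the key-value pair from the state.
  if (newValues.isEmpty && newCount < 5) None // illustrative eviction rule
  else Some(newCount)
}

val runningCounts = ssc.socketTextStream("localhost", 9999)
  .flatMap(_.split(" "))
  .map(word => (word, 1))
  .updateStateByKey[Int](updateFunction _)

runningCounts.print()
ssc.start()
ssc.awaitTermination()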
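The one-line change in PATCH 34 (Map.empty to extraOptions.toMap) is what lets per-read options reach ParquetRelation2 at all; before it, anything passed through DataFrameReader.option() was silently dropped on this code path. A hedged usage sketch matching the pattern exercised by the new test, where sqlContext is an existing SQLContext and the path is hypothetical:

// Merge the schemas of all Parquet part-files (slower, but sees every column).
val merged = sqlContext.read.option("mergeSchema", "true").parquet("/data/events")

// Trust a single footer and skip merging (faster; partition-specific columns
// may be missing from the result schema).
val single = sqlContext.read.option("mergeSchema", "false").parquet("/data/events")

merged.printSchema()
single.printSchema()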