Skip to content

Commit a83accf

Browse files
zsxwingcloud-fan
authored andcommitted
[SPARK-19065][SQL] Don't inherit expression id in dropDuplicates
## What changes were proposed in this pull request? `dropDuplicates` will create an Alias using the same exprId, so `StreamExecution` should also replace Alias if necessary. ## How was this patch tested? test("SPARK-19065: dropDuplicates should not create expressions using the same id") Author: Shixiong Zhu <[email protected]> Closes #16564 from zsxwing/SPARK-19065.
1 parent 20e6280 commit a83accf

File tree

3 files changed

+27
-11
lines changed

3 files changed

+27
-11
lines changed

sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2003,10 +2003,7 @@ class Dataset[T] private[sql](
20032003
if (groupColExprIds.contains(attr.exprId)) {
20042004
attr
20052005
} else {
2006-
// Removing duplicate rows should not change output attributes. We should keep
2007-
// the original exprId of the attribute. Otherwise, to select a column in original
2008-
// dataset will cause analysis exception due to unresolved attribute.
2009-
Alias(new First(attr).toAggregateExpression(), attr.name)(exprId = attr.exprId)
2006+
Alias(new First(attr).toAggregateExpression(), attr.name)()
20102007
}
20112008
}
20122009
Aggregate(groupCols, aggCols, logicalPlan)

sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -898,13 +898,6 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
898898
(1, 2), (1, 1), (2, 1), (2, 2))
899899
}
900900

901-
test("dropDuplicates should not change child plan output") {
902-
val ds = Seq(("a", 1), ("a", 2), ("b", 1), ("a", 1)).toDS()
903-
checkDataset(
904-
ds.dropDuplicates("_1").select(ds("_1").as[String], ds("_2").as[Int]),
905-
("a", 1), ("b", 1))
906-
}
907-
908901
test("SPARK-16097: Encoders.tuple should handle null object correctly") {
909902
val enc = Encoders.tuple(Encoders.tuple(Encoders.STRING, Encoders.STRING), Encoders.STRING)
910903
val data = Seq((("a", "b"), "c"), (null, "d"))

sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,32 @@ class StreamSuite extends StreamTest {
304304
q.stop()
305305
}
306306
}
307+
308+
test("SPARK-19065: dropDuplicates should not create expressions using the same id") {
309+
withTempPath { testPath =>
310+
val data = Seq((1, 2), (2, 3), (3, 4))
311+
data.toDS.write.mode("overwrite").json(testPath.getCanonicalPath)
312+
val schema = spark.read.json(testPath.getCanonicalPath).schema
313+
val query = spark
314+
.readStream
315+
.schema(schema)
316+
.json(testPath.getCanonicalPath)
317+
.dropDuplicates("_1")
318+
.writeStream
319+
.format("memory")
320+
.queryName("testquery")
321+
.outputMode("complete")
322+
.start()
323+
try {
324+
query.processAllAvailable()
325+
if (query.exception.isDefined) {
326+
throw query.exception.get
327+
}
328+
} finally {
329+
query.stop()
330+
}
331+
}
332+
}
307333
}
308334

309335
/**

0 commit comments

Comments
 (0)