[SPARK-18671][SS][TEST] Added tests to ensure stability of all Structured Streaming log formats #16128
Changes from all commits: 84efce9, 3d54494, 49e940b, d9be1c5, 4150e56, a7529b3, 8d4ca5e, e3a7422, 26a86d6
@@ -89,4 +89,16 @@ class KafkaSourceOffsetSuite extends OffsetSuite with SharedSQLContext {
      Array(0 -> batch0Serialized, 1 -> batch1Serialized))
  }
}

  test("read Spark 2.1.0 log format") {
    val offset = readFromResource("kafka-source-offset-version-2.1.0.txt")

> Member: nit: maybe no need to read the json from a file, since we never write it into a single file.
> Contributor (Author): yeah, but it's good to have it in a separate file in the same place as the other formats. It will be easier to track all the things that need compatibility guarantees.

    assert(KafkaSourceOffset(offset) ===
      KafkaSourceOffset(("topic1", 0, 456L), ("topic1", 1, 789L), ("topic2", 0, 0L)))
  }

  private def readFromResource(file: String): SerializedOffset = {
    import scala.io.Source
    val str = Source.fromFile(getClass.getResource(s"/structured-streaming/$file").toURI).mkString
    SerializedOffset(str)
  }
}
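For context on where the checked-in JSON comes from, here is a minimal, hypothetical sketch (not part of this PR) of how such an offset resource could be regenerated. It assumes, as the test implies, that KafkaSourceOffset extends the streaming Offset trait and therefore exposes a `json` method; the object name and output path are illustrative only, and the code would need to sit in the org.apache.spark.sql.kafka010 package because KafkaSourceOffset is package-private.

```scala
// Hypothetical regeneration helper, not part of this PR.
// Assumes access to the package-private KafkaSourceOffset and its Offset.json method.
package org.apache.spark.sql.kafka010

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

object RegenerateKafkaOffsetResource {
  def main(args: Array[String]): Unit = {
    // Same partition offsets the test above asserts against.
    val offset = KafkaSourceOffset(("topic1", 0, 456L), ("topic1", 1, 789L), ("topic2", 0, 0L))
    // Expected to produce {"topic1":{"0":456,"1":789},"topic2":{"0":0}} (field order may vary).
    Files.write(
      Paths.get("kafka-source-offset-version-2.1.0.txt"),
      offset.json.getBytes(StandardCharsets.UTF_8))
  }
}
```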
@@ -0,0 +1,9 @@
v1
{"path":"/a/b/0","size":1,"isDir":false,"modificationTime":1,"blockReplication":1,"blockSize":100,"action":"add"}
{"path":"/a/b/1","size":100,"isDir":false,"modificationTime":100,"blockReplication":1,"blockSize":100,"action":"add"}
{"path":"/a/b/2","size":200,"isDir":false,"modificationTime":200,"blockReplication":1,"blockSize":100,"action":"add"}
{"path":"/a/b/3","size":300,"isDir":false,"modificationTime":300,"blockReplication":1,"blockSize":100,"action":"add"}
{"path":"/a/b/4","size":400,"isDir":false,"modificationTime":400,"blockReplication":1,"blockSize":100,"action":"add"}
{"path":"/a/b/5","size":500,"isDir":false,"modificationTime":500,"blockReplication":1,"blockSize":100,"action":"add"}
{"path":"/a/b/6","size":600,"isDir":false,"modificationTime":600,"blockReplication":1,"blockSize":100,"action":"add"}
{"path":"/a/b/7","size":700,"isDir":false,"modificationTime":700,"blockReplication":1,"blockSize":100,"action":"add"}
@@ -0,0 +1,3 @@
v1
{"path":"/a/b/8","size":800,"isDir":false,"modificationTime":800,"blockReplication":1,"blockSize":100,"action":"add"}
{"path":"/a/b/0","size":100,"isDir":false,"modificationTime":100,"blockReplication":1,"blockSize":100,"action":"delete"}
@@ -0,0 +1,2 @@
v1
{"path":"/a/b/9","size":900,"isDir":false,"modificationTime":900,"blockReplication":3,"blockSize":200,"action":"add"}
@@ -0,0 +1,4 @@
v1
{"path":"/a/b/0","timestamp":1480730949000,"batchId":0}
{"path":"/a/b/1","timestamp":1480730950000,"batchId":1}
{"path":"/a/b/2","timestamp":1480730950000,"batchId":2}
@@ -0,0 +1,2 @@
v1
{"path":"/a/b/3","timestamp":1480730950000,"batchId":3}
@@ -0,0 +1,2 @@
v1
{"path":"/a/b/4","timestamp":1480730951000,"batchId":4}
@@ -0,0 +1 @@
345
@@ -0,0 +1 @@
{"topic1":{"0":456,"1":789},"topic2":{"0":0}}
@@ -0,0 +1,4 @@
v1
{"batchWatermarkMs":0,"batchTimestampMs":1480981499528}
0
{"topic-0":{"0":1}}
@@ -19,14 +19,13 @@ package org.apache.spark.sql.streaming

import java.io.File

import scala.collection.mutable

import org.scalatest.PrivateMethodTester
import org.scalatest.time.SpanSugar._

import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.execution.streaming.FileStreamSource.FileEntry
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types._

@@ -1022,6 +1021,33 @@ class FileStreamSourceSuite extends FileStreamSourceTest {
    val options = new FileStreamOptions(Map("maxfilespertrigger" -> "1"))
    assert(options.maxFilesPerTrigger == Some(1))
  }

  test("FileStreamSource offset - read Spark 2.1.0 log format") {
    val offset = readOffsetFromResource("file-source-offset-version-2.1.0.txt")

> Member: nit: maybe no need to read the json from a file, since we never write it into a single file.
> Contributor (Author): same comment as above.

    assert(LongOffset.convert(offset) === Some(LongOffset(345)))
  }

  test("FileStreamSourceLog - read Spark 2.1.0 log format") {
    assert(readLogFromResource("file-source-log-version-2.1.0") === Seq(
      FileEntry("/a/b/0", 1480730949000L, 0L),
      FileEntry("/a/b/1", 1480730950000L, 1L),
      FileEntry("/a/b/2", 1480730950000L, 2L),
      FileEntry("/a/b/3", 1480730950000L, 3L),
      FileEntry("/a/b/4", 1480730951000L, 4L)
    ))
  }

  private def readLogFromResource(dir: String): Seq[FileEntry] = {
    val input = getClass.getResource(s"/structured-streaming/$dir")
    val log = new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, input.toString)
    log.allFiles()
  }

  private def readOffsetFromResource(file: String): SerializedOffset = {
    import scala.io.Source
    val str = Source.fromFile(getClass.getResource(s"/structured-streaming/$file").toURI).mkString
    SerializedOffset(str.trim)
  }
}

class FileStreamSourceStressTestSuite extends FileStreamSourceTest {
> Reviewer: nit: You can use `partitionOffsets.toSeq.sortBy(_._1).foreach { case (tp, off) =>` to simplify the code.
> Author: I want to sort by topic and partitions together, so that partitions are ordered when the json is generated (currently they are not, which makes it hard to read).
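To make the trade-off concrete: the reviewer's one-liner would additionally need an Ordering for the TopicPartition key in scope, while the author's intent is to order by topic first and then by partition number so that the generated json lists partitions in ascending order. A small illustrative sketch of that ordering (names are generic, not the PR's actual code):

```scala
import org.apache.kafka.common.TopicPartition

// Order by (topic, partition) so partitions come out sorted in the generated json.
def sortedOffsets(partitionOffsets: Map[TopicPartition, Long]): Seq[(TopicPartition, Long)] =
  partitionOffsets.toSeq.sortBy { case (tp, _) => (tp.topic, tp.partition) }
```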