@@ -1894,8 +1894,8 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
18941894 }
18951895 }
18961896
1897- def testLineSeparator (lineSep : String ): Unit = {
1898- test(s " Support line separator - lineSep: ' $lineSep ' " ) {
1897+ def testLineSeparator (lineSep : String , encoding : String , inferSchema : Boolean , id : Int ): Unit = {
1898+ test(s " Support line separator in ${encoding} # ${id} " ) {
18991899 // Read
19001900 val data =
19011901 s """ "a",1 $lineSep
@@ -1905,17 +1905,23 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
19051905
19061906 Seq (data, dataWithTrailingLineSep).foreach { lines =>
19071907 withTempPath { path =>
1908- Files .write(path.toPath, lines.getBytes(StandardCharsets . UTF_8 ))
1909- val schema = StructType (StructField (" f " , StringType )
1910- :: StructField (" f0 " , LongType ) :: Nil )
1908+ Files .write(path.toPath, lines.getBytes(encoding ))
1909+ val schema = StructType (StructField (" _c0 " , StringType )
1910+ :: StructField (" _c1 " , LongType ) :: Nil )
19111911
1912- val expected = Seq ((" a" , 1 ), (" \n c" , 2 ), (" \n d" , 3 )).toDF()
1912+ val expected = Seq ((" a" , 1 ), (" \n c" , 2 ), (" \n d" , 3 ))
1913+ .toDF(" _c0" , " _c1" )
19131914 Seq (false , true ).foreach { multiLine =>
1914- val df = spark.read
1915- .schema(schema)
1915+ val reader = spark
1916+ .read
19161917 .option(" lineSep" , lineSep)
19171918 .option(" multiLine" , multiLine)
1918- .csv(path.getAbsolutePath)
1919+ .option(" encoding" , encoding)
1920+ val df = if (inferSchema) {
1921+ reader.option(" inferSchema" , true ).csv(path.getAbsolutePath)
1922+ } else {
1923+ reader.schema(schema).csv(path.getAbsolutePath)
1924+ }
19191925 checkAnswer(df, expected)
19201926 }
19211927 }
@@ -1924,26 +1930,50 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
19241930 // Write
19251931 withTempPath { path =>
19261932 Seq (" a" , " b" , " c" ).toDF(" value" ).coalesce(1 )
1927- .write.option(" lineSep" , lineSep).csv(path.getAbsolutePath)
1933+ .write
1934+ .option(" lineSep" , lineSep)
1935+ .option(" encoding" , encoding)
1936+ .csv(path.getAbsolutePath)
19281937 val partFile = TestUtils .recursiveList(path).filter(f => f.getName.startsWith(" part-" )).head
1929- val readBack = new String (Files .readAllBytes(partFile.toPath), StandardCharsets . UTF_8 )
1938+ val readBack = new String (Files .readAllBytes(partFile.toPath), encoding )
19301939 assert(
19311940 readBack === s " a ${lineSep}b ${lineSep}c ${lineSep}" )
19321941 }
19331942
19341943 // Roundtrip
19351944 withTempPath { path =>
19361945 val df = Seq (" a" , " b" , " c" ).toDF()
1937- df.write.option(" lineSep" , lineSep).csv(path.getAbsolutePath)
1938- val readBack = spark.read.option(" lineSep" , lineSep).csv(path.getAbsolutePath)
1946+ df.write
1947+ .option(" lineSep" , lineSep)
1948+ .option(" encoding" , encoding)
1949+ .csv(path.getAbsolutePath)
1950+ val readBack = spark
1951+ .read
1952+ .option(" lineSep" , lineSep)
1953+ .option(" encoding" , encoding)
1954+ .csv(path.getAbsolutePath)
19391955 checkAnswer(df, readBack)
19401956 }
19411957 }
19421958 }
19431959
19441960 // scalastyle:off nonascii
1945- Seq (" |" , " ^" , " ::" , 0x1E .toChar.toString).foreach { lineSep =>
1946- testLineSeparator(lineSep)
// Registers one line-separator test per tuple below.
// Tuple layout: (testNum, sep, encoding, inferSchema)
//   testNum     - passed as `id`, which is interpolated into the generated test name
//   sep         - value supplied to the "lineSep" option for reads and writes
//   encoding    - charset used to encode the input file and supplied to the "encoding" option
//   inferSchema - true: read with the "inferSchema" option; false: read with the explicit schema
// Encoding names appear in both upper and lower case (e.g. "UTF-16BE" and "utf-16be").
// NOTE(review): test numbers jump from 6 to 8; id 7 is not present in this list - confirm
// whether a case was dropped intentionally.
1961+ List (
1962+ (0 , " |" , " UTF-8" , false ),
1963+ (1 , " ^" , " UTF-16BE" , true ),
1964+ (2 , " ::" , " ISO-8859-1" , true ),
1965+ (3 , " !!" , " UTF-32LE" , false ),
1966+ (4 , 0x1E .toChar.toString, " UTF-8" , true ),
1967+ (5 , " 아" , " UTF-32BE" , false ),
1968+ (6 , " ку" , " CP1251" , true ),
1969+ (8 , " \r\n " , " UTF-16LE" , true ),
1970+ (9 , " \r\n " , " utf-16be" , false ),
1971+ (10 , " \u000d\u000a " , " UTF-32BE" , false ),
1972+ (11 , " \u000a\u000d " , " UTF-8" , true ),
1973+ (12 , " ==" , " US-ASCII" , false ),
1974+ (13 , " $^" , " utf-32le" , true )
1975+ ).foreach { case (testNum, sep, encoding, inferSchema) =>
1976+ testLineSeparator(sep, encoding, inferSchema, testNum)
19471977 }
19481978 // scalastyle:on nonascii
19491979
0 commit comments