@@ -55,44 +55,43 @@ object DataSourceWriteBenchmark {
5555 }
5656 }
5757
58- def writeInt (table : String , format : String , benchmark : Benchmark ): Unit = {
59- spark.sql(s " create table $table(c1 INT, c2 STRING) using $format" )
60- benchmark.addCase(" Output Single Int Column" ) { _ =>
61- spark.sql(s " INSERT overwrite table $table select cast(id as INT) as " +
62- s " c1, cast(id as STRING) as c2 from $tempTable" )
58+ def writeNumeric (table : String , format : String , benchmark : Benchmark , dataType : String ): Unit = {
59+ spark.sql(s " create table $table(id $dataType) using $format" )
60+ benchmark.addCase(s " Output Single $dataType Column " ) { _ =>
61+ spark.sql(s " INSERT OVERWRITE TABLE $table SELECT CAST(id AS $dataType) AS c1 FROM $tempTable" )
6362 }
6463 }
6564
6665 def writeIntString (table : String , format : String , benchmark : Benchmark ): Unit = {
67- spark.sql(s " create table $table(c1 INT, c2 STRING) using $format" )
66+ spark.sql(s " CREATE TABLE $table(c1 INT, c2 STRING) USING $format" )
6867 benchmark.addCase(" Output Int and String Column" ) { _ =>
69- spark.sql(s " INSERT overwrite table $table select cast (id as INT) as " +
70- s " c1, cast (id as STRING) as c2 from $tempTable" )
68+ spark.sql(s " INSERT OVERWRITE TABLE $table SELECT CAST (id AS INT) AS " +
69+ s " c1, CAST (id AS STRING) AS c2 FROM $tempTable" )
7170 }
7271 }
7372
7473 def writePartition (table : String , format : String , benchmark : Benchmark ): Unit = {
75- spark.sql(s " create table $table(p INT, id INT) using $format PARTITIONED BY (p) " )
74+ spark.sql(s " CREATE TABLE $table(p INT, id INT) USING $format PARTITIONED BY (p) " )
7675 benchmark.addCase(" Output Partitions" ) { _ =>
77- spark.sql(s " INSERT overwrite table $table select cast (id as INT) as id, " +
78- s " cast (id % 2 as INT) as p from $tempTable" )
76+ spark.sql(s " INSERT OVERWRITE TABLE $table SELECT CAST (id AS INT) AS id, " +
77+ s " CAST (id % 2 AS INT) AS p FROM $tempTable" )
7978 }
8079 }
8180
8281 def writeBucket (table : String , format : String , benchmark : Benchmark ): Unit = {
83- spark.sql(s " create table $table(c1 INT, c2 INT) using $format CLUSTERED BY (c2) INTO 2 BUCKETS " )
82+ spark.sql(s " CREATE TABLE $table(c1 INT, c2 INT) USING $format CLUSTERED BY (c2) INTO 2 BUCKETS " )
8483 benchmark.addCase(" Output Buckets" ) { _ =>
85- spark.sql(s " INSERT overwrite table $table select cast (id as INT) as " +
86- s " c1, cast (id as INT) as c2 from $tempTable" )
84+ spark.sql(s " INSERT OVERWRITE TABLE $table SELECT CAST (id AS INT) AS " +
85+ s " c1, CAST (id AS INT) AS c2 FROM $tempTable" )
8786 }
8887 }
8988
9089 def main (args : Array [String ]): Unit = {
9190 val tableInt = " tableInt"
91+ val tableDouble = " tableDouble"
9292 val tableIntString = " tableIntString"
9393 val tablePartition = " tablePartition"
9494 val tableBucket = " tableBucket"
95- // If the
9695 val formats : Seq [String ] = if (args.isEmpty) {
9796 Seq (" Parquet" , " ORC" , " JSON" , " CSV" )
9897 } else {
@@ -102,38 +101,43 @@ object DataSourceWriteBenchmark {
102101 Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
103102 Parquet writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
104103 ------------------------------------------------------------------------------------------------
105- Output Single Int Column 6054 / 6070 2.6 384.9 1.0X
106- Output Int and String Column 5784 / 5800 2.7 367.8 1.0X
107- Output Partitions 3891 / 3904 4.0 247.4 1.6X
108- Output Buckets 5446 / 5729 2.9 346.2 1.1X
104+ Output Single Int Column 1815 / 1932 8.7 115.4 1.0X
105+ Output Single Double Column 1877 / 1878 8.4 119.3 1.0X
106+ Output Int and String Column 6265 / 6543 2.5 398.3 0.3X
107+ Output Partitions 4067 / 4457 3.9 258.6 0.4X
108+ Output Buckets 5608 / 5820 2.8 356.6 0.3X
109109
110110 ORC writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
111111 ------------------------------------------------------------------------------------------------
112- Output Single Int Column 5734 / 5823 2.7 364.6 1.0X
113- Output Int and String Column 5802 / 5839 2.7 368.9 1.0X
114- Output Partitions 3384 / 3671 4.6 215.1 1.7X
115- Output Buckets 4950 / 4988 3.2 314.7 1.2X
112+ Output Single Int Column 1201 / 1239 13.1 76.3 1.0X
113+ Output Single Double Column 1542 / 1600 10.2 98.0 0.8X
114+ Output Int and String Column 6495 / 6580 2.4 412.9 0.2X
115+ Output Partitions 3648 / 3842 4.3 231.9 0.3X
116+ Output Buckets 5022 / 5145 3.1 319.3 0.2X
116117
117118 JSON writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
118119 ------------------------------------------------------------------------------------------------
119- Output Single Int Column 5576 / 5594 2.8 354.5 1.0X
120- Output Int and String Column 5550 / 5620 2.8 352.9 1.0X
121- Output Partitions 3727 / 4100 4.2 237.0 1.5X
122- Output Buckets 5316 / 5852 3.0 338.0 1.0X
120+ Output Single Int Column 1988 / 2093 7.9 126.4 1.0X
121+ Output Single Double Column 2854 / 2911 5.5 181.4 0.7X
122+ Output Int and String Column 6467 / 6653 2.4 411.1 0.3X
123+ Output Partitions 4548 / 5055 3.5 289.1 0.4X
124+ Output Buckets 5664 / 5765 2.8 360.1 0.4X
123125
124126 CSV writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
125127 ------------------------------------------------------------------------------------------------
126- Output Single Int Column 7064 / 8714 2.2 449.1 1.0X
127- Output Int and String Column 7114 / 7663 2.2 452.3 1.0X
128- Output Partitions 5771 / 6228 2.7 366.9 1.2X
129- Output Buckets 7414 / 7479 2.1 471.3 1.0X
128+ Output Single Int Column 3025 / 3190 5.2 192.3 1.0X
129+ Output Single Double Column 3575 / 3634 4.4 227.3 0.8X
130+ Output Int and String Column 7313 / 7399 2.2 464.9 0.4X
131+ Output Partitions 5105 / 5190 3.1 324.6 0.6X
132+ Output Buckets 6986 / 6992 2.3 444.1 0.4X
130133 */
131134 withTempTable(tempTable) {
132135 spark.range(numRows).createOrReplaceTempView(tempTable)
133136 formats.foreach { format =>
134- withTable(tableInt, tableIntString, tablePartition, tableBucket) {
137+ withTable(tableInt, tableDouble, tableIntString, tablePartition, tableBucket) {
135138 val benchmark = new Benchmark (s " $format writer benchmark " , numRows)
136- writeInt(tableInt, format, benchmark)
139+ writeNumeric(tableInt, format, benchmark, " Int" )
140+ writeNumeric(tableDouble, format, benchmark, " Double" )
137141 writeIntString(tableIntString, format, benchmark)
138142 writePartition(tablePartition, format, benchmark)
139143 writeBucket(tableBucket, format, benchmark)
0 commit comments