SPARK-12417. [SQL] Orc bloom filter options are not propagated during… #10842
Changes from all commits
@@ -162,6 +162,10 @@ private[sql] class OrcRelation(
   extends HadoopFsRelation(maybePartitionSpec, parameters)
   with Logging {

+  /* org.apache.orc contains OrcConf which can be used later */
+  private[sql] val ORC_BLOOM_FILTER_COLUMNS = "orc.bloom.filter.columns"
Contributor
It is really confusing to have ORC_BLOOM_FILTER_COLUMNS and orcBloomFilterCols in the same class. I'd just inline the string in orcBloomFilterCols. Same for orcBloomFilterFpp.
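For illustration, the reviewer's suggestion would look roughly like this (a minimal sketch of the proposed inlining, not code from this PR):

```scala
// Sketch: drop the separate key constants and pass the ORC option
// strings directly where the write parameters are read.
private val orcBloomFilterCols = parameters.get("orc.bloom.filter.columns")
private val orcBloomFilterFpp = parameters.get("orc.bloom.filter.fpp")
```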
+  private[sql] val ORC_BLOOM_FILTER_FPP = "orc.bloom.filter.fpp"
+
   private[sql] def this(
       paths: Array[String],
       maybeDataSchema: Option[StructType],
@@ -177,6 +181,9 @@ private[sql] class OrcRelation(
       parameters)(sqlContext)
   }

+  private val orcBloomFilterCols = parameters.get(ORC_BLOOM_FILTER_COLUMNS)
+  private val orcBloomFilterFpp = parameters.get(ORC_BLOOM_FILTER_FPP)
+
   override val dataSchema: StructType = maybeDataSchema.getOrElse {
     OrcFileOperator.readSchema(
       paths.head, Some(sqlContext.sparkContext.hadoopConfiguration))
@@ -211,6 +218,9 @@ private[sql] class OrcRelation(
   }

   override def prepareJobForWrite(job: Job): BucketedOutputWriterFactory = {
+    val conf = job.getConfiguration
+    orcBloomFilterCols.map(conf.set(ORC_BLOOM_FILTER_COLUMNS, _))
+    orcBloomFilterFpp.map(conf.set(ORC_BLOOM_FILTER_FPP, _))
     job.getConfiguration match {
       case conf: JobConf =>
         conf.setOutputFormat(classOf[OrcOutputFormat])
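These relation `parameters` are populated from the options passed to the `DataFrameWriter`, so with this change a user can enable ORC Bloom filters at write time. A minimal usage sketch (the column name and output path are illustrative):

```scala
// Assumes an existing DataFrame `df` with a column named "name".
df.write
  .option("orc.bloom.filter.columns", "name") // comma-separated columns, or "*" for all
  .option("orc.bloom.filter.fpp", "0.05")     // Bloom filter false-positive probability
  .orc("/tmp/orc_with_bloom_filters")
```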
@@ -17,10 +17,10 @@

 package org.apache.spark.sql.hive.orc

-import java.io.File
+import java.io.{PrintStream, ByteArrayOutputStream, File}

 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
-import org.apache.hadoop.hive.ql.io.orc.CompressionKind
+import org.apache.hadoop.hive.ql.io.orc.{FileDump, CompressionKind}
 import org.scalatest.BeforeAndAfterAll

 import org.apache.spark.sql._
@@ -394,4 +394,42 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
       }
     }
   }
+
+  test("SPARK-12417. Orc bloom filter options are not propagated during file " +
Contributor
Change the title to "SPARK-12417: Propagate Bloom filters when writing out ORC files".
+    "generation") {
+    withTempPath { dir =>
+      // Create separate sub dir for bloom filter testing
+      val path = new File(dir, "orc").getCanonicalPath
+
+      // Write some data
+      val data = (0 until 10).map { i =>
Contributor
Why does the data need to be in this form? Can't it just be a list of numbers from 0 to 10?
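If I read the suggestion right, the simpler variant would look something like this (a sketch only; the `Tuple1` wrapping is an assumption so that `toDF` gets a single column):

```scala
// Hypothetical simplification per the review comment: plain integers
// instead of Option-typed tuples.
val data = (0 until 10).map(Tuple1(_))
createDataFrame(data).toDF("a")
  .write.option("orc.bloom.filter.columns", "*").orc(path)
```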
+        val maybeInt = if (i % 2 == 0) None else Some(i)
+        val nullValue: Option[String] = None
+        (maybeInt, nullValue)
+      }
+
+      // Dump data to orc
+      createDataFrame(data).toDF("a", "b")
+        .write.option("orc.bloom.filter.columns", "*").orc(path)
+
+      // Verify if orc bloom filters are present. This can be verified via
+      // ORC RecordReaderImpl when it is made public. Until then, verify by
+      // dumping file statistics and checking whether bloom filter was added.
+      new File(path).listFiles().filter(_.getName.endsWith("orc")) map { file =>
+        withTempStream { buf =>
+          val fileDumpArgs = Array(file.getCanonicalPath)
+          FileDump.main(fileDumpArgs)
Contributor
This seems really hacky and is depending on some non-public API.

Contributor
Can't you just read the files directly?
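For comparison, "reading the files directly" could mean going through the Hive ORC reader. A hedged sketch (Hive 1.2-era API assumed; note the test comment above says the Bloom filter details in RecordReaderImpl were not public at the time, so only file-level metadata is reachable this way):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.ql.io.orc.OrcFile

// Open one written ORC file and inspect its public metadata.
val reader = OrcFile.createReader(
  new Path(file.getCanonicalPath),
  OrcFile.readerOptions(new Configuration()))
assert(reader.getNumberOfRows == 10) // row count matches what the test wrote
```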
+          assert(buf.toString.contains("BLOOM_FILTER"))
+        }
+      }
+    }
+  }
+
+  def withTempStream(f: ByteArrayOutputStream => Unit): Unit = {
+    val oriStream = System.out
+    val buf = new ByteArrayOutputStream()
+    val stream = new PrintStream(buf)
+    System.setOut(stream)
+    try f(buf) finally System.setOut(oriStream)
+  }
 }
Contributor
It's unclear what this message actually means - is this a TODO for the future?