-
Notifications
You must be signed in to change notification settings - Fork 227
Fix case for self closing tag inside row #321
Changes from all commits
0a4eb71
e8bc119
b5b3c03
c9a918b
2958b4b
a4c6054
6543e60
f292b9a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,7 +15,7 @@ | |
| */ | ||
| package com.databricks.spark.xml | ||
|
|
||
| import java.io.{InputStream, IOException} | ||
| import java.io.{IOException, InputStream} | ||
| import java.nio.charset.Charset | ||
|
|
||
| import org.apache.hadoop.conf.Configuration | ||
|
|
@@ -25,14 +25,17 @@ import org.apache.hadoop.io.{DataOutputBuffer, LongWritable, Text} | |
| import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext} | ||
| import org.apache.hadoop.mapreduce.lib.input.{FileSplit, TextInputFormat} | ||
|
|
||
| import scala.collection.mutable.ArrayBuffer | ||
|
|
||
| /** | ||
| * Reads records that are delimited by a specific start/end tag. | ||
| */ | ||
| class XmlInputFormat extends TextInputFormat { | ||
|
|
||
| override def createRecordReader( | ||
| split: InputSplit, | ||
| context: TaskAttemptContext): RecordReader[LongWritable, Text] = { | ||
| split: InputSplit, | ||
| context: TaskAttemptContext): | ||
| RecordReader[LongWritable, Text] = { | ||
| new XmlRecordReader | ||
| } | ||
| } | ||
|
|
@@ -47,9 +50,9 @@ object XmlInputFormat { | |
| } | ||
|
|
||
| /** | ||
| * XMLRecordReader class to read through a given xml document to output xml blocks as records | ||
| * as specified by the start tag and end tag | ||
| */ | ||
| * XMLRecordReader class to read through a given xml document to output xml blocks as records | ||
| * as specified by the start tag and end tag | ||
| */ | ||
| private[xml] class XmlRecordReader extends RecordReader[LongWritable, Text] { | ||
| private var startTag: Array[Byte] = _ | ||
| private var currentStartTag: Array[Byte] = _ | ||
|
|
@@ -111,7 +114,7 @@ private[xml] class XmlRecordReader extends RecordReader[LongWritable, Text] { | |
| // So we have a split that is only part of a file stored using | ||
| // a Compression codec that cannot be split. | ||
| throw new IOException("Cannot seek in " + | ||
| codec.getClass.getSimpleName + " compressed stream") | ||
| codec.getClass.getSimpleName + " compressed stream") | ||
| } | ||
| val cIn = c.createInputStream(fsin, decompressor) | ||
| in = cIn | ||
|
|
@@ -131,13 +134,13 @@ private[xml] class XmlRecordReader extends RecordReader[LongWritable, Text] { | |
| } | ||
|
|
||
| /** | ||
| * Finds the start of the next record. | ||
| * It treats data from `startTag` and `endTag` as a record. | ||
| * | ||
| * @param key the current key that will be written | ||
| * @param value the object that will be written | ||
| * @return whether it reads successfully | ||
| */ | ||
| * Finds the start of the next record. | ||
| * It treats data from `startTag` and `endTag` as a record. | ||
| * | ||
| * @param key the current key that will be written | ||
| * @param value the object that will be written | ||
| * @return whether it reads successfully | ||
| */ | ||
| private def next(key: LongWritable, value: Text): Boolean = { | ||
| if (readUntilStartElement()) { | ||
| try { | ||
|
|
@@ -189,9 +192,24 @@ private[xml] class XmlRecordReader extends RecordReader[LongWritable, Text] { | |
| false | ||
| } | ||
|
|
||
| private def checkEmptyTag(currentLetter: Int, position: Int): Boolean = { | ||
| private def checkEmptyTag(currentLetter: Int, position: Int, | ||
| buffer: DataOutputBuffer): Boolean = { | ||
| def checkStartTagBefore = { | ||
| val startAngleInByte = '<'.toByte | ||
| val spaceInByte = ' '.toByte | ||
| val rootTagName = buffer.getData | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why define a method here?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because it's used only here.
maybe, but this code run only in rare situations
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let me think about it and maybe try a different approach. I think we just need to look for a self-close tag that can only come before any other tag close.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (BTW here I meant, why define a method at all? it's only used here, as you say) |
||
| .reverse | ||
| .takeWhile(_ != startAngleInByte) | ||
| .reverse | ||
| .takeWhile(_ != spaceInByte) | ||
| val result = startAngleInByte +: rootTagName | ||
|
|
||
| result.sameElements(startTag.dropRight(1)) | ||
| } | ||
|
|
||
| if (position >= endEmptyTag.length) false | ||
| else currentLetter == endEmptyTag(position) | ||
| else currentLetter == endEmptyTag(position) && | ||
| checkStartTagBefore | ||
| } | ||
|
|
||
| private def readUntilEndElement(): Boolean = { | ||
|
|
@@ -207,7 +225,7 @@ private[xml] class XmlRecordReader extends RecordReader[LongWritable, Text] { | |
| } else { | ||
| buffer.write(rb) | ||
| val b = rb.toByte | ||
| if (b == startTag(si) && (b == endTag(ei) || checkEmptyTag(b, ei))) { | ||
| if (b == startTag(si) && (b == endTag(ei) || checkEmptyTag(b, ei, buffer))) { | ||
| // In start tag or end tag. | ||
| si += 1 | ||
| ei += 1 | ||
|
|
@@ -222,9 +240,9 @@ private[xml] class XmlRecordReader extends RecordReader[LongWritable, Text] { | |
| si += 1 | ||
| ei = 0 | ||
| } | ||
| } else if (b == endTag(ei) || checkEmptyTag(b, ei)) { | ||
| } else if (b == endTag(ei) || checkEmptyTag(b, ei, buffer)) { | ||
| if ((b == endTag(ei) && ei >= endTag.length - 1) || | ||
| (checkEmptyTag(b, ei) && ei >= endEmptyTag.length - 1)) { | ||
| (checkEmptyTag(b, ei, buffer) && ei >= endEmptyTag.length - 1)) { | ||
| if (depth == 0) { | ||
| // Found closing end tag. | ||
| return true | ||
|
|
@@ -253,7 +271,7 @@ private[xml] class XmlRecordReader extends RecordReader[LongWritable, Text] { | |
| private def checkAttributes(current: Int): Boolean = { | ||
| var len = 0 | ||
| var b = current | ||
| while(len < space.length && b == space(len)) { | ||
| while (len < space.length && b == space(len)) { | ||
| len += 1 | ||
| if (len >= space.length) { | ||
| currentStartTag = startTag.take(startTag.length - angleBracket.length) ++ space | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| <ROWSET> | ||
| <ROW> | ||
| <non-empty-tag>1</non-empty-tag> | ||
| <self-closing-tag/> | ||
| </ROW> | ||
| </ROWSET> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Likewise this needs to be reverted
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can't revert this change, because goal
scalacheckstylewill fail on it.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The line isn't too long if you don't add the deep continuation indent. The code passed style checks before.