Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,10 @@
* of keys.
*
* Currently the mapper extracts all leaf values of the JSON object, converts them to their text
* representations, and indexes each one as a keyword. As an example, given a json field called
* 'json_field' and the following input
* representations, and indexes each one as a keyword. It creates both a 'keyed' version of the token
* to allow searches on particular key-value pairs, as well as a 'root' token without the key.
*
* As an example, given a json field called 'json_field' and the following input
*
* {
* "json_field": {
Expand All @@ -63,13 +65,18 @@
* }
* }
*
* the mapper will produce untokenized string fields with the values "some value" and "true".
* the mapper will produce untokenized string fields called "json_field" with values "some value" and "true",
* as well as string fields called "json_field._keyed" with values "key\0some value" and "key2.key3\0true".
*
* Note that \0 is a reserved separator character, and cannot be used in the keys of the JSON object
* (see {@link JsonFieldParser#SEPARATOR}).
*/
public final class JsonFieldMapper extends FieldMapper {

public static final String CONTENT_TYPE = "json";
public static final NamedAnalyzer WHITESPACE_ANALYZER = new NamedAnalyzer(
"whitespace", AnalyzerScope.INDEX, new WhitespaceAnalyzer());
public static final String KEYED_FIELD_SUFFIX = "._keyed";

private static class Defaults {
public static final MappedFieldType FIELD_TYPE = new JsonFieldType();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,53 +31,116 @@

/**
* A helper class for {@link JsonFieldMapper} that parses a JSON object
* and produces an indexable field for each leaf value.
* and produces a pair of indexable fields for each leaf value.
*/
public class JsonFieldParser {
private static final String SEPARATOR = "\0";
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The null character \0 seemed like a reasonable choice for a separator, as (1) it shouldn’t show up too often in field keys, and (2) there is already precedent for it, as we use it when storing percolator queries (PercolatorFieldMapper#FIELD_VALUE_SEPARATOR).


private final MappedFieldType fieldType;
private final int ignoreAbove;

private final String rootFieldName;
private final String keyedFieldName;

JsonFieldParser(MappedFieldType fieldType,
int ignoreAbove) {
this.fieldType = fieldType;
this.ignoreAbove = ignoreAbove;

this.rootFieldName = fieldType.name();
this.keyedFieldName = fieldType.name() + JsonFieldMapper.KEYED_FIELD_SUFFIX;
}

/**
 * Parses the JSON object the parser is currently positioned on and collects the
 * fields produced for it.
 *
 * The parser's current token is first checked against {@code START_OBJECT} via
 * {@link XContentParserUtils#ensureExpectedToken}; the object is then walked by
 * {@code parseObject}, starting from an empty {@link ContentPath}.
 *
 * @param parser the parser to read tokens from
 * @return the fields added while walking the object, in the order they were added
 * @throws IOException if reading from the parser fails
 */
public List<IndexableField> parse(XContentParser parser) throws IOException {
    XContentParserUtils.ensureExpectedToken(XContentParser.Token.START_OBJECT,
        parser.currentToken(),
        parser::getTokenLocation);

    ContentPath path = new ContentPath();
    List<IndexableField> fields = new ArrayList<>();

    parseObject(parser, path, fields);
    return fields;
}

private void parseObject(XContentParser parser,
ContentPath path,
List<IndexableField> fields) throws IOException {
String currentName = null;
while (true) {
if (openObjects == 0) {
return fields;
XContentParser.Token token = parser.nextToken();
if (token == XContentParser.Token.END_OBJECT) {
return;
}

if (token == XContentParser.Token.FIELD_NAME) {
currentName = parser.currentName();
} else {
assert currentName != null;
parseFieldValue(token, parser, path, currentName, fields);
}
}
}

private void parseArray(XContentParser parser,
ContentPath path,
String currentName,
List<IndexableField> fields) throws IOException {
while (true) {
XContentParser.Token token = parser.nextToken();
assert token != null;

if (token == XContentParser.Token.START_OBJECT) {
openObjects++;
} else if (token == XContentParser.Token.END_OBJECT) {
openObjects--;
} else if (token.isValue()) {
String value = parser.text();
addField(value, fields);
} else if (token == XContentParser.Token.VALUE_NULL) {
String value = fieldType.nullValueAsString();
if (value != null) {
addField(value, fields);
}
if (token == XContentParser.Token.END_ARRAY) {
return;
}
parseFieldValue(token, parser, path, currentName, fields);
}
}

/**
 * Handles a single value token found under the key {@code currentName}.
 *
 * Objects and arrays are walked recursively via parseObject and parseArray; value
 * tokens are indexed using their text; nulls are indexed only if the field type
 * supplies a replacement string. Any other token is rejected.
 *
 * @param token       the token to handle
 * @param parser      the parser the token was read from
 * @param path        the path of enclosing object keys
 * @param currentName the key this value belongs to
 * @param fields      the list that produced fields are appended to
 * @throws IOException              if reading from the parser fails
 * @throws IllegalArgumentException if the token matches none of the handled kinds
 */
private void parseFieldValue(XContentParser.Token token,
XContentParser parser,
ContentPath path,
String currentName,
List<IndexableField> fields) throws IOException {
if (token == XContentParser.Token.START_OBJECT) {
// Push the key while walking the nested object, then pop it again afterwards.
path.add(currentName);
parseObject(parser, path, fields);
path.remove();
} else if (token == XContentParser.Token.START_ARRAY) {
// The path is left unchanged for arrays; elements are handled under the array's key.
parseArray(parser, path, currentName, fields);
} else if (token.isValue()) {
String value = parser.text();
addField(path, currentName, value, fields);
} else if (token == XContentParser.Token.VALUE_NULL) {
// A null produces fields only when the field type provides a non-null replacement.
String value = fieldType.nullValueAsString();
if (value != null) {
addField(path, currentName, value, fields);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we throw an exception in an else block here? If we encounter something like an array of arrays we should probably reject the document rather than silently ignoring it? The same probably applies for parseObject above?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, I think this is actually a bug, since an array of arrays is valid JSON and should be accepted. Will fix.

I agree it's a good idea to add an else with an exception, so we'll fail fast when encountering something unexpected rather than attempt to proceed in a potentially wrong state.

} else {
// Note that we throw an exception here just to be safe. We don't actually expect to reach
// this case, since XContentParser verifies that the input is well-formed as it parses.
throw new IllegalArgumentException("Encountered unexpected token [" + token.toString() + "].");
}
}

private void addField(String value, List<IndexableField> fields) {
if (value.length() <= ignoreAbove) {
fields.add(new Field(fieldType.name(), new BytesRef(value), fieldType));
/**
 * Adds a pair of fields for one leaf value: a field named {@code rootFieldName}
 * holding the value itself, and a field named {@code keyedFieldName} holding the
 * key, the separator and the value joined together.
 *
 * Nothing is added when the value's {@code length()} exceeds {@code ignoreAbove};
 * only the value is measured, not the combined keyed token.
 *
 * @param path        the path of enclosing object keys
 * @param currentName the key the value was found under
 * @param value       the text to index
 * @param fields      the list the two fields are appended to
 * @throws IllegalArgumentException if the full key contains the reserved separator
 */
private void addField(ContentPath path,
String currentName,
String value,
List<IndexableField> fields) {
if (value.length() > ignoreAbove) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For prefixed values, the alternative option here would be to check the whole length of the prefixed token, as opposed to just the value. I think that this behavior is more intuitive (and I also don't think we're as concerned about field keys being really long, as opposed to values?)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, we should probably put in some kind of soft limit on the depth of these objects at some point and the ignore above plus that soft limit will give us an upper bound on the term lengths here anyway

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense to me, I'll make a note on the meta-issue to add a limit.

return;
}

// Full key for this value, as produced by ContentPath#pathAsText for the current name.
String key = path.pathAsText(currentName);
// The separator delimits key from value in the keyed token, so it must not appear in the key.
if (key.contains(SEPARATOR)) {
throw new IllegalArgumentException("Keys in [json] fields cannot contain the reserved character \\0."
+ " Offending key: [" + key + "].");
}
String keyedValue = createKeyedValue(key, value);

fields.add(new Field(rootFieldName, new BytesRef(value), fieldType));
fields.add(new Field(keyedFieldName, new BytesRef(keyedValue), fieldType));
}

/**
 * Builds the keyed token for a key-value pair.
 *
 * @param key   the key of the value
 * @param value the value's text
 * @return the key and the value joined by {@code SEPARATOR}
 */
private static String createKeyedValue(String key, String value) {
    return String.join(SEPARATOR, key, value);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -72,30 +72,31 @@ public void testDefaults() throws Exception {

BytesReference doc = BytesReference.bytes(XContentFactory.jsonBuilder().startObject()
.startObject("field")
.field("key1", "value")
.field("key2", true)
.field("key", "value")
.endObject()
.endObject());

ParsedDocument parsedDoc = mapper.parse(SourceToParse.source("test", "type", "1", doc, XContentType.JSON));

IndexableField[] fields = parsedDoc.rootDoc().getFields("field");
assertEquals(2, fields.length);
assertEquals(1, fields.length);

assertEquals("field", fields[0].name());
assertEquals(new BytesRef("value"), fields[0].binaryValue());
assertFalse(fields[0].fieldType().stored());
assertTrue(fields[0].fieldType().omitNorms());

IndexableField field1 = fields[0];
assertEquals("field", field1.name());
assertEquals(new BytesRef("value"), field1.binaryValue());
assertTrue(field1.fieldType().omitNorms());
IndexableField[] keyedFields = parsedDoc.rootDoc().getFields("field._keyed");
assertEquals(1, keyedFields.length);

IndexableField field2 = fields[1];
assertEquals("field", field2.name());
assertEquals(new BytesRef("true"), field2.binaryValue());
assertTrue(field2.fieldType().omitNorms());
assertEquals("field._keyed", keyedFields[0].name());
assertEquals(new BytesRef("key\0value"), keyedFields[0].binaryValue());
assertFalse(keyedFields[0].fieldType().stored());
assertTrue(keyedFields[0].fieldType().omitNorms());

IndexableField[] fieldNamesFields = parsedDoc.rootDoc().getFields(FieldNamesFieldMapper.NAME);
assertEquals(1, fieldNamesFields.length);

IndexableField fieldNamesField = fieldNamesFields[0];
assertEquals("field", fieldNamesField.stringValue());
assertEquals("field", fieldNamesFields[0].stringValue());
}

public void testDisableIndex() throws Exception {
Expand Down Expand Up @@ -248,20 +249,18 @@ public void testFieldMultiplicity() throws Exception {
.endObject());

ParsedDocument parsedDoc = mapper.parse(SourceToParse.source("test", "type", "1", doc, XContentType.JSON));

IndexableField[] fields = parsedDoc.rootDoc().getFields("field");
assertEquals(3, fields.length);

IndexableField field1 = fields[0];
assertEquals("field", field1.name());
assertEquals(new BytesRef("value"), field1.binaryValue());

IndexableField field2 = fields[1];
assertEquals("field", field2.name());
assertEquals(new BytesRef("true"), field2.binaryValue());

IndexableField field3 = fields[2];
assertEquals("field", field3.name());
assertEquals(new BytesRef("false"), field3.binaryValue());
assertEquals(new BytesRef("value"), fields[0].binaryValue());
assertEquals(new BytesRef("true"), fields[1].binaryValue());
assertEquals(new BytesRef("false"), fields[2].binaryValue());

IndexableField[] keyedFields = parsedDoc.rootDoc().getFields("field._keyed");
assertEquals(3, keyedFields.length);
assertEquals(new BytesRef("key1\0value"), keyedFields[0].binaryValue());
assertEquals(new BytesRef("key2\0true"), keyedFields[1].binaryValue());
assertEquals(new BytesRef("key3\0false"), keyedFields[2].binaryValue());
}

public void testIgnoreAbove() throws IOException {
Expand Down Expand Up @@ -292,7 +291,6 @@ public void testIgnoreAbove() throws IOException {
assertEquals(0, fields.length);
}


public void testNullValues() throws Exception {
String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject()
.startObject("type")
Expand Down Expand Up @@ -326,8 +324,11 @@ public void testNullValues() throws Exception {

IndexableField[] otherFields = parsedDoc.rootDoc().getFields("other_field");
assertEquals(1, otherFields.length);
IndexableField field = otherFields[0];
assertEquals(new BytesRef("placeholder"), field.binaryValue());
assertEquals(new BytesRef("placeholder"), otherFields[0].binaryValue());

IndexableField[] prefixedOtherFields = parsedDoc.rootDoc().getFields("other_field._keyed");
assertEquals(1, prefixedOtherFields.length);
assertEquals(new BytesRef("key\0placeholder"), prefixedOtherFields[0].binaryValue());
}

public void testSplitQueriesOnWhitespace() throws IOException {
Expand Down
Loading