Skip to content

Commit d802136

Browse files
authored
Enable _terms_enum on ip fields (#94322)
The _terms_enum API currently does not support ip fields. However, type-ahead-like completion is useful for UI purposes. This change adds the ability to query ip fields via the _terms_enum API by leveraging the terms enumeration available when doc_values are enabled on the field, which is the default. In order to make prefix filtering fast, we internally create a fast prefix automaton from the user-supplied prefix that gets intersected with the shard's terms enumeration, similar to what we do for keyword fields already. Closes #89933
1 parent 706d065 commit d802136

File tree

8 files changed

+802
-110
lines changed

8 files changed

+802
-110
lines changed

docs/changelog/94322.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 94322
2+
summary: Enable `_terms_enum` on `ip` fields
3+
area: Mapping
4+
type: enhancement
5+
issues:
6+
- 89933

docs/reference/search/terms-enum.asciidoc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66

77
The terms enum API can be used to discover terms in the index that match
88
a partial string. Supported field types are <<keyword-field-type,`keyword`>>,
9-
<<constant-keyword-field-type,`constant_keyword`>>, <<flattened,`flattened`>>
10-
and <<version, `version`>>. This is used for auto-complete:
9+
<<constant-keyword-field-type,`constant_keyword`>>, <<flattened,`flattened`>>,
10+
<<version, `version`>> and <<ip, `ip`>>. This is used for auto-complete:
1111

1212
[source,console]
1313
--------------------------------------------------

server/src/main/java/org/elasticsearch/index/mapper/IpFieldMapper.java

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,16 @@
1212
import org.apache.lucene.document.InetAddressPoint;
1313
import org.apache.lucene.document.SortedSetDocValuesField;
1414
import org.apache.lucene.document.StoredField;
15+
import org.apache.lucene.index.IndexReader;
1516
import org.apache.lucene.index.LeafReaderContext;
17+
import org.apache.lucene.index.Terms;
18+
import org.apache.lucene.index.TermsEnum;
1619
import org.apache.lucene.search.IndexOrDocValuesQuery;
1720
import org.apache.lucene.search.MatchNoDocsQuery;
1821
import org.apache.lucene.search.PointRangeQuery;
1922
import org.apache.lucene.search.Query;
2023
import org.apache.lucene.util.BytesRef;
24+
import org.apache.lucene.util.automaton.CompiledAutomaton;
2125
import org.elasticsearch.Version;
2226
import org.elasticsearch.common.logging.DeprecationCategory;
2327
import org.elasticsearch.common.logging.DeprecationLogger;
@@ -49,7 +53,11 @@
4953
import java.util.Objects;
5054
import java.util.function.BiFunction;
5155

52-
/** A {@link FieldMapper} for ip addresses. */
56+
import static org.elasticsearch.index.mapper.IpPrefixAutomatonUtil.buildIpPrefixAutomaton;
57+
58+
/**
59+
* A {@link FieldMapper} for ip addresses.
60+
*/
5361
public class IpFieldMapper extends FieldMapper {
5462

5563
private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(IpFieldMapper.class);
@@ -417,6 +425,31 @@ public DocValueFormat docValueFormat(@Nullable String format, ZoneId timeZone) {
417425
return DocValueFormat.IP;
418426
}
419427

428+
@Override
429+
public TermsEnum getTerms(IndexReader reader, String prefix, boolean caseInsensitive, String searchAfter) throws IOException {
430+
431+
Terms terms = null;
432+
// terms_enum for ip only works if doc values are enabled
433+
if (hasDocValues()) {
434+
terms = SortedSetDocValuesTerms.getTerms(reader, name());
435+
}
436+
if (terms == null) {
437+
// Field does not exist on this shard.
438+
return null;
439+
}
440+
BytesRef searchBytes = searchAfter == null ? null : new BytesRef(InetAddressPoint.encode(InetAddress.getByName(searchAfter)));
441+
CompiledAutomaton prefixAutomaton = buildIpPrefixAutomaton(prefix);
442+
443+
if (prefixAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.ALL) {
444+
TermsEnum result = terms.iterator();
445+
if (searchAfter != null) {
446+
result = new SearchAfterTermsEnum(result, searchBytes);
447+
}
448+
return result;
449+
}
450+
return terms.intersect(prefixAutomaton, searchBytes);
451+
}
452+
420453
/**
421454
* @return true if field has been marked as a dimension field
422455
*/
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0 and the Server Side Public License, v 1; you may not use this file except
5+
* in compliance with, at your election, the Elastic License 2.0 or the Server
6+
* Side Public License, v 1.
7+
*/
8+
9+
package org.elasticsearch.index.mapper;
10+
11+
import org.apache.lucene.util.BytesRef;
12+
import org.apache.lucene.util.automaton.Automata;
13+
import org.apache.lucene.util.automaton.Automaton;
14+
import org.apache.lucene.util.automaton.CompiledAutomaton;
15+
import org.apache.lucene.util.automaton.MinimizationOperations;
16+
import org.apache.lucene.util.automaton.Operations;
17+
18+
import java.util.ArrayList;
19+
import java.util.Collections;
20+
import java.util.HashMap;
21+
import java.util.List;
22+
import java.util.Map;
23+
import java.util.regex.Matcher;
24+
import java.util.regex.Pattern;
25+
import java.util.stream.IntStream;
26+
27+
import static org.apache.lucene.util.automaton.Operations.concatenate;
28+
29+
/**
30+
* This class contains utility functionality to build an Automaton based
31+
* on a prefix String on an `ip` field.
32+
*/
33+
public class IpPrefixAutomatonUtil {
34+
35+
private static final Automaton EMPTY_AUTOMATON = Automata.makeEmpty();
36+
private static final Automaton IPV4_PREFIX = Automata.makeBinary(new BytesRef(new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1 }));
37+
38+
static final Map<Integer, Automaton> INCOMPLETE_IP4_GROUP_AUTOMATON_LOOKUP = new HashMap<>();
39+
static {
40+
for (int c = 0; c <= 255; c++) {
41+
Automaton a = Automata.makeChar(c);
42+
if (c > 0 && c < 10) {
43+
// all one digit prefixes expand to the two digit range, i.e. 1 -> [10..19]
44+
a = Operations.union(a, Automata.makeCharRange(c * 10, c * 10 + 9));
45+
// 1 and 2 even to three digit ranges
46+
if (c == 1) {
47+
a = Operations.union(a, Automata.makeCharRange(100, 199));
48+
}
49+
if (c == 2) {
50+
a = Operations.union(a, Automata.makeCharRange(200, 255));
51+
}
52+
}
53+
if (c >= 10 && c < 26) {
54+
int min = c * 10;
55+
int max = Math.min(c * 10 + 9, 255);
56+
a = Operations.union(a, Automata.makeCharRange(min, max));
57+
}
58+
INCOMPLETE_IP4_GROUP_AUTOMATON_LOOKUP.put(c, a);
59+
}
60+
}
61+
62+
/**
63+
* Create a {@link CompiledAutomaton} from the ip Prefix.
64+
* If the prefix is empty, the automaton returned will accept everything.
65+
*/
66+
static CompiledAutomaton buildIpPrefixAutomaton(String ipPrefix) {
67+
Automaton result;
68+
if (ipPrefix.isEmpty() == false) {
69+
Automaton ipv4Automaton = createIp4Automaton(ipPrefix);
70+
if (ipv4Automaton != null) {
71+
ipv4Automaton = concatenate(IPV4_PREFIX, ipv4Automaton);
72+
}
73+
Automaton ipv6Automaton = getIpv6Automaton(ipPrefix);
74+
result = Operations.union(ipv4Automaton, ipv6Automaton);
75+
} else {
76+
result = Automata.makeAnyBinary();
77+
}
78+
result = MinimizationOperations.minimize(result, Integer.MAX_VALUE);
79+
return new CompiledAutomaton(result, null, false, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, true);
80+
}
81+
82+
private static Automaton getIpv6Automaton(String ipPrefix) {
83+
Automaton ipv6Automaton = EMPTY_AUTOMATON;
84+
List<String> ip6Groups = parseIp6Prefix(ipPrefix);
85+
if (ip6Groups.isEmpty() == false) {
86+
ipv6Automaton = Automata.makeString("");
87+
int groupsAdded = 0;
88+
for (String group : ip6Groups) {
89+
if (group.contains(".")) {
90+
// try to parse this as ipv4 ending part, but only if we already have some ipv6 specific stuff in front
91+
if (groupsAdded > 0) {
92+
ipv6Automaton = concatenate(ipv6Automaton, createIp4Automaton(group));
93+
groupsAdded += 2; // this counts as two bytes, missing bytes are padded already
94+
} else {
95+
return EMPTY_AUTOMATON;
96+
}
97+
} else if (group.endsWith(":")) {
98+
groupsAdded++;
99+
// full block
100+
if (group.length() > 1) {
101+
group = group.substring(0, group.length() - 1);
102+
ipv6Automaton = concatenate(ipv6Automaton, automatonFromIPv6Group(padWithZeros(group, 4 - group.length())));
103+
} else {
104+
// single colon denotes left out zeros
105+
ipv6Automaton = concatenate(ipv6Automaton, Operations.repeat(Automata.makeChar(0)));
106+
}
107+
} else {
108+
groupsAdded++;
109+
// partial block, we need to create all possibilities of byte sequences this could match
110+
ipv6Automaton = concatenate(ipv6Automaton, automatonFromIPv6Group(group));
111+
}
112+
}
113+
// fill up the remainder of the 16 address bytes with wildcard matches, each group added so far counts for two bytes
114+
for (int i = 0; i < 16 - groupsAdded * 2; i++) {
115+
ipv6Automaton = concatenate(ipv6Automaton, Operations.optional(Automata.makeCharRange(0, 255)));
116+
}
117+
}
118+
return ipv6Automaton;
119+
}
120+
121+
static Automaton automatonFromIPv6Group(String ipv6Group) {
122+
assert ipv6Group.length() > 0 && ipv6Group.length() <= 4 : "expected a full ipv6 group or prefix";
123+
Automaton result = Automata.makeString("");
124+
for (int leadingZeros = 0; leadingZeros <= 4 - ipv6Group.length(); leadingZeros++) {
125+
int bytesAdded = 0;
126+
String padded = padWithZeros(ipv6Group, leadingZeros);
127+
Automaton a = Automata.makeString("");
128+
while (padded.length() >= 2) {
129+
a = concatenate(a, Automata.makeChar(Integer.parseInt(padded.substring(0, 2), 16)));
130+
padded = padded.substring(2);
131+
bytesAdded++;
132+
}
133+
if (padded.length() == 1) {
134+
int value = Integer.parseInt(padded, 16);
135+
a = concatenate(a, Operations.union(Automata.makeChar(value), Automata.makeCharRange(value * 16, value * 16 + 15)));
136+
bytesAdded++;
137+
}
138+
if (bytesAdded != 2) {
139+
a = concatenate(a, Automata.makeCharRange(0, 255));
140+
}
141+
result = Operations.union(result, a);
142+
}
143+
return result;
144+
}
145+
146+
private static Pattern IPV4_GROUP_MATCHER = Pattern.compile(
147+
"^((?:0|[1-9][0-9]{0,2})\\.)?" + "((?:0|[1-9][0-9]{0,2})\\.)?" + "((?:0|[1-9][0-9]{0,2})\\.)?" + "((?:0|[1-9][0-9]{0,2}))?$"
148+
);
149+
150+
/**
151+
* Creates an {@link Automaton} that accepts all ipv4 address byte representation
152+
* that start with the given prefix. If the prefix is no valid ipv4 prefix, an automaton
153+
* that accepts the empty language is returned.
154+
*/
155+
static Automaton createIp4Automaton(String prefix) {
156+
Matcher ip4Matcher = IPV4_GROUP_MATCHER.matcher(prefix);
157+
if (ip4Matcher.matches() == false) {
158+
return EMPTY_AUTOMATON;
159+
}
160+
int prefixBytes = 0;
161+
byte[] completeByteGroups = new byte[4];
162+
int completeBytes = 0;
163+
// scan the groups the prefix matches
164+
Automaton incompleteGroupAutomaton = Automata.makeString("");
165+
for (int g = 1; g <= 4; g++) {
166+
String group = ip4Matcher.group(g);
167+
// note that intermediate groups might not match anything and can be empty
168+
if (group != null) {
169+
if (group.endsWith(".")) {
170+
// complete group found
171+
int value = Integer.parseInt(group.substring(0, group.length() - 1));
172+
if (value < 0 || value > 255) {
173+
// invalid value, append the empty result to the current one to make it match nothing
174+
return EMPTY_AUTOMATON;
175+
} else {
176+
completeByteGroups[completeBytes] = (byte) value;
177+
completeBytes++;
178+
prefixBytes++;
179+
}
180+
} else {
181+
// if present, this is the last group
182+
int numberPrefix = Integer.parseInt(group);
183+
if (numberPrefix < 255) {
184+
incompleteGroupAutomaton = INCOMPLETE_IP4_GROUP_AUTOMATON_LOOKUP.get(numberPrefix);
185+
prefixBytes++;
186+
} else {
187+
// this cannot be a valid ip4 groups
188+
return EMPTY_AUTOMATON;
189+
}
190+
}
191+
}
192+
}
193+
return concatenate(
194+
List.of(
195+
Automata.makeBinary(new BytesRef(completeByteGroups, 0, completeBytes)),
196+
incompleteGroupAutomaton,
197+
Operations.repeat(Automata.makeCharRange(0, 255), 4 - prefixBytes, 4 - prefixBytes)
198+
)
199+
);
200+
}
201+
202+
private static String padWithZeros(String input, int leadingZeros) {
203+
return new StringBuilder("0".repeat(leadingZeros)).append(input).toString();
204+
}
205+
206+
private static Pattern IP6_BLOCK_MATCHER = Pattern.compile(
207+
"([a-f0-9]{0,4}:)|([a-f0-9]{1,4}$)" // the ipv6 specific notation
208+
+ "|((?:(?:0|[1-9][0-9]{0,2})\\.){1,3}(?:0|[1-9][0-9]{0,2})?$)" // the optional ipv4 part
209+
);
210+
211+
static List<String> parseIp6Prefix(String ip6Prefix) {
212+
Matcher ip6blockMatcher = IP6_BLOCK_MATCHER.matcher(ip6Prefix);
213+
int position = 0;
214+
List<String> groups = new ArrayList<>();
215+
while (ip6blockMatcher.find(position)) {
216+
if (ip6blockMatcher.start() == position) {
217+
position = ip6blockMatcher.end();
218+
IntStream.rangeClosed(1, 3).mapToObj(i -> ip6blockMatcher.group(i)).filter(s -> s != null).forEach(groups::add);
219+
} else {
220+
return Collections.emptyList();
221+
}
222+
}
223+
if (position != ip6Prefix.length()) {
224+
// no full match, return empty list
225+
return Collections.emptyList();
226+
}
227+
return groups;
228+
}
229+
}

0 commit comments

Comments
 (0)