elastic · cbuescher · Mar 7, 2023 · Feb 3, 2022 · Mar 6, 2023 · Mar 6, 2023
diff --git a/docs/changelog/94322.yaml b/docs/changelog/94322.yaml
@@ -0,0 +1,6 @@
+pr: 94322
+summary: Enable `_terms_enum` on `ip` fields
+area: Mapping
+type: enhancement
+issues:
+ - 89933
diff --git a/docs/reference/search/terms-enum.asciidoc b/docs/reference/search/terms-enum.asciidoc
@@ -6,8 +6,8 @@
 
 The terms enum API can be used to discover terms in the index that match
 a partial string. Supported field types are <<keyword-field-type,`keyword`>>,
-<<constant-keyword-field-type,`constant_keyword`>>, <<flattened,`flattened`>>
-and <<version, `version`>>. This is used for auto-complete:
+<<constant-keyword-field-type,`constant_keyword`>>, <<flattened,`flattened`>>,
+<<version, `version`>> and <<ip, `ip`>>. This is used for auto-complete:
 
 [source,console]
 --------------------------------------------------

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/IpFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/IpFieldMapper.java
@@ -12,12 +12,16 @@
 import org.apache.lucene.document.InetAddressPoint;
 import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.document.StoredField;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.IndexOrDocValuesQuery;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.PointRangeQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.logging.DeprecationCategory;
 import org.elasticsearch.common.logging.DeprecationLogger;
@@ -49,7 +53,11 @@
 import java.util.Objects;
 import java.util.function.BiFunction;
 
-/** A {@link FieldMapper} for ip addresses. */
+import static org.elasticsearch.index.mapper.IpPrefixAutomatonUtil.buildIpPrefixAutomaton;
+
+/**
+ * A {@link FieldMapper} for ip addresses.
+ */
 public class IpFieldMapper extends FieldMapper {
 
     private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(IpFieldMapper.class);
@@ -417,6 +425,31 @@ public DocValueFormat docValueFormat(@Nullable String format, ZoneId timeZone) {
             return DocValueFormat.IP;
         }
 
+        @Override
+        public TermsEnum getTerms(IndexReader reader, String prefix, boolean caseInsensitive, String searchAfter) throws IOException {
+            // terms_enum for ip fields is served from doc values, without them nothing can be enumerated
+            final Terms docValueTerms = hasDocValues() ? SortedSetDocValuesTerms.getTerms(reader, name()) : null;
+            if (docValueTerms == null) {
+                // doc values are disabled or SortedSetDocValuesTerms found nothing for this field
+                return null;
+            }
+            // NOTE(review): InetAddress.getByName also resolves host names; confirm searchAfter is always an ip literal here
+            BytesRef startAfter = null;
+            if (searchAfter != null) {
+                startAfter = new BytesRef(InetAddressPoint.encode(InetAddress.getByName(searchAfter)));
+            }
+            final CompiledAutomaton automaton = buildIpPrefixAutomaton(prefix);
+            if (automaton.type != CompiledAutomaton.AUTOMATON_TYPE.ALL) {
+                return docValueTerms.intersect(automaton, startAfter);
+            }
+            // the automaton accepts every term, so iterate them all and only apply searchAfter if one was given
+            final TermsEnum allTerms = docValueTerms.iterator();
+            if (searchAfter == null) {
+                return allTerms;
+            }
+            return new SearchAfterTermsEnum(allTerms, startAfter);
+        }
+
         /**
          * @return true if field has been marked as a dimension field
          */

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/IpPrefixAutomatonUtil.java b/server/src/main/java/org/elasticsearch/index/mapper/IpPrefixAutomatonUtil.java
@@ -0,0 +1,229 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.MinimizationOperations;
+import org.apache.lucene.util.automaton.Operations;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.IntStream;
+
+import static org.apache.lucene.util.automaton.Operations.concatenate;
+
+/**
+ * This class contains utility functionality to build an Automaton based
+ * on a prefix String on an `ip` field.
+ */
+public class IpPrefixAutomatonUtil {
+
+    private static final Automaton EMPTY_AUTOMATON = Automata.makeEmpty();
+    // twelve bytes (ten zero bytes followed by two 0xff bytes) that are put in front of the ipv4 automaton of a prefix
+    private static final Automaton IPV4_PREFIX = Automata.makeBinary(new BytesRef(new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1 }));
+
+    /**
+     * Lookup from a number in 0..255 to the automaton accepting every byte value whose decimal notation starts with that number.
+     */
+    static final Map<Integer, Automaton> INCOMPLETE_IP4_GROUP_AUTOMATON_LOOKUP = new HashMap<>();
+    static {
+        // precompute all 256 entries when the class is initialized
+        for (int number = 0; number <= 255; number++) {
+            Automaton candidates = Automata.makeChar(number);
+            if (number >= 1 && number <= 25) {
+                // one more digit still gives a byte value, e.g. 1 -> [10..19]; the range of 25 is cut off at 255
+                final int lowest = number * 10;
+                candidates = Operations.union(candidates, Automata.makeCharRange(lowest, Math.min(lowest + 9, 255)));
+            }
+            // only 1 and 2 can be followed by two further digits without exceeding 255
+            if (number == 1) {
+                candidates = Operations.union(candidates, Automata.makeCharRange(100, 199));
+            } else if (number == 2) {
+                candidates = Operations.union(candidates, Automata.makeCharRange(200, 255));
+            }
+            INCOMPLETE_IP4_GROUP_AUTOMATON_LOOKUP.put(number, candidates);
+        }
+    }
+
+    /**
+     * Compiles an ip prefix into a {@link CompiledAutomaton}. A non-empty prefix is read both as an ipv4 and as an ipv6
+     * prefix and the union of both automata is compiled, an empty prefix yields an automaton that accepts any input.
+     */
+    static CompiledAutomaton buildIpPrefixAutomaton(String ipPrefix) {
+        final Automaton accepted;
+        if (ipPrefix.isEmpty()) {
+            accepted = Automata.makeAnyBinary();
+        } else {
+            Automaton fromIp4 = createIp4Automaton(ipPrefix);
+            if (fromIp4 != null) {
+                // the four ipv4 address bytes are only matched behind the constant IPV4_PREFIX bytes
+                fromIp4 = concatenate(IPV4_PREFIX, fromIp4);
+            }
+            accepted = Operations.union(fromIp4, getIpv6Automaton(ipPrefix));
+        }
+        final Automaton minimized = MinimizationOperations.minimize(accepted, Integer.MAX_VALUE);
+        return new CompiledAutomaton(minimized, null, false, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, true);
+    }
+
+    private static Automaton getIpv6Automaton(String ipPrefix) {
+        final List<String> blocks = parseIp6Prefix(ipPrefix);
+        if (blocks.isEmpty()) {
+            // the prefix could not be split into ipv6 blocks, so its ipv6 reading matches nothing
+            return EMPTY_AUTOMATON;
+        }
+        Automaton automaton = Automata.makeString("");
+        int groupCount = 0;
+        for (String block : blocks) {
+            if (block.contains(".")) {
+                // a dotted ipv4 tail is only accepted behind at least one ipv6 specific block
+                if (groupCount == 0) {
+                    return EMPTY_AUTOMATON;
+                }
+                automaton = concatenate(automaton, createIp4Automaton(block));
+                groupCount += 2; // the four ipv4 bytes equal two ipv6 groups, createIp4Automaton pads missing bytes itself
+            } else if (block.equals(":")) {
+                // a lone colon denotes left out zeros, i.e. any number of zero bytes
+                automaton = concatenate(automaton, Operations.repeat(Automata.makeChar(0)));
+                groupCount++;
+            } else if (block.endsWith(":")) {
+                // complete block: drop the colon and left-pad the digits with zeros to four hex digits
+                final String digits = block.substring(0, block.length() - 1);
+                automaton = concatenate(automaton, automatonFromIPv6Group(padWithZeros(digits, 4 - digits.length())));
+                groupCount++;
+            } else {
+                // partial block, expands to all byte sequences it could be the beginning of
+                automaton = concatenate(automaton, automatonFromIPv6Group(block));
+                groupCount++;
+            }
+        }
+        // each group accounts for two of the 16 address bytes, the rest is matched by optional wildcard bytes
+        final int wildcardBytes = 16 - groupCount * 2;
+        for (int i = 0; i < wildcardBytes; i++) {
+            automaton = concatenate(automaton, Operations.optional(Automata.makeCharRange(0, 255)));
+        }
+        return automaton;
+    }
+
+    static Automaton automatonFromIPv6Group(String ipv6Group) {
+        assert ipv6Group.length() > 0 && ipv6Group.length() <= 4 : "expected a full ipv6 group or prefix";
+        Automaton variants = Automata.makeString("");
+        // try every number of leading zeros that still fits into the four hex digits of a group
+        for (int zeros = 0; zeros <= 4 - ipv6Group.length(); zeros++) {
+            final String hex = padWithZeros(ipv6Group, zeros);
+            Automaton variant = Automata.makeString("");
+            for (int pair = 0; pair + 2 <= hex.length(); pair += 2) {
+                variant = concatenate(variant, Automata.makeChar(Integer.parseInt(hex.substring(pair, pair + 2), 16)));
+            }
+            if (hex.length() % 2 == 1) {
+                // a single trailing digit d matches the byte 0x0d as well as every byte 0xd0..0xdf
+                final int digit = Integer.parseInt(hex.substring(hex.length() - 1), 16);
+                Automaton lastByte = Operations.union(Automata.makeChar(digit), Automata.makeCharRange(digit * 16, digit * 16 + 15));
+                variant = concatenate(variant, lastByte);
+            }
+            if (hex.length() <= 2) {
+                // one or two hex digits only cover a single byte, the second byte of the group can then be anything
+                variant = concatenate(variant, Automata.makeCharRange(0, 255));
+            }
+            variants = Operations.union(variants, variant);
+        }
+        return variants;
+    }
+
+    private static final Pattern IPV4_GROUP_MATCHER = Pattern.compile(
+        "^((?:0|[1-9][0-9]{0,2})\\.)?" + "((?:0|[1-9][0-9]{0,2})\\.)?" + "((?:0|[1-9][0-9]{0,2})\\.)?" + "((?:0|[1-9][0-9]{0,2}))?$"
+    ); // groups 1-3: up to three complete "number." groups, group 4: an optional trailing number without a dot
+
+    /**
+     * Creates an {@link Automaton} that accepts the byte representation of all ipv4 addresses that start with the given
+     * prefix. A trailing number without a dot is treated as incomplete, e.g. "10.1" covers 10.1.x.x, 10.10.x.x to 10.19.x.x
+     * and 10.100.x.x to 10.199.x.x. If the prefix is not a valid ipv4 prefix, an automaton accepting the empty language is returned.
+     */
+    static Automaton createIp4Automaton(String prefix) {
+        Matcher ip4Matcher = IPV4_GROUP_MATCHER.matcher(prefix);
+        if (ip4Matcher.matches() == false) {
+            return EMPTY_AUTOMATON;
+        }
+        int prefixBytes = 0;
+        byte[] completeByteGroups = new byte[4];
+        int completeBytes = 0;
+        // the regex yields up to three complete groups (1-3) and one trailing group without a dot (4)
+        Automaton incompleteGroupAutomaton = Automata.makeString("");
+        for (int g = 1; g <= 4; g++) {
+            String group = ip4Matcher.group(g);
+            // groups that are missing from the prefix are null and simply skipped
+            if (group != null) {
+                if (group.endsWith(".")) {
+                    // complete group: its value becomes one fixed byte of the prefix
+                    int value = Integer.parseInt(group.substring(0, group.length() - 1));
+                    if (value < 0 || value > 255) {
+                        // no byte can hold this value, so the prefix cannot match any ipv4 address
+                        return EMPTY_AUTOMATON;
+                    } else {
+                        completeByteGroups[completeBytes] = (byte) value;
+                        completeBytes++;
+                        prefixBytes++;
+                    }
+                } else {
+                    // trailing group without a dot, possibly incomplete; 255 is a valid value and has a lookup entry too
+                    int numberPrefix = Integer.parseInt(group);
+                    if (numberPrefix <= 255) {
+                        incompleteGroupAutomaton = INCOMPLETE_IP4_GROUP_AUTOMATON_LOOKUP.get(numberPrefix);
+                        prefixBytes++;
+                    } else {
+                        // a number above 255 cannot be the start of a valid ipv4 group
+                        return EMPTY_AUTOMATON;
+                    }
+                }
+            }
+        }
+        return concatenate(
+            List.of(
+                Automata.makeBinary(new BytesRef(completeByteGroups, 0, completeBytes)),
+                incompleteGroupAutomaton,
+                Operations.repeat(Automata.makeCharRange(0, 255), 4 - prefixBytes, 4 - prefixBytes)
+            )
+        );
+    }
+
+    private static String padWithZeros(String input, int leadingZeros) {
+        return "0".repeat(leadingZeros) + input; // e.g. ("1a", 2) -> "001a"
+    }
+
+    private static final Pattern IP6_BLOCK_MATCHER = Pattern.compile(
+        "([a-f0-9]{0,4}:)|([a-f0-9]{1,4}$)" // group 1: block ending in a colon, group 2: last block without a colon
+            + "|((?:(?:0|[1-9][0-9]{0,2})\\.){1,3}(?:0|[1-9][0-9]{0,2})?$)" // group 3: the optional ipv4 part at the end of the input
+    );
+
+    /** Splits an ipv6 prefix into its consecutive blocks, or returns an empty list if the input does not consist of such blocks only. */
+    static List<String> parseIp6Prefix(String ip6Prefix) {
+        final List<String> blocks = new ArrayList<>();
+        final Matcher blockMatcher = IP6_BLOCK_MATCHER.matcher(ip6Prefix);
+        int consumed = 0;
+        // keep searching from the end of the previous block until no further block is found
+        while (blockMatcher.find(consumed)) {
+            if (blockMatcher.start() != consumed) {
+                // input had to be skipped to find this block, so the prefix is not a plain sequence of blocks
+                return Collections.emptyList();
+            }
+            consumed = blockMatcher.end();
+            // the pattern is an alternation of three capture groups, collect whichever of them took part in the match
+            IntStream.rangeClosed(1, 3).mapToObj(idx -> blockMatcher.group(idx)).filter(block -> block != null).forEach(blocks::add);
+        }
+        // leftover input that no block covers means there is no full match, which yields an empty list
+        return consumed == ip6Prefix.length() ? blocks : Collections.emptyList();
+    }
+}