From 22d8ee66df2cc662b5f63f75ced03349f7e2da13 Mon Sep 17 00:00:00 2001
From: David Roberts <dave.roberts@elastic.co>
Date: Wed, 29 Aug 2018 11:39:50 +0100
Subject: [PATCH] [ML] Fix character set finder bug with unencodable charsets

Some character sets cannot be encoded and this was tripping
up the binary data check in the ML log structure character
set finder.

The fix is to assume that if ICU4J identifies that some bytes
correspond to a character set that cannot be encoded and those
bytes contain zeroes then the data is binary rather than text.

Fixes #33227
---
 .../LogStructureFinderManager.java                   | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java b/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java
index 7f18445e505e3..a8fd9d7eb895b 100644
--- a/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java
+++ b/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java
@@ -163,9 +163,15 @@ CharsetMatch findCharset(List<String> explanation, InputStream inputStream) thro
                 // deduction algorithms on binary files is very slow as the binary files generally appear to
                 // have very long lines.
                 boolean spaceEncodingContainsZeroByte = false;
-                byte[] spaceBytes = " ".getBytes(name);
-                for (int i = 0; i < spaceBytes.length && spaceEncodingContainsZeroByte == false; ++i) {
-                    spaceEncodingContainsZeroByte = (spaceBytes[i] == 0);
+                Charset charset = Charset.forName(name);
+                // Some character sets cannot be encoded.  These are extremely rare so it's likely that
+                // they've been chosen based on incorrectly provided binary data.  Therefore, err on
+                // the side of rejecting binary data.
+                if (charset.canEncode()) {
+                    byte[] spaceBytes = " ".getBytes(charset);
+                    for (int i = 0; i < spaceBytes.length && spaceEncodingContainsZeroByte == false; ++i) {
+                        spaceEncodingContainsZeroByte = (spaceBytes[i] == 0);
+                    }
                 }
                 if (containsZeroBytes && spaceEncodingContainsZeroByte == false) {
                     explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() +