77
88import com .ibm .icu .text .CharsetDetector ;
99import com .ibm .icu .text .CharsetMatch ;
10+ import org .elasticsearch .ElasticsearchTimeoutException ;
1011import org .elasticsearch .common .collect .Tuple ;
12+ import org .elasticsearch .common .unit .TimeValue ;
1113
1214import java .io .BufferedInputStream ;
1315import java .io .BufferedReader ;
2325import java .util .HashSet ;
2426import java .util .List ;
2527import java .util .Locale ;
28+ import java .util .Objects ;
2629import java .util .Optional ;
2730import java .util .Set ;
31+ import java .util .concurrent .ScheduledExecutorService ;
2832import java .util .stream .Collectors ;
2933
3034/**
3135 * Runs the high-level steps needed to create ingest configs for the specified file. In order:
3236 * 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
3337 * 2. Load a sample of the file, consisting of the first 1000 lines of the file
34- * 3. Determine the most likely file structure - one of ND-JSON, XML, CSV, TSV or semi-structured text
38+ * 3. Determine the most likely file structure - one of ND-JSON, XML, delimited or semi-structured text
3539 * 4. Create an appropriate structure object and delegate writing configs to it
3640 */
3741public final class FileStructureFinderManager {
@@ -81,8 +85,18 @@ public final class FileStructureFinderManager {
8185
8286 private static final int BUFFER_SIZE = 8192 ;
8387
88+ private final ScheduledExecutorService scheduler ;
89+
90+ /**
91+ * Create the file structure manager.
92+ * @param scheduler Used for checking timeouts.
93+ */
94+ public FileStructureFinderManager (ScheduledExecutorService scheduler ) {
95+ this .scheduler = Objects .requireNonNull (scheduler );
96+ }
97+
8498 public FileStructureFinder findFileStructure (Integer idealSampleLineCount , InputStream fromFile ) throws Exception {
85- return findFileStructure (idealSampleLineCount , fromFile , FileStructureOverrides .EMPTY_OVERRIDES );
99+ return findFileStructure (idealSampleLineCount , fromFile , FileStructureOverrides .EMPTY_OVERRIDES , null );
86100 }
87101
88102 /**
@@ -95,42 +109,49 @@ public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Input
95109 * @param overrides Aspects of the file structure that are known in advance. These take precedence over
96110 * values determined by structure analysis. An exception will be thrown if the file structure
97111 * is incompatible with an overridden value.
112+ * @param timeout The maximum time the analysis is permitted to take. If it takes longer than this an
113+ * {@link ElasticsearchTimeoutException} may be thrown (although not necessarily immediately
114+ * the timeout is exceeded).
98115 * @return A {@link FileStructureFinder} object from which the structure and messages can be queried.
99116 * @throws Exception A variety of problems could occur at various stages of the structure finding process.
100117 */
101- public FileStructureFinder findFileStructure (Integer idealSampleLineCount , InputStream fromFile , FileStructureOverrides overrides )
118+ public FileStructureFinder findFileStructure (Integer idealSampleLineCount , InputStream fromFile , FileStructureOverrides overrides ,
119+ TimeValue timeout )
102120 throws Exception {
103121 return findFileStructure (new ArrayList <>(), (idealSampleLineCount == null ) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount ,
104- fromFile , overrides );
122+ fromFile , overrides , timeout );
105123 }
106124
107125 public FileStructureFinder findFileStructure (List <String > explanation , int idealSampleLineCount , InputStream fromFile )
108126 throws Exception {
109- return findFileStructure (new ArrayList <>() , idealSampleLineCount , fromFile , FileStructureOverrides .EMPTY_OVERRIDES );
127+ return findFileStructure (explanation , idealSampleLineCount , fromFile , FileStructureOverrides .EMPTY_OVERRIDES , null );
110128 }
111129
112130 public FileStructureFinder findFileStructure (List <String > explanation , int idealSampleLineCount , InputStream fromFile ,
113- FileStructureOverrides overrides ) throws Exception {
114-
115- String charsetName = overrides .getCharset ();
116- Reader sampleReader ;
117- if (charsetName != null ) {
118- // Creating the reader will throw if the specified character set does not exist
119- sampleReader = new InputStreamReader (fromFile , charsetName );
120- explanation .add ("Using specified character encoding [" + charsetName + "]" );
121- } else {
122- CharsetMatch charsetMatch = findCharset (explanation , fromFile );
123- charsetName = charsetMatch .getName ();
124- sampleReader = charsetMatch .getReader ();
125- }
131+ FileStructureOverrides overrides , TimeValue timeout ) throws Exception {
132+
133+ try (TimeoutChecker timeoutChecker = new TimeoutChecker ("structure analysis" , timeout , scheduler )) {
134+
135+ String charsetName = overrides .getCharset ();
136+ Reader sampleReader ;
137+ if (charsetName != null ) {
138+ // Creating the reader will throw if the specified character set does not exist
139+ sampleReader = new InputStreamReader (fromFile , charsetName );
140+ explanation .add ("Using specified character encoding [" + charsetName + "]" );
141+ } else {
142+ CharsetMatch charsetMatch = findCharset (explanation , fromFile , timeoutChecker );
143+ charsetName = charsetMatch .getName ();
144+ sampleReader = charsetMatch .getReader ();
145+ }
126146
127- Tuple <String , Boolean > sampleInfo = sampleFile (sampleReader , charsetName , MIN_SAMPLE_LINE_COUNT ,
128- Math .max (MIN_SAMPLE_LINE_COUNT , idealSampleLineCount ));
147+ Tuple <String , Boolean > sampleInfo = sampleFile (sampleReader , charsetName , MIN_SAMPLE_LINE_COUNT ,
148+ Math .max (MIN_SAMPLE_LINE_COUNT , idealSampleLineCount ), timeoutChecker );
129149
130- return makeBestStructureFinder (explanation , sampleInfo .v1 (), charsetName , sampleInfo .v2 (), overrides );
150+ return makeBestStructureFinder (explanation , sampleInfo .v1 (), charsetName , sampleInfo .v2 (), overrides , timeoutChecker );
151+ }
131152 }
132153
133- CharsetMatch findCharset (List <String > explanation , InputStream inputStream ) throws Exception {
154+ CharsetMatch findCharset (List <String > explanation , InputStream inputStream , TimeoutChecker timeoutChecker ) throws Exception {
134155
135156 // We need an input stream that supports mark and reset, so wrap the argument
136157 // in a BufferedInputStream if it doesn't already support this feature
@@ -141,6 +162,7 @@ CharsetMatch findCharset(List<String> explanation, InputStream inputStream) thro
141162 // This is from ICU4J
142163 CharsetDetector charsetDetector = new CharsetDetector ().setText (inputStream );
143164 CharsetMatch [] charsetMatches = charsetDetector .detectAll ();
165+ timeoutChecker .check ("character set detection" );
144166
145167 // Determine some extra characteristics of the input to compensate for some deficiencies of ICU4J
146168 boolean pureAscii = true ;
@@ -164,6 +186,7 @@ CharsetMatch findCharset(List<String> explanation, InputStream inputStream) thro
164186 remainingLength -= bytesRead ;
165187 } while (containsZeroBytes == false && remainingLength > 0 );
166188 inputStream .reset ();
189+ timeoutChecker .check ("character set detection" );
167190
168191 if (pureAscii ) {
169192 // If the input is pure ASCII then many single byte character sets will match. We want to favour
@@ -220,7 +243,7 @@ CharsetMatch findCharset(List<String> explanation, InputStream inputStream) thro
220243 }
221244
222245 FileStructureFinder makeBestStructureFinder (List <String > explanation , String sample , String charsetName , Boolean hasByteOrderMarker ,
223- FileStructureOverrides overrides ) throws Exception {
246+ FileStructureOverrides overrides , TimeoutChecker timeoutChecker ) throws Exception {
224247
225248 Character delimiter = overrides .getDelimiter ();
226249 Character quote = overrides .getQuote ();
@@ -250,16 +273,18 @@ FileStructureFinder makeBestStructureFinder(List<String> explanation, String sam
250273 }
251274
252275 for (FileStructureFinderFactory factory : factories ) {
276+ timeoutChecker .check ("high level format detection" );
253277 if (factory .canCreateFromSample (explanation , sample )) {
254- return factory .createFromSample (explanation , sample , charsetName , hasByteOrderMarker , overrides );
278+ return factory .createFromSample (explanation , sample , charsetName , hasByteOrderMarker , overrides , timeoutChecker );
255279 }
256280 }
257281
258282 throw new IllegalArgumentException ("Input did not match " +
259283 ((overrides .getFormat () == null ) ? "any known formats" : "the specified format [" + overrides .getFormat () + "]" ));
260284 }
261285
262- private Tuple <String , Boolean > sampleFile (Reader reader , String charsetName , int minLines , int maxLines ) throws IOException {
286+ private Tuple <String , Boolean > sampleFile (Reader reader , String charsetName , int minLines , int maxLines , TimeoutChecker timeoutChecker )
287+ throws IOException {
263288
264289 int lineCount = 0 ;
265290 BufferedReader bufferedReader = new BufferedReader (reader );
@@ -283,6 +308,7 @@ private Tuple<String, Boolean> sampleFile(Reader reader, String charsetName, int
283308 String line ;
284309 while ((line = bufferedReader .readLine ()) != null && ++lineCount <= maxLines ) {
285310 sample .append (line ).append ('\n' );
311+ timeoutChecker .check ("sample line splitting" );
286312 }
287313
288314 if (lineCount < minLines ) {
0 commit comments