Commit 02247cc

Committed by Hendrik Muhs
[ML-DataFrame] adapt page size on circuit breaker responses (#41149)
Handle the circuit breaker response and adapt the page size to reduce memory pressure; reduce preview buckets to 100 and the initial page size to 500.
1 parent 043c1f5 commit 02247cc

File tree: 7 files changed (+409 -35 lines)


x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/dataframe/DataFrameMessages.java

Lines changed: 5 additions & 0 deletions
@@ -56,6 +56,11 @@ public class DataFrameMessages {
         "Failed to parse group_by for data frame pivot transform";
     public static final String LOG_DATA_FRAME_TRANSFORM_CONFIGURATION_BAD_AGGREGATION =
         "Failed to parse aggregation for data frame pivot transform";
+    public static final String LOG_DATA_FRAME_TRANSFORM_PIVOT_REDUCE_PAGE_SIZE =
+        "Search returned with out of memory error, reducing number of buckets per search from [{0}] to [{1}]";
+    public static final String LOG_DATA_FRAME_TRANSFORM_PIVOT_LOW_PAGE_SIZE_FAILURE =
+        "Search returned with out of memory error after repeated page size reductions to [{0}], unable to continue pivot, "
+            + "please simplify job or increase heap size on data nodes.";
 
     public static final String FAILED_TO_PARSE_TRANSFORM_CHECKPOINTS =
         "Failed to parse transform checkpoints for [{0}]";

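Note: the two new messages use positional placeholders. A minimal sketch of how they render, assuming DataFrameMessages.getMessage fills the {0}/{1} slots via java.text.MessageFormat (the placeholder syntax suggests this; the delegation itself is an assumption):

import java.text.MessageFormat;

public class ReducePageSizeMessageDemo {
    // message text copied from the diff above
    static final String REDUCE_PAGE_SIZE =
        "Search returned with out of memory error, reducing number of buckets per search from [{0}] to [{1}]";

    public static void main(String[] args) {
        // the audit entry written when a breaker trip shrinks the page from 500 to 365 buckets
        System.out.println(MessageFormat.format(REDUCE_PAGE_SIZE, 500, 365));
    }
}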
x-pack/plugin/data-frame/qa/single-node-tests/src/test/java/org/elasticsearch/xpack/dataframe/integration/DataFramePivotRestIT.java

Lines changed: 2 additions & 1 deletion
@@ -260,7 +260,8 @@ public void testPreviewTransform() throws Exception {
         createPreviewRequest.setJsonEntity(config);
         Map<String, Object> previewDataframeResponse = entityAsMap(client().performRequest(createPreviewRequest));
         List<Map<String, Object>> preview = (List<Map<String, Object>>)previewDataframeResponse.get("preview");
-        assertThat(preview.size(), equalTo(393));
+        // preview is limited to 100
+        assertThat(preview.size(), equalTo(100));
         Set<String> expectedFields = new HashSet<>(Arrays.asList("reviewer", "by_day", "avg_rating"));
         preview.forEach(p -> {
             Set<String> keys = p.keySet();

x-pack/plugin/data-frame/src/main/java/org/elasticsearch/xpack/dataframe/action/TransportPreviewDataFrameTransformAction.java

Lines changed: 2 additions & 1 deletion
@@ -35,6 +35,7 @@
 public class TransportPreviewDataFrameTransformAction extends
     HandledTransportAction<PreviewDataFrameTransformAction.Request, PreviewDataFrameTransformAction.Response> {
 
+    private static final int NUMBER_OF_PREVIEW_BUCKETS = 100;
     private final XPackLicenseState licenseState;
     private final Client client;
     private final ThreadPool threadPool;
@@ -77,7 +78,7 @@ private void getPreview(Pivot pivot, ActionListener<List<Map<String, Object>>> l
             ClientHelper.DATA_FRAME_ORIGIN,
             client,
             SearchAction.INSTANCE,
-            pivot.buildSearchRequest(null),
+            pivot.buildSearchRequest(null, NUMBER_OF_PREVIEW_BUCKETS),
             ActionListener.wrap(
                 r -> {
                     final CompositeAggregation agg = r.getAggregations().get(COMPOSITE_AGGREGATION_NAME);

x-pack/plugin/data-frame/src/main/java/org/elasticsearch/xpack/dataframe/transforms/DataFrameIndexer.java

Lines changed: 96 additions & 1 deletion
@@ -8,14 +8,21 @@
 
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.action.search.SearchPhaseExecutionException;
 import org.elasticsearch.action.search.SearchRequest;
 import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.action.search.ShardSearchFailure;
+import org.elasticsearch.common.breaker.CircuitBreakingException;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.index.query.QueryBuilder;
 import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregation;
+import org.elasticsearch.xpack.core.common.notifications.Auditor;
 import org.elasticsearch.xpack.core.dataframe.DataFrameField;
+import org.elasticsearch.xpack.core.dataframe.DataFrameMessages;
+import org.elasticsearch.xpack.core.dataframe.notifications.DataFrameAuditMessage;
 import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameIndexerTransformStats;
 import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformConfig;
 import org.elasticsearch.xpack.core.indexing.AsyncTwoPhaseIndexer;
@@ -26,6 +33,7 @@
 import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.util.Map;
+import java.util.Objects;
 import java.util.concurrent.Executor;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.stream.Collectors;
@@ -35,22 +43,34 @@
 
 public abstract class DataFrameIndexer extends AsyncTwoPhaseIndexer<Map<String, Object>, DataFrameIndexerTransformStats> {
 
+    public static final int MINIMUM_PAGE_SIZE = 10;
     public static final String COMPOSITE_AGGREGATION_NAME = "_data_frame";
     private static final Logger logger = LogManager.getLogger(DataFrameIndexer.class);
 
+    protected final Auditor<DataFrameAuditMessage> auditor;
+
     private Pivot pivot;
+    private int pageSize = 0;
 
     public DataFrameIndexer(Executor executor,
+                            Auditor<DataFrameAuditMessage> auditor,
                             AtomicReference<IndexerState> initialState,
                             Map<String, Object> initialPosition,
                             DataFrameIndexerTransformStats jobStats) {
         super(executor, initialState, initialPosition, jobStats);
+        this.auditor = Objects.requireNonNull(auditor);
     }
 
     protected abstract DataFrameTransformConfig getConfig();
 
     protected abstract Map<String, String> getFieldMappings();
 
+    protected abstract void failIndexer(String message);
+
+    public int getPageSize() {
+        return pageSize;
+    }
+
     /**
      * Request a checkpoint
      */
@@ -62,6 +82,11 @@ protected void onStart(long now, ActionListener<Void> listener) {
         QueryBuilder queryBuilder = getConfig().getSource().getQueryConfig().getQuery();
         pivot = new Pivot(getConfig().getSource().getIndex(), queryBuilder, getConfig().getPivotConfig());
 
+        // set the page size if it has not been set yet; if it is already set, it may have been reduced after an out of memory
+        if (pageSize == 0) {
+            pageSize = pivot.getInitialPageSize();
+        }
+
         // if run for the 1st time, create checkpoint
         if (getPosition() == null) {
             createCheckpoint(listener);
@@ -73,6 +98,12 @@ protected void onStart(long now, ActionListener<Void> listener) {
         }
     }
 
+    @Override
+    protected void onFinish(ActionListener<Void> listener) {
+        // reset the page size so we do not keep a reduced page size forever; it is re-calculated on start
+        pageSize = 0;
+    }
+
     @Override
     protected IterationResult<Map<String, Object>> doProcess(SearchResponse searchResponse) {
         final CompositeAggregation agg = searchResponse.getAggregations().get(COMPOSITE_AGGREGATION_NAME);
@@ -121,6 +152,70 @@ private Stream<IndexRequest> processBucketsToIndexRequests(CompositeAggregation
 
     @Override
     protected SearchRequest buildSearchRequest() {
-        return pivot.buildSearchRequest(getPosition());
+        return pivot.buildSearchRequest(getPosition(), pageSize);
+    }
+
+    /**
+     * Handle the circuit breaking case: a search consumed too much memory and got aborted.
+     *
+     * When going out of memory we smoothly reduce the page size, which reduces memory consumption.
+     *
+     * Implementation details: we take the values from the circuit breaker as a hint, but note that
+     * it breaks early; that's why we also reduce using a log-scaled factor of the current page size.
+     *
+     * @param e Exception thrown, only {@link CircuitBreakingException} are handled
+     * @return true if the exception was handled, false if not
+     */
+    protected boolean handleCircuitBreakingException(Exception e) {
+        CircuitBreakingException circuitBreakingException = getCircuitBreakingException(e);
+
+        if (circuitBreakingException == null) {
+            return false;
+        }
+
+        double reducingFactor = Math.min((double) circuitBreakingException.getByteLimit() / circuitBreakingException.getBytesWanted(),
+            1 - (Math.log10(pageSize) * 0.1));
+
+        int newPageSize = (int) Math.round(reducingFactor * pageSize);
+
+        if (newPageSize < MINIMUM_PAGE_SIZE) {
+            String message = DataFrameMessages.getMessage(DataFrameMessages.LOG_DATA_FRAME_TRANSFORM_PIVOT_LOW_PAGE_SIZE_FAILURE, pageSize);
+            failIndexer(message);
+            return true;
+        }
+
+        String message = DataFrameMessages.getMessage(DataFrameMessages.LOG_DATA_FRAME_TRANSFORM_PIVOT_REDUCE_PAGE_SIZE, pageSize,
+            newPageSize);
+        auditor.info(getJobId(), message);
+        logger.info("Data frame transform [" + getJobId() + "]: " + message);
+
+        pageSize = newPageSize;
+        return true;
+    }
+
+    /**
+     * Inspect the exception for a circuit breaking exception and return the first one found.
+     *
+     * @param e Exception
+     * @return CircuitBreakingException instance if found, null otherwise
+     */
+    private static CircuitBreakingException getCircuitBreakingException(Exception e) {
+        // circuit breaking exceptions are at the bottom
+        Throwable unwrappedThrowable = ExceptionsHelper.unwrapCause(e);
+
+        if (unwrappedThrowable instanceof CircuitBreakingException) {
+            return (CircuitBreakingException) unwrappedThrowable;
+        } else if (unwrappedThrowable instanceof SearchPhaseExecutionException) {
+            SearchPhaseExecutionException searchPhaseException = (SearchPhaseExecutionException) e;
+            for (ShardSearchFailure shardFailure : searchPhaseException.shardFailures()) {
+                Throwable unwrappedShardFailure = ExceptionsHelper.unwrapCause(shardFailure.getCause());
+
+                if (unwrappedShardFailure instanceof CircuitBreakingException) {
+                    return (CircuitBreakingException) unwrappedShardFailure;
+                }
+            }
+        }
+
+        return null;
     }
 }
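To see how the page size shrinks under the formula in handleCircuitBreakingException above, here is a small standalone sketch (hypothetical class and method names, not part of the commit) that replays the arithmetic:

public class PageSizeReductionDemo {
    static final int MINIMUM_PAGE_SIZE = 10;

    // byteLimit and bytesWanted would come from the CircuitBreakingException
    static int reduce(int pageSize, long byteLimit, long bytesWanted) {
        // take the breaker's limit/wanted ratio as a hint, but cap the factor with a
        // log-scaled term so larger pages are cut harder (the breaker trips early)
        double reducingFactor = Math.min((double) byteLimit / bytesWanted,
            1 - (Math.log10(pageSize) * 0.1));
        return (int) Math.round(reducingFactor * pageSize);
    }

    public static void main(String[] args) {
        // min(1000/1200, 1 - log10(500) * 0.1) = min(0.833, 0.730), so 500 * 0.730 ~= 365
        System.out.println(reduce(500, 1000, 1200)); // prints 365
        // a small page cut by a hard breaker drops below MINIMUM_PAGE_SIZE, which fails the indexer
        System.out.println(reduce(12, 500, 1200));   // prints 5
    }
}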

x-pack/plugin/data-frame/src/main/java/org/elasticsearch/xpack/dataframe/transforms/DataFrameTransformTask.java

Lines changed: 44 additions & 30 deletions
@@ -277,25 +277,6 @@ void persistStateToClusterState(DataFrameTransformState state,
         ));
     }
 
-    private boolean isIrrecoverableFailure(Exception e) {
-        return e instanceof IndexNotFoundException || e instanceof DataFrameConfigurationException;
-    }
-
-    synchronized void handleFailure(Exception e) {
-        if (isIrrecoverableFailure(e) || failureCount.incrementAndGet() > MAX_CONTINUOUS_FAILURES) {
-            String failureMessage = isIrrecoverableFailure(e) ?
-                "task encountered irrecoverable failure: " + e.getMessage() :
-                "task encountered more than " + MAX_CONTINUOUS_FAILURES + " failures; latest failure: " + e.getMessage();
-            auditor.error(transform.getId(), failureMessage);
-            stateReason.set(failureMessage);
-            taskState.set(DataFrameTransformTaskState.FAILED);
-            persistStateToClusterState(getState(), ActionListener.wrap(
-                r -> failureCount.set(0), // Successfully marked as failed, reset counter so that task can be restarted
-                exception -> {} // Noop, internal method logs the failure to update the state
-            ));
-        }
-    }
-
     /**
      * This is called when the persistent task signals that the allocated task should be terminated.
      * Termination in the task framework is essentially voluntary, as the allocated task can only be
@@ -313,13 +294,11 @@ public synchronized void onCancelled() {
 
     protected class ClientDataFrameIndexer extends DataFrameIndexer {
         private static final int LOAD_TRANSFORM_TIMEOUT_IN_SECONDS = 30;
-        private static final int CREATE_CHECKPOINT_TIMEOUT_IN_SECONDS = 30;
 
         private final Client client;
         private final DataFrameTransformsConfigManager transformsConfigManager;
         private final DataFrameTransformsCheckpointService transformsCheckpointService;
         private final String transformId;
-        private final Auditor<DataFrameAuditMessage> auditor;
         private volatile DataFrameIndexerTransformStats previouslyPersistedStats = null;
         // Keeps track of the last exception that was written to our audit, keeps us from spamming the audit index
         private volatile String lastAuditedExceptionMessage = null;
@@ -331,13 +310,12 @@ public ClientDataFrameIndexer(String transformId, DataFrameTransformsConfigManag
                                       DataFrameTransformsCheckpointService transformsCheckpointService,
                                       AtomicReference<IndexerState> initialState, Map<String, Object> initialPosition, Client client,
                                       Auditor<DataFrameAuditMessage> auditor) {
-            super(threadPool.executor(ThreadPool.Names.GENERIC), initialState, initialPosition,
+            super(threadPool.executor(ThreadPool.Names.GENERIC), auditor, initialState, initialPosition,
                 new DataFrameIndexerTransformStats(transformId));
             this.transformId = transformId;
             this.transformsConfigManager = transformsConfigManager;
             this.transformsCheckpointService = transformsCheckpointService;
             this.client = client;
-            this.auditor = auditor;
         }
 
         @Override
@@ -474,19 +452,26 @@ protected void doSaveState(IndexerState indexerState, Map<String, Object> positi
 
         @Override
        protected void onFailure(Exception exc) {
-            // Since our schedule fires again very quickly after failures it is possible to run into the same failure numerous
-            // times in a row, very quickly. We do not want to spam the audit log with repeated failures, so only record the first one
-            if (exc.getMessage().equals(lastAuditedExceptionMessage) == false) {
-                auditor.warning(transform.getId(), "Data frame transform encountered an exception: " + exc.getMessage());
-                lastAuditedExceptionMessage = exc.getMessage();
+            // the failure handler must not throw an exception due to internal problems
+            try {
+                logger.warn("Data frame transform [" + transform.getId() + "] encountered an exception: ", exc);
+
+                // Since our schedule fires again very quickly after failures it is possible to run into the same failure numerous
+                // times in a row, very quickly. We do not want to spam the audit log with repeated failures, so only record the first one
+                if (exc.getMessage().equals(lastAuditedExceptionMessage) == false) {
+                    auditor.warning(transform.getId(), "Data frame transform encountered an exception: " + exc.getMessage());
+                    lastAuditedExceptionMessage = exc.getMessage();
+                }
+                handleFailure(exc);
+            } catch (Exception e) {
+                logger.error("Data frame transform encountered an unexpected internal exception: ", e);
             }
-            logger.warn("Data frame transform [" + transform.getId() + "] encountered an exception: ", exc);
-            handleFailure(exc);
         }
 
         @Override
         protected void onFinish(ActionListener<Void> listener) {
             try {
+                super.onFinish(listener);
                 long checkpoint = currentCheckpoint.incrementAndGet();
                 auditor.info(transform.getId(), "Finished indexing for data frame transform checkpoint [" + checkpoint + "]");
                 logger.info("Finished indexing for data frame transform [" + transform.getId() + "] checkpoint [" + checkpoint + "]");
@@ -515,6 +500,35 @@ protected void createCheckpoint(ActionListener<Void> listener) {
                 listener.onFailure(new RuntimeException("Failed to retrieve checkpoint", getCheckPointException));
             }));
         }
+
+        private boolean isIrrecoverableFailure(Exception e) {
+            return e instanceof IndexNotFoundException || e instanceof DataFrameConfigurationException;
+        }
+
+        synchronized void handleFailure(Exception e) {
+            if (handleCircuitBreakingException(e)) {
+                return;
+            }
+
+            if (isIrrecoverableFailure(e) || failureCount.incrementAndGet() > MAX_CONTINUOUS_FAILURES) {
+                String failureMessage = isIrrecoverableFailure(e) ?
+                    "task encountered irrecoverable failure: " + e.getMessage() :
+                    "task encountered more than " + MAX_CONTINUOUS_FAILURES + " failures; latest failure: " + e.getMessage();
+                failIndexer(failureMessage);
+            }
+        }
+
+        @Override
+        protected void failIndexer(String failureMessage) {
+            logger.error("Data frame transform [" + getJobId() + "]: " + failureMessage);
+            auditor.error(transform.getId(), failureMessage);
+            stateReason.set(failureMessage);
+            taskState.set(DataFrameTransformTaskState.FAILED);
+            persistStateToClusterState(DataFrameTransformTask.this.getState(), ActionListener.wrap(
+                r -> failureCount.set(0), // Successfully marked as failed, reset counter so that task can be restarted
+                exception -> {} // Noop, internal method logs the failure to update the state
+            ));
+        }
     }
 
     class DataFrameConfigurationException extends RuntimeException {
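Moving handleFailure and failIndexer into ClientDataFrameIndexer lets the circuit-breaker case short-circuit before any failure is counted, so breaker trips shrink the page size instead of pushing the task toward FAILED. A minimal standalone sketch (stub methods, hypothetical names) of the resulting control flow:

import java.util.concurrent.atomic.AtomicInteger;

public class FailureHandlingSketch {
    static final int MAX_CONTINUOUS_FAILURES = 10;
    final AtomicInteger failureCount = new AtomicInteger(0);

    boolean handleCircuitBreakingException(Exception e) { return false; } // stub: would reduce the page size
    boolean isIrrecoverableFailure(Exception e) { return false; }         // stub
    void failIndexer(String message) { }                                  // stub: audit, set FAILED, persist state

    synchronized void handleFailure(Exception e) {
        if (handleCircuitBreakingException(e)) {
            return; // recoverable: retry with a smaller page, never counted as a failure
        }
        if (isIrrecoverableFailure(e) || failureCount.incrementAndGet() > MAX_CONTINUOUS_FAILURES) {
            failIndexer("task failure: " + e.getMessage());
        }
    }
}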

x-pack/plugin/data-frame/src/main/java/org/elasticsearch/xpack/dataframe/transforms/pivot/Pivot.java

Lines changed: 21 additions & 2 deletions
@@ -35,6 +35,8 @@
 import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
 
 public class Pivot {
+    public static final int DEFAULT_INITIAL_PAGE_SIZE = 500;
+
     private static final String COMPOSITE_AGGREGATION_NAME = "_data_frame";
 
     private final PivotConfig config;
@@ -68,11 +70,29 @@ public void deduceMappings(Client client, final ActionListener<Map<String, Strin
         SchemaUtil.deduceMappings(client, config, source, listener);
     }
 
-    public SearchRequest buildSearchRequest(Map<String, Object> position) {
+    /**
+     * Get the initial page size for this pivot.
+     *
+     * The page size is the main parameter for adjusting memory consumption. Memory consumption mainly depends on
+     * the page size, the type of aggregations and the data. As the page size is the number of buckets we return
+     * per page, the page size is a multiplier for the cost of aggregating each bucket.
+     *
+     * Initially this returns a default; in the future it might inspect the configuration and base the initial size
+     * on the aggregations used.
+     *
+     * @return the page size
+     */
+    public int getInitialPageSize() {
+        return DEFAULT_INITIAL_PAGE_SIZE;
+    }
+
+    public SearchRequest buildSearchRequest(Map<String, Object> position, int pageSize) {
         if (position != null) {
             cachedCompositeAggregation.aggregateAfter(position);
         }
 
+        cachedCompositeAggregation.size(pageSize);
+
         return cachedSearchRequest;
     }
@@ -127,7 +147,6 @@ private static CompositeAggregationBuilder createCompositeAggregation(PivotConfi
             XContentParser parser = builder.generator().contentType().xContent().createParser(NamedXContentRegistry.EMPTY,
                 LoggingDeprecationHandler.INSTANCE, BytesReference.bytes(builder).streamInput());
             compositeAggregation = CompositeAggregationBuilder.parse(COMPOSITE_AGGREGATION_NAME, parser);
-            compositeAggregation.size(1000);
             config.getAggregationConfig().getAggregatorFactories().forEach(agg -> compositeAggregation.subAggregation(agg));
         } catch (IOException e) {
             throw new RuntimeException(DataFrameMessages.DATA_FRAME_TRANSFORM_PIVOT_FAILED_TO_CREATE_COMPOSITE_AGGREGATION, e);
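For context, a minimal sketch of paging a composite aggregation the way Pivot now does, building a fresh request instead of the cached one Pivot keeps (the index name, field, and class here are illustrative; it assumes the Elasticsearch server artifact on the classpath):

import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.composite.TermsValuesSourceBuilder;
import org.elasticsearch.search.builder.SearchSourceBuilder;

import java.util.Collections;
import java.util.Map;

public class PivotPagingSketch {
    public static SearchRequest buildSearchRequest(CompositeAggregationBuilder compositeAgg,
                                                   Map<String, Object> position,
                                                   int pageSize) {
        if (position != null) {
            compositeAgg.aggregateAfter(position); // resume from the last bucket key of the previous page
        }
        compositeAgg.size(pageSize); // buckets per page, now adjustable per request
        return new SearchRequest("source-index")
            .source(new SearchSourceBuilder().size(0).aggregation(compositeAgg));
    }

    public static void main(String[] args) {
        CompositeAggregationBuilder agg = new CompositeAggregationBuilder("_data_frame",
            Collections.singletonList(new TermsValuesSourceBuilder("reviewer").field("reviewer")));
        buildSearchRequest(agg, null, 500); // the first page uses the initial page size default
    }
}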
