From fe03526b7ab180ad87224ca53ba07f6eed58682e Mon Sep 17 00:00:00 2001 From: udaysagar2177 Date: Wed, 10 Jul 2019 00:04:02 -0700 Subject: [PATCH 1/3] Fix incorrect escape in Gradient2Limit When we have a high limit, we skip adjusting the limit even when requests start taking longer times to finish. --- .../limits/limit/Gradient2Limit.java | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java b/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java index 11ec7d57..9ad77511 100644 --- a/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java +++ b/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java @@ -39,11 +39,14 @@ * applied to the base RTT so that the value is kept stable yet is allowed to adapt to long term changes in latency * characteristics. * - * The core algorithm re-calculates the limit every sampling window (ex. 1 second) using the formula + * The core algorithm re-calculates the limit on each request * * // Calculate the gradient limiting to the range [0.5, 1.0] to filter outliers * gradient = max(0.5, min(1.0, longtermRtt / currentRtt)); * + * // If gradient is 1.0 and in-flight requests are less than half of the currentLimit, skip + * // adjusting the limit. + * * // Calculate the new limit by applying the gradient and allowing for some queuing * newLimit = gradient * currentLimit + queueSize; * @@ -60,7 +63,7 @@ * 2. Transition from steady state to load * * In this state either the RPS to latency has spiked. The gradient is {@literal <} 1.0 due to a growing request queue that - * cannot be handled by the system. Excessive requests and rejected due to the low limit. The baseline RTT grows using + * cannot be handled by the system. Excessive requests are rejected due to the low limit. The baseline RTT grows using * exponential decay but lags the current measurement, which keeps the gradient {@literal <} 1.0 and limit low. * * 3. Transition from load to steady state @@ -275,16 +278,17 @@ public int _update(final long startTime, final long rtt, final int inflight, fin this.longRtt.update(current -> current.doubleValue() * 0.95); } - // Don't grow the limit if we are app limited - if (inflight < estimatedLimit / 2) { - return (int) estimatedLimit; - } - - // Rtt could be higher than rtt_noload because of smoothing rtt noload updates + // Rtt could be higher than longRtt because of smoothing longRtt updates // so set to 1.0 to indicate no queuing. Otherwise calculate the slope and don't - // allow it to be reduced by more than half to avoid aggressive load-shedding due to + // allow it to be reduced by not more than half to avoid aggressive load-shedding due to // outliers. final double gradient = Math.max(0.5, Math.min(1.0, tolerance * longRtt / shortRtt)); + + // Don't grow the limit if not necessary + if (gradient == 1.0 && inflight < estimatedLimit / 2) { + return (int) estimatedLimit; + } + double newLimit = estimatedLimit * gradient + queueSize; newLimit = estimatedLimit * (1 - smoothing) + newLimit * smoothing; newLimit = Math.max(minLimit, Math.min(maxLimit, newLimit)); From f7aec2c86aa578814037a226577fd742a664cbe0 Mon Sep 17 00:00:00 2001 From: udaysagar2177 Date: Wed, 10 Jul 2019 00:13:30 -0700 Subject: [PATCH 2/3] consider dropped requests in Gradient2Limit --- .../netflix/concurrency/limits/limit/Gradient2Limit.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java b/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java index 9ad77511..1a6b264f 100644 --- a/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java +++ b/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java @@ -280,9 +280,11 @@ public int _update(final long startTime, final long rtt, final int inflight, fin // Rtt could be higher than longRtt because of smoothing longRtt updates // so set to 1.0 to indicate no queuing. Otherwise calculate the slope and don't - // allow it to be reduced by not more than half to avoid aggressive load-shedding due to - // outliers. - final double gradient = Math.max(0.5, Math.min(1.0, tolerance * longRtt / shortRtt)); + // allow it to be reduced by more than half to avoid aggressive load-shedding due to + // outliers. If a request is dropped, choose lowest gradient value to initiate + // load-shedding. + final double gradient = didDrop ? 0.5 + : Math.max(0.5, Math.min(1.0, tolerance * longRtt / shortRtt)); // Don't grow the limit if not necessary if (gradient == 1.0 && inflight < estimatedLimit / 2) { From cc849e0750a0522c71547d30f68a9d8f6481d5d8 Mon Sep 17 00:00:00 2001 From: udaysagar2177 Date: Wed, 10 Jul 2019 01:02:16 -0700 Subject: [PATCH 3/3] use queueSize in Gradient2Limit only when latencies are low --- .../concurrency/limits/limit/Gradient2Limit.java | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java b/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java index 1a6b264f..f3b53e9b 100644 --- a/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java +++ b/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java @@ -261,15 +261,12 @@ private Gradient2Limit(Builder builder) { @Override public int _update(final long startTime, final long rtt, final int inflight, final boolean didDrop) { - final double queueSize = this.queueSize.apply((int)this.estimatedLimit); - this.lastRtt = rtt; final double shortRtt = (double)rtt; final double longRtt = this.longRtt.add(rtt).doubleValue(); shortRttSampleListener.addSample(shortRtt); longRttSampleListener.addSample(longRtt); - queueSizeSampleListener.addSample(queueSize); // If the long RTT is substantially larger than the short RTT then reduce the long RTT measurement. // This can happen when latency returns to normal after a prolonged prior of excessive load. Reducing the @@ -286,9 +283,14 @@ public int _update(final long startTime, final long rtt, final int inflight, fin final double gradient = didDrop ? 0.5 : Math.max(0.5, Math.min(1.0, tolerance * longRtt / shortRtt)); - // Don't grow the limit if not necessary - if (gradient == 1.0 && inflight < estimatedLimit / 2) { - return (int) estimatedLimit; + double queueSize = 0; + if (gradient == 1.0) { + if (inflight < estimatedLimit / 2) { + // Don't grow the limit if not necessary + return (int) estimatedLimit; + } + queueSize = this.queueSize.apply((int)this.estimatedLimit); + queueSizeSampleListener.addSample(queueSize); } double newLimit = estimatedLimit * gradient + queueSize;