Allow scaling parameters to be changed from the command line

bboreham · bboreham · commit 8497e4f93535 · 2018-06-08T13:59:15.000Z
diff --git a/pkg/chunk/aws/metrics_autoscaling.go b/pkg/chunk/aws/metrics_autoscaling.go
@@ -19,19 +19,19 @@ import (
 const (
 	cachePromDataFor       = 30 * time.Second
 	queueObservationPeriod = 2 * time.Minute
-	queueLengthScaledown   = 10000   // consider scaling down if queue smaller than this
-	queueLengthAcceptable  = 100000  // we don't mind queues smaller than this
-	queueLengthMax         = 1000000 // always scale up if queue bigger than this
+	targetScaledown        = 0.1 // consider scaling down if queue smaller than this times target
+	targetMax              = 10  // always scale up if queue bigger than this times target
 	errorFractionScaledown = 0.1
 	scaledown              = 0.9
 	scaleup                = 1.2
 )
 
 type metricsData struct {
-	promAPI      promV1.API
-	lastUpdated  time.Time
-	queueLengths []float64
-	errorRates   map[string]float64
+	queueLengthTarget int64
+	promAPI           promV1.API
+	lastUpdated       time.Time
+	queueLengths      []float64
+	errorRates        map[string]float64
 }
 
 func (d dynamoTableClient) metricsAutoScale(ctx context.Context, current, expected *chunk.TableDesc) error {
@@ -49,14 +49,14 @@ func (d dynamoTableClient) metricsAutoScale(ctx context.Context, current, expect
 
 	switch {
 	case errorRate < errorFractionScaledown*float64(current.ProvisionedWrite) &&
-		m.queueLengths[2] < queueLengthScaledown:
+		m.queueLengths[2] < float64(m.queueLengthTarget)*targetScaledown:
 		// No big queue, low errors -> scale down
 		scaleDownWrite(current, expected, int64(float64(current.ProvisionedWrite)*scaledown), "metrics scale-down")
-	case errorRate > 0 && m.queueLengths[2] > queueLengthMax:
+	case errorRate > 0 && m.queueLengths[2] > float64(m.queueLengthTarget)*targetMax:
 		// Too big queue, some errors -> scale up
 		scaleUpWrite(current, expected, int64(float64(current.ProvisionedWrite)*scaleup), "metrics max queue scale-up")
 	case errorRate > 0 &&
-		m.queueLengths[2] > queueLengthAcceptable &&
+		m.queueLengths[2] > float64(m.queueLengthTarget) &&
 		m.queueLengths[2] > m.queueLengths[1] && m.queueLengths[1] > m.queueLengths[0]:
 		// Growing queue, some errors -> scale up
 		scaleUpWrite(current, expected, int64(float64(current.ProvisionedWrite)*scaleup), "metrics queue growing scale-up")
@@ -93,7 +93,10 @@ func newMetrics(cfg DynamoDBConfig) (*metricsData, error) {
 		}
 		promAPI = promV1.NewAPI(client)
 	}
-	return &metricsData{promAPI: promAPI}, nil
+	return &metricsData{
+		promAPI:           promAPI,
+		queueLengthTarget: cfg.MetricsTargetQueueLen,
+	}, nil
 }
 
 func (m *metricsData) update(ctx context.Context) error {
diff --git a/pkg/chunk/aws/metrics_autoscaling_test.go b/pkg/chunk/aws/metrics_autoscaling_test.go
@@ -24,7 +24,7 @@ func TestTableManagerMetricsAutoScaling(t *testing.T) {
 
 	client := dynamoTableClient{
 		DynamoDB: dynamoDB,
-		metrics:  &metricsData{promAPI: &mockProm},
+		metrics:  &metricsData{promAPI: &mockProm, queueLengthTarget: 100000},
 	}
 
 	// Set up table-manager config
diff --git a/pkg/chunk/aws/storage_client.go b/pkg/chunk/aws/storage_client.go
@@ -104,6 +104,7 @@ type DynamoDBConfig struct {
 	APILimit               float64
 	ApplicationAutoScaling util.URLValue
 	MetricsURL             string
+	MetricsTargetQueueLen  int64
 	ChunkGangSize          int
 	ChunkGetMaxParallelism int
 	backoffConfig          util.BackoffConfig
@@ -116,6 +117,7 @@ func (cfg *DynamoDBConfig) RegisterFlags(f *flag.FlagSet) {
 	f.Float64Var(&cfg.APILimit, "dynamodb.api-limit", 2.0, "DynamoDB table management requests per second limit.")
 	f.Var(&cfg.ApplicationAutoScaling, "applicationautoscaling.url", "ApplicationAutoscaling endpoint URL with escaped Key and Secret encoded.")
 	f.StringVar(&cfg.MetricsURL, "metrics.url", "", "Use metrics-based autoscaling, via this query URL")
+	f.Int64Var(&cfg.MetricsTargetQueueLen, "metrics.target-queue-length", 100000, "Queue length above which we will scale up capacity")
 	f.IntVar(&cfg.ChunkGangSize, "dynamodb.chunk.gang.size", 10, "Number of chunks to group together to parallelise fetches (zero to disable)")
 	f.IntVar(&cfg.ChunkGetMaxParallelism, "dynamodb.chunk.get.max.parallelism", 32, "Max number of chunk-get operations to start in parallel")
 	f.DurationVar(&cfg.backoffConfig.MinBackoff, "dynamodb.min-backoff", 100*time.Millisecond, "Minimum backoff time")

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -24,7 +24,7 @@ func TestTableManagerMetricsAutoScaling(t *testing.T) {` |
| `24` | `24` | |
| `25` | `25` | `client := dynamoTableClient{` |
| `26` | `26` | `DynamoDB: dynamoDB,` |
| `27` | | `- metrics: &metricsData{promAPI: &mockProm},` |
| | `27` | `+ metrics: &metricsData{promAPI: &mockProm, queueLengthTarget: 100000},` |
| `28` | `28` | `}` |
| `29` | `29` | |
| `30` | `30` | `// Set up table-manager config` |