
Commit 1072572

fix instant query bypassing limit middleware due to vertical sharding (#5832)
* fix instant query bypassing QFE instant query limit middleware
* integration test for max query length
* fix lint
* update doc

Signed-off-by: Ben Ye <[email protected]>
1 parent 498635f commit 1072572

File tree (5 files changed: +112 −39 lines)

* docs/configuration/config-file-reference.md
* integration/query_frontend_test.go
* pkg/querier/tripperware/roundtrip.go
* pkg/querier/tripperware/roundtrip_test.go
* pkg/util/validation/limits.go

docs/configuration/config-file-reference.md

Lines changed: 3 additions & 3 deletions
@@ -3187,9 +3187,9 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
 # CLI flag: -querier.max-query-lookback
 [max_query_lookback: <duration> | default = 0s]
 
-# Limit the query time range (end - start time). This limit is enforced in the
-# query-frontend (on the received query) and in the querier (on the query
-# possibly split by the query-frontend). 0 to disable.
+# Limit the query time range (end - start time of range query parameter and max
+# - min of data fetched time range). This limit is enforced in the
+# query-frontend and ruler (on the received query). 0 to disable.
 # CLI flag: -store.max-query-length
 [max_query_length: <duration> | default = 0s]

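The updated wording distinguishes the range-query case (end - start of the request parameters) from the data an instant query actually fetches (for example, the 90d window selected by `rate(test[90d])` in the integration test below). As a rough illustration of that second notion, the sketch below walks a PromQL expression with the upstream Prometheus parser and reports its widest matrix-selector range; `longestSelectorRange` is a hypothetical helper for illustration only, not Cortex's implementation of the limit.

```go
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/prometheus/promql/parser"
)

// longestSelectorRange returns the widest matrix-selector range in a PromQL
// expression, e.g. 90d for `rate(test[90d])`. Illustrative only.
func longestSelectorRange(query string) (time.Duration, error) {
	expr, err := parser.ParseExpr(query)
	if err != nil {
		return 0, err
	}
	var longest time.Duration
	parser.Inspect(expr, func(node parser.Node, _ []parser.Node) error {
		if ms, ok := node.(*parser.MatrixSelector); ok && ms.Range > longest {
			longest = ms.Range
		}
		return nil
	})
	return longest, nil
}

func main() {
	d, err := longestSelectorRange(`rate(test[90d])`)
	if err != nil {
		panic(err)
	}
	fmt.Println(d) // 2160h0m0s, well past a 30d max_query_length
}
```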
integration/query_frontend_test.go

Lines changed: 67 additions & 0 deletions
@@ -530,3 +530,70 @@ func TestQueryFrontendNoRetryChunkPool(t *testing.T) {
 	// We shouldn't be able to see any retries.
 	require.NoError(t, queryFrontend.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_query_frontend_retries"}, e2e.WaitMissingMetrics))
 }
+
+func TestQueryFrontendMaxQueryLengthLimits(t *testing.T) {
+	const blockRangePeriod = 5 * time.Second
+
+	s, err := e2e.NewScenario(networkName)
+	require.NoError(t, err)
+	defer s.Close()
+
+	// Configure the blocks storage to frequently compact TSDB head
+	// and ship blocks to the storage.
+	flags := mergeFlags(BlocksStorageFlags(), map[string]string{
+		"-blocks-storage.tsdb.block-ranges-period": blockRangePeriod.String(),
+		"-blocks-storage.tsdb.ship-interval": "1s",
+		"-blocks-storage.tsdb.retention-period": ((blockRangePeriod * 2) - 1).String(),
+		"-blocks-storage.bucket-store.max-chunk-pool-bytes": "1",
+		"-store.max-query-length": "30d",
+	})
+
+	// Start dependencies.
+	consul := e2edb.NewConsul()
+	minio := e2edb.NewMinio(9000, flags["-blocks-storage.s3.bucket-name"])
+	require.NoError(t, s.StartAndWaitReady(consul, minio))
+
+	// Start Cortex components for the write path.
+	distributor := e2ecortex.NewDistributor("distributor", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+	ingester := e2ecortex.NewIngester("ingester", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+	require.NoError(t, s.StartAndWaitReady(distributor, ingester))
+
+	// Wait until the distributor has updated the ring.
+	require.NoError(t, distributor.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
+
+	queryFrontend := e2ecortex.NewQueryFrontendWithConfigFile("query-frontend", "", flags, "")
+	queryFrontendWithSharding := e2ecortex.NewQueryFrontendWithConfigFile("query-frontend-sharding", "", mergeFlags(flags, map[string]string{
+		"-frontend.query-vertical-shard-size": "2",
+	}), "")
+	require.NoError(t, s.Start(queryFrontend, queryFrontendWithSharding))
+
+	c, err := e2ecortex.NewClient("", queryFrontend.HTTPEndpoint(), "", "", "user-1")
+	require.NoError(t, err)
+	cSharding, err := e2ecortex.NewClient("", queryFrontendWithSharding.HTTPEndpoint(), "", "", "user-1")
+	require.NoError(t, err)
+
+	now := time.Now()
+	// We expect request to hit max query length limit.
+	resp, body, err := c.QueryRangeRaw(`rate(test[1m])`, now.Add(-90*time.Hour*24), now, 10*time.Hour)
+	require.NoError(t, err)
+	require.Equal(t, http.StatusBadRequest, resp.StatusCode)
+	require.Contains(t, string(body), "the query time range exceeds the limit")
+
+	// We expect request to hit max query length limit.
+	resp, body, err = cSharding.QueryRangeRaw(`rate(test[1m])`, now.Add(-90*time.Hour*24), now, 10*time.Hour)
+	require.NoError(t, err)
+	require.Equal(t, http.StatusBadRequest, resp.StatusCode)
+	require.Contains(t, string(body), "the query time range exceeds the limit")
+
+	// We expect request to hit max query length limit.
+	resp, body, err = c.QueryRaw(`rate(test[90d])`, now)
+	require.NoError(t, err)
+	require.Equal(t, http.StatusBadRequest, resp.StatusCode)
+	require.Contains(t, string(body), "the query time range exceeds the limit")
+
+	// We expect request to hit max query length limit.
+	resp, body, err = cSharding.QueryRaw(`rate(test[90d])`, now)
+	require.NoError(t, err)
+	require.Equal(t, http.StatusBadRequest, resp.StatusCode)
+	require.Contains(t, string(body), "the query time range exceeds the limit")
+}

pkg/querier/tripperware/roundtrip.go

Lines changed: 0 additions & 13 deletions
@@ -36,7 +36,6 @@ import (
 	"github.com/cortexproject/cortex/pkg/tenant"
 	"github.com/cortexproject/cortex/pkg/util"
 	util_log "github.com/cortexproject/cortex/pkg/util/log"
-	"github.com/cortexproject/cortex/pkg/util/validation"
 )
 
 // HandlerFunc is like http.HandlerFunc, but for Handler.
@@ -180,18 +179,6 @@ func NewQueryTripperware(
 			if isQueryRange {
 				return queryrange.RoundTrip(r)
 			} else if isQuery {
-				// If the given query is not shardable, use downstream roundtripper.
-				query := r.FormValue("query")
-
-				// If vertical sharding is not enabled for the tenant, use downstream roundtripper.
-				numShards := validation.SmallestPositiveIntPerTenant(tenantIDs, limits.QueryVerticalShardSize)
-				if numShards <= 1 {
-					return next.RoundTrip(r)
-				}
-				analysis, err := queryAnalyzer.Analyze(query)
-				if err != nil || !analysis.IsShardable() {
-					return next.RoundTrip(r)
-				}
 				return instantQuery.RoundTrip(r)
 			}
 			return next.RoundTrip(r)

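The deleted branch is the source of the bug in the commit title: when the tenant had vertical sharding disabled (numShards <= 1) or the instant query was not shardable, the frontend handed the request straight to the downstream round tripper, so the instant-query middleware chain — including the max query length check — never ran. After this change every /api/v1/query request takes the instantQuery path. Below is a minimal sketch of the resulting routing, with hypothetical parameter names standing in for the values the real NewQueryTripperware closure captures; it is an illustration, not the actual code.

```go
package tripperwaresketch

import "net/http"

// route sketches the post-fix decision only; the real logic lives inside
// NewQueryTripperware and takes these values from its closure.
func route(r *http.Request, isQueryRange, isQuery bool,
	queryRange, instantQuery, next http.RoundTripper) (*http.Response, error) {
	if isQueryRange {
		return queryRange.RoundTrip(r) // range-query middleware chain
	} else if isQuery {
		// Before this commit, non-shardable or non-vertically-sharded instant
		// queries returned next.RoundTrip(r) here, bypassing the instant-query
		// middlewares and the -store.max-query-length check they apply.
		return instantQuery.RoundTrip(r)
	}
	return next.RoundTrip(r) // other endpoints go straight to the downstream
}
```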
pkg/querier/tripperware/roundtrip_test.go

Lines changed: 41 additions & 22 deletions
@@ -7,10 +7,12 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"net/url"
+	"strings"
 	"testing"
 	"time"
 
 	"github.com/go-kit/log"
+	"github.com/prometheus/common/model"
 	"github.com/stretchr/testify/require"
 	"github.com/thanos-io/thanos/pkg/querysharding"
 	"github.com/weaveworks/common/httpgrpc"
@@ -22,12 +24,15 @@ import (
 
 const (
 	queryRange = "/api/v1/query_range?end=1536716898&query=sum%28container_memory_rss%29+by+%28namespace%29&start=1536673680&stats=all&step=120"
-	query = "/api/v1/query?time=1536716898&query=sum%28container_memory_rss%29+by+%28namespace%29&start=1536673680"
-	queryNonShardable = "/api/v1/query?time=1536716898&query=container_memory_rss&start=1536673680"
+	query = "/api/v1/query?time=1536716898&query=sum%28container_memory_rss%29+by+%28namespace%29"
+	queryNonShardable = "/api/v1/query?time=1536716898&query=container_memory_rss"
 	queryExemplar = "/api/v1/query_exemplars?query=test_exemplar_metric_total&start=2020-09-14T15:22:25.479Z&end=2020-09-14T15:23:25.479Z'"
 	querySubqueryStepSizeTooSmall = "/api/v1/query?query=up%5B30d%3A%5D"
+	queryExceedsMaxQueryLength = "/api/v1/query?query=up%5B90d%5D"
+	seriesQuery = "/api/v1/series?match[]"
 
-	responseBody = `{"status":"success","data":{"resultType":"matrix","result":[{"metric":{"foo":"bar"},"values":[[1536673680,"137"],[1536673780,"137"]]}]}}`
+	responseBody = `{"status":"success","data":{"resultType":"matrix","result":[{"metric":{"foo":"bar"},"values":[[1536673680,"137"],[1536673780,"137"]]}]}}`
+	instantResponseBody = `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"foo":"bar"},"values":[[1536673680,"137"],[1536673780,"137"]]}]}}`
 )
 
 type mockRequest struct {
@@ -45,9 +50,12 @@ type mockCodec struct {
 }
 
 func (c mockCodec) DecodeRequest(_ context.Context, r *http.Request, _ []string) (Request, error) {
-	if r.URL.String() == query || r.URL.String() == queryRange {
+	if strings.HasPrefix(r.URL.String(), "/api/v1/query_range") {
 		return &mockRequest{resp: responseBody}, nil
 	}
+	if strings.HasPrefix(r.URL.String(), "/api/v1/query") {
+		return &mockRequest{resp: instantResponseBody}, nil
+	}
 	return mockRequest{}, nil
 }
 
@@ -91,13 +99,20 @@ func TestRoundTrip(t *testing.T) {
 		next: http.DefaultTransport,
 	}
 
-	middlewares := []Middleware{
+	instantMiddlewares := []Middleware{
+		MiddlewareFunc(func(next Handler) Handler {
+			return mockMiddleware{}
+		}),
+	}
+	rangeMiddlewares := []Middleware{
 		MiddlewareFunc(func(next Handler) Handler {
 			return mockMiddleware{}
 		}),
 	}
 
-	limits := validation.Limits{}
+	limits := validation.Limits{
+		MaxQueryLength: model.Duration(time.Hour * 24 * 60),
+	}
 	flagext.DefaultValues(&limits)
 	defaultOverrides, err := validation.NewOverrides(limits, nil)
 	require.NoError(t, err)
@@ -124,46 +139,46 @@ func TestRoundTrip(t *testing.T) {
 			maxSubQuerySteps: 11000,
 		},
 		{
-			path: queryRange,
-			expectedBody: responseBody,
+			path: seriesQuery,
+			expectedBody: "bar",
 			limits: defaultOverrides,
 			maxSubQuerySteps: 11000,
 		},
 		{
-			path: query,
-			expectedBody: "bar",
+			path: queryRange,
+			expectedBody: responseBody,
 			limits: defaultOverrides,
 			maxSubQuerySteps: 11000,
 		},
 		{
-			path: queryNonShardable,
-			expectedBody: "bar",
+			path: query,
+			expectedBody: instantResponseBody,
 			limits: defaultOverrides,
 			maxSubQuerySteps: 11000,
 		},
 		{
 			path: queryNonShardable,
-			expectedBody: "bar",
-			limits: shardingOverrides,
+			expectedBody: instantResponseBody,
+			limits: defaultOverrides,
 			maxSubQuerySteps: 11000,
 		},
 		{
 			path: query,
-			expectedBody: responseBody,
+			expectedBody: instantResponseBody,
 			limits: shardingOverrides,
 			maxSubQuerySteps: 11000,
 		},
 		// Shouldn't hit subquery step limit because max steps is set to 0 so this check is disabled.
 		{
 			path: querySubqueryStepSizeTooSmall,
-			expectedBody: "bar",
+			expectedBody: instantResponseBody,
 			limits: defaultOverrides,
 			maxSubQuerySteps: 0,
 		},
 		// Shouldn't hit subquery step limit because max steps is higher, which is 100K.
 		{
 			path: querySubqueryStepSizeTooSmall,
-			expectedBody: "bar",
+			expectedBody: instantResponseBody,
 			limits: defaultOverrides,
 			maxSubQuerySteps: 100000,
 		},
@@ -173,11 +188,15 @@ func TestRoundTrip(t *testing.T) {
 			limits: defaultOverrides,
 			maxSubQuerySteps: 11000,
 		},
+		{
+			// The query should go to instant query middlewares rather than forwarding to next.
+			path: queryExceedsMaxQueryLength,
+			expectedBody: instantResponseBody,
+			limits: defaultOverrides,
+			maxSubQuerySteps: 11000,
+		},
 	} {
 		t.Run(tc.path, func(t *testing.T) {
-			if tc.path != querySubqueryStepSizeTooSmall {
-				return
-			}
 			//parallel testing causes data race
 			req, err := http.NewRequest("GET", tc.path, http.NoBody)
 			require.NoError(t, err)
@@ -193,8 +212,8 @@ func TestRoundTrip(t *testing.T) {
 			tw := NewQueryTripperware(log.NewNopLogger(),
 				nil,
 				nil,
-				middlewares,
-				middlewares,
+				rangeMiddlewares,
+				instantMiddlewares,
 				mockCodec{},
 				mockCodec{},
 				tc.limits,

pkg/util/validation/limits.go

Lines changed: 1 addition & 1 deletion
@@ -209,7 +209,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
 	f.IntVar(&l.MaxFetchedSeriesPerQuery, "querier.max-fetched-series-per-query", 0, "The maximum number of unique series for which a query can fetch samples from each ingesters and blocks storage. This limit is enforced in the querier, ruler and store-gateway. 0 to disable")
 	f.IntVar(&l.MaxFetchedChunkBytesPerQuery, "querier.max-fetched-chunk-bytes-per-query", 0, "Deprecated (use max-fetched-data-bytes-per-query instead): The maximum size of all chunks in bytes that a query can fetch from each ingester and storage. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.")
 	f.IntVar(&l.MaxFetchedDataBytesPerQuery, "querier.max-fetched-data-bytes-per-query", 0, "The maximum combined size of all data that a query can fetch from each ingester and storage. This limit is enforced in the querier and ruler for `query`, `query_range` and `series` APIs. 0 to disable.")
-	f.Var(&l.MaxQueryLength, "store.max-query-length", "Limit the query time range (end - start time). This limit is enforced in the query-frontend (on the received query) and in the querier (on the query possibly split by the query-frontend). 0 to disable.")
+	f.Var(&l.MaxQueryLength, "store.max-query-length", "Limit the query time range (end - start time of range query parameter and max - min of data fetched time range). This limit is enforced in the query-frontend and ruler (on the received query). 0 to disable.")
 	f.Var(&l.MaxQueryLookback, "querier.max-query-lookback", "Limit how long back data (series and metadata) can be queried, up until <lookback> duration ago. This limit is enforced in the query-frontend, querier and ruler. If the requested time range is outside the allowed range, the request will not fail but will be manipulated to only query data within the allowed time range. 0 to disable.")
 	f.IntVar(&l.MaxQueryParallelism, "querier.max-query-parallelism", 14, "Maximum number of split queries will be scheduled in parallel by the frontend.")
 	_ = l.MaxCacheFreshness.Set("1m")

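The revised flag help matches what the integration test above asserts: when the received query spans more than -store.max-query-length, the query-frontend answers with HTTP 400 and a message containing "the query time range exceeds the limit". Below is a hedged sketch of that kind of check; the names are illustrative and the real enforcement lives in Cortex's query-frontend limits middleware, not in this file.

```go
package limitssketch

import (
	"net/http"
	"time"

	"github.com/weaveworks/common/httpgrpc"
)

// checkMaxQueryLength is an illustrative stand-in for the query-frontend's
// range-query length check: end - start must not exceed the per-tenant limit.
func checkMaxQueryLength(start, end time.Time, maxQueryLength time.Duration) error {
	if maxQueryLength <= 0 {
		return nil // 0 disables the limit
	}
	if length := end.Sub(start); length > maxQueryLength {
		return httpgrpc.Errorf(http.StatusBadRequest,
			"the query time range exceeds the limit (query length: %v, limit: %v)", length, maxQueryLength)
	}
	return nil
}
```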