From 7b82ea49695052c7c5242255e02a744ff183fe95 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 24 Feb 2023 13:39:46 -0600 Subject: [PATCH 1/8] Count error_ids with duplicates --- src/sentry/replays/query.py | 4 ++-- tests/sentry/replays/test_organization_replay_index.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/sentry/replays/query.py b/src/sentry/replays/query.py index 128a0b810efbab..253648561a6cac 100644 --- a/src/sentry/replays/query.py +++ b/src/sentry/replays/query.py @@ -637,8 +637,8 @@ def _activity_score(): ), "count_segments": Function("count", parameters=[Column("segment_id")], alias="count_segments"), "count_errors": Function( - "uniqArray", - parameters=[Column("error_ids")], + "sum", + parameters=[Function("length", parameters=[Column("error_ids")])], alias="count_errors", ), "count_urls": Function( diff --git a/tests/sentry/replays/test_organization_replay_index.py b/tests/sentry/replays/test_organization_replay_index.py index 8462a4ef1cb096..5416746f444cc3 100644 --- a/tests/sentry/replays/test_organization_replay_index.py +++ b/tests/sentry/replays/test_organization_replay_index.py @@ -92,9 +92,9 @@ def test_get_replays(self): ], count_segments=2, # count_errors=3, - count_errors=1, + count_errors=2, tags={"test": ["hello", "world"], "other": ["hello"]}, - activity=4, + activity=6, ) assert_expected_response(response_data["data"][0], expected_response) @@ -502,7 +502,7 @@ def test_get_replays_user_filters(self): "!c:*zz", "urls:example.com", "url:example.com", - "activity:3", + "activity:5", "activity:>2", ] @@ -546,7 +546,7 @@ def test_get_replays_user_filters(self): "release:[a,b]", "c:*zz", "!c:*st", - "!activity:3", + "!activity:5", "activity:<2", ] for query in null_queries: From 3115aa45836aeeec49be8ea1484922b83b7e386f Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 24 Feb 2023 13:44:10 -0600 Subject: [PATCH 2/8] Update details coverage --- tests/sentry/replays/test_project_replay_details.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/sentry/replays/test_project_replay_details.py b/tests/sentry/replays/test_project_replay_details.py index 86fb56395fe7ff..e14a2c4f31c0eb 100644 --- a/tests/sentry/replays/test_project_replay_details.py +++ b/tests/sentry/replays/test_project_replay_details.py @@ -144,7 +144,8 @@ def test_get_replay_schema(self): "http://localhost:3000/", ], count_segments=3, - activity=4, + count_errors=3, + activity=9, ) assert_expected_response(response_data["data"], expected_response) From a09a85e846b6a929590e558b69f082e0004929e7 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Fri, 24 Feb 2023 13:56:45 -0600 Subject: [PATCH 3/8] Remove dependency on urls_sorted within activity calculation --- src/sentry/replays/query.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/src/sentry/replays/query.py b/src/sentry/replays/query.py index 253648561a6cac..6eeb971e7320aa 100644 --- a/src/sentry/replays/query.py +++ b/src/sentry/replays/query.py @@ -467,20 +467,8 @@ def _activity_score(): # score = (count_errors * 25 + pagesVisited * 5 ) / 10; # score = Math.floor(Math.min(10, Math.max(1, score))); - error_weight = Function( - "multiply", - parameters=[Column("count_errors"), 25], - ) - pages_visited_weight = Function( - "multiply", - parameters=[ - Function( - "length", - parameters=[Column("urls_sorted")], - ), - 5, - ], - ) + error_weight = Function("multiply", parameters=[Column("count_errors"), 25]) + pages_visited_weight = Function("multiply", parameters=[Column("count_urls"), 5]) combined_weight = Function( "plus", @@ -549,10 +537,10 @@ def _activity_score(): "urls": ["urls_sorted", "agg_urls"], "url": ["urls_sorted", "agg_urls"], "count_errors": ["count_errors"], - "count_urls": ["count_urls", "urls_sorted", "agg_urls"], + "count_urls": ["count_urls"], "count_segments": ["count_segments"], "is_archived": ["is_archived"], - "activity": ["activity", "count_errors", "urls_sorted", "agg_urls"], + "activity": ["activity", "count_errors", "count_urls"], "user": ["user_id", "user_email", "user_name", "user_ip"], "os": ["os_name", "os_version"], "browser": ["browser_name", "browser_version"], @@ -642,8 +630,8 @@ def _activity_score(): alias="count_errors", ), "count_urls": Function( - "length", - parameters=[Column("urls_sorted")], + "sum", + parameters=[Function("length", parameters=[Column("urls")])], alias="count_urls", ), "is_archived": Function( From 27c3cb60f45d7ed8fb29dbba781b6512a822df4d Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Tue, 28 Feb 2023 08:37:41 -0600 Subject: [PATCH 4/8] Group by project_id --- src/sentry/replays/post_process.py | 2 +- src/sentry/replays/query.py | 12 ++++-------- src/sentry/replays/serializers.py | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/sentry/replays/post_process.py b/src/sentry/replays/post_process.py index 51b59d44af4f8d..008cfa1e524a5c 100644 --- a/src/sentry/replays/post_process.py +++ b/src/sentry/replays/post_process.py @@ -25,7 +25,7 @@ def generate_normalized_output( """For each payload in the response strip "agg_" prefixes.""" for item in response: item["id"] = item.pop("replay_id", None) - item["project_id"] = item.pop("projectId", None) + item["project_id"] = str(item["project_id"]) item["trace_ids"] = item.pop("traceIds", []) item["error_ids"] = item.pop("errorIds", []) item["environment"] = item.pop("agg_environment", None) diff --git a/src/sentry/replays/query.py b/src/sentry/replays/query.py index 6eeb971e7320aa..8d4b8606009c90 100644 --- a/src/sentry/replays/query.py +++ b/src/sentry/replays/query.py @@ -153,7 +153,7 @@ def query_replays_dataset( *having, ], orderby=sorting, - groupby=[Column("replay_id")], + groupby=[Column("project_id"), Column("replay_id")], granularity=Granularity(3600), **query_options, ), @@ -421,8 +421,8 @@ class ReplayQueryConfig(QueryConfig): started_at = String(is_filterable=False) finished_at = String(is_filterable=False) # Dedicated url parameter should be used. - project_id = String(query_alias="projectId", is_filterable=False) - project = String(query_alias="projectId", is_filterable=False) + project_id = String(query_alias="project_id", is_filterable=False) + project = String(query_alias="project_id", is_filterable=False) # Pagination. @@ -571,11 +571,7 @@ def _activity_score(): QUERY_ALIAS_COLUMN_MAP = { "replay_id": _strip_uuid_dashes("replay_id", Column("replay_id")), "replay_type": _grouped_unique_scalar_value(column_name="replay_type", alias="replay_type"), - "project_id": Function( - "toString", - parameters=[_grouped_unique_scalar_value(column_name="project_id", alias="agg_pid")], - alias="projectId", - ), + "project_id": Column("project_id"), "platform": _grouped_unique_scalar_value(column_name="platform"), "agg_environment": _grouped_unique_scalar_value( column_name="environment", alias="agg_environment" diff --git a/src/sentry/replays/serializers.py b/src/sentry/replays/serializers.py index 60aeecaa666b6f..380d6c14e83b4a 100644 --- a/src/sentry/replays/serializers.py +++ b/src/sentry/replays/serializers.py @@ -17,7 +17,7 @@ def serialize(self, obj: ReplayRecordingSegment, attrs, user): VALID_FIELD_SET = { "id", "title", - "projectId", + "project_id", "errorIds", "traceIds", "urls", From aa366aa67165d6f4074d8e5e2ef848409fd7c259 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Tue, 28 Feb 2023 09:18:13 -0600 Subject: [PATCH 5/8] Use an efficient, limited grouping clause to prevent unnecessary memory use --- src/sentry/replays/query.py | 63 ++++++++++--------- .../replays/test_organization_replay_index.py | 4 ++ 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/src/sentry/replays/query.py b/src/sentry/replays/query.py index 8d4b8606009c90..3466b77eaf25f6 100644 --- a/src/sentry/replays/query.py +++ b/src/sentry/replays/query.py @@ -7,6 +7,7 @@ from snuba_sdk import ( Column, Condition, + CurriedFunction, Entity, Function, Granularity, @@ -308,17 +309,22 @@ def _grouped_unique_values( ) -def _grouped_unique_scalar_value( - column_name: str, alias: Optional[str] = None, aliased: bool = True +def take_first_from_aggregation( + column_name: str, + alias: Optional[str] = None, + aliased: bool = True, ) -> Function: - """Returns the first value of a unique array. + """Returns the first value encountered in an aggregated array. E.g. - [1, 2, 2, 3, 3, 3, null] => [1, 2, 3] => 1 + [1, 2, 2, 3, 3, 3, null] => [1] => 1 """ return Function( "arrayElement", - parameters=[_grouped_unique_values(column_name), 1], + parameters=[ + CurriedFunction("groupArray", initializers=[1], parameters=[Column(column_name)]), + 1, + ], alias=alias or column_name if aliased else None, ) @@ -570,14 +576,7 @@ def _activity_score(): QUERY_ALIAS_COLUMN_MAP = { "replay_id": _strip_uuid_dashes("replay_id", Column("replay_id")), - "replay_type": _grouped_unique_scalar_value(column_name="replay_type", alias="replay_type"), "project_id": Column("project_id"), - "platform": _grouped_unique_scalar_value(column_name="platform"), - "agg_environment": _grouped_unique_scalar_value( - column_name="environment", alias="agg_environment" - ), - "releases": _grouped_unique_values(column_name="release", alias="releases", aliased=True), - "dist": _grouped_unique_scalar_value(column_name="dist"), "trace_ids": Function( "arrayMap", parameters=[ @@ -636,29 +635,31 @@ def _activity_score(): alias="isArchived", ), "activity": _activity_score(), - "user_id": _grouped_unique_scalar_value(column_name="user_id"), - "user_email": _grouped_unique_scalar_value(column_name="user_email"), - "user_name": _grouped_unique_scalar_value(column_name="user_name"), + "releases": _grouped_unique_values(column_name="release", alias="releases", aliased=True), + "replay_type": take_first_from_aggregation(column_name="replay_type", alias="replay_type"), + "platform": take_first_from_aggregation(column_name="platform"), + "agg_environment": take_first_from_aggregation( + column_name="environment", alias="agg_environment" + ), + "dist": take_first_from_aggregation(column_name="dist"), + "user_id": take_first_from_aggregation(column_name="user_id"), + "user_email": take_first_from_aggregation(column_name="user_email"), + "user_name": take_first_from_aggregation(column_name="user_name"), "user_ip": Function( "IPv4NumToString", - parameters=[ - _grouped_unique_scalar_value( - column_name="ip_address_v4", - aliased=False, - ) - ], + parameters=[take_first_from_aggregation(column_name="ip_address_v4", aliased=False)], alias="user_ip", ), - "os_name": _grouped_unique_scalar_value(column_name="os_name"), - "os_version": _grouped_unique_scalar_value(column_name="os_version"), - "browser_name": _grouped_unique_scalar_value(column_name="browser_name"), - "browser_version": _grouped_unique_scalar_value(column_name="browser_version"), - "device_name": _grouped_unique_scalar_value(column_name="device_name"), - "device_brand": _grouped_unique_scalar_value(column_name="device_brand"), - "device_family": _grouped_unique_scalar_value(column_name="device_family"), - "device_model": _grouped_unique_scalar_value(column_name="device_model"), - "sdk_name": _grouped_unique_scalar_value(column_name="sdk_name"), - "sdk_version": _grouped_unique_scalar_value(column_name="sdk_version"), + "os_name": take_first_from_aggregation(column_name="os_name"), + "os_version": take_first_from_aggregation(column_name="os_version"), + "browser_name": take_first_from_aggregation(column_name="browser_name"), + "browser_version": take_first_from_aggregation(column_name="browser_version"), + "device_name": take_first_from_aggregation(column_name="device_name"), + "device_brand": take_first_from_aggregation(column_name="device_brand"), + "device_family": take_first_from_aggregation(column_name="device_family"), + "device_model": take_first_from_aggregation(column_name="device_model"), + "sdk_name": take_first_from_aggregation(column_name="sdk_name"), + "sdk_version": take_first_from_aggregation(column_name="sdk_version"), "tk": Function("groupArrayArray", parameters=[Column("tags.key")], alias="tk"), "tv": Function("groupArrayArray", parameters=[Column("tags.value")], alias="tv"), } diff --git a/tests/sentry/replays/test_organization_replay_index.py b/tests/sentry/replays/test_organization_replay_index.py index 5416746f444cc3..bdb36c30462237 100644 --- a/tests/sentry/replays/test_organization_replay_index.py +++ b/tests/sentry/replays/test_organization_replay_index.py @@ -434,6 +434,10 @@ def test_get_replays_user_filters(self): seq2_timestamp, project.id, replay1_id, + user_id=None, + user_name=None, + user_email=None, + ipv4=None, os_name=None, os_version=None, browser_name=None, From 06f5bedc719135711b6b97694f624c3d70d0a54f Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Tue, 28 Feb 2023 12:28:11 -0600 Subject: [PATCH 6/8] Remove duplicate errors from mocks --- .../replays/test_organization_replay_index.py | 13 ++++++++----- tests/sentry/replays/test_project_replay_details.py | 6 ++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/sentry/replays/test_organization_replay_index.py b/tests/sentry/replays/test_organization_replay_index.py index bdb36c30462237..13019e5e0c443b 100644 --- a/tests/sentry/replays/test_organization_replay_index.py +++ b/tests/sentry/replays/test_organization_replay_index.py @@ -68,6 +68,7 @@ def test_get_replays(self): # error_ids=[uuid.uuid4().hex, replay1_id], # duplicate error-id urls=["http://localhost:3000/"], # duplicate urls are okay tags={"test": "world", "other": "hello"}, + error_ids=[], ) ) @@ -92,9 +93,9 @@ def test_get_replays(self): ], count_segments=2, # count_errors=3, - count_errors=2, + count_errors=1, tags={"test": ["hello", "world"], "other": ["hello"]}, - activity=6, + activity=4, ) assert_expected_response(response_data["data"][0], expected_response) @@ -427,6 +428,7 @@ def test_get_replays_user_filters(self): device_model="10", tags={"a": "m", "b": "q", "c": "test"}, urls=["example.com"], + error_ids=["a3a62ef6-ac86-415b-83c2-416fc2f76db1"], ) ) self.store_replays( @@ -447,6 +449,7 @@ def test_get_replays_user_filters(self): device_family=None, device_model=None, tags={"a": "n", "b": "o"}, + error_ids=[], ) ) @@ -506,12 +509,12 @@ def test_get_replays_user_filters(self): "!c:*zz", "urls:example.com", "url:example.com", - "activity:5", + "activity:3", "activity:>2", ] for query in queries: - response = self.client.get(self.url + f"?field=id&query={query}") + response = self.client.get(self.url + f"?query={query}") assert response.status_code == 200, query response_data = response.json() assert len(response_data["data"]) == 1, query @@ -550,7 +553,7 @@ def test_get_replays_user_filters(self): "release:[a,b]", "c:*zz", "!c:*st", - "!activity:5", + "!activity:3", "activity:<2", ] for query in null_queries: diff --git a/tests/sentry/replays/test_project_replay_details.py b/tests/sentry/replays/test_project_replay_details.py index e14a2c4f31c0eb..d99dee50d5f3e2 100644 --- a/tests/sentry/replays/test_project_replay_details.py +++ b/tests/sentry/replays/test_project_replay_details.py @@ -109,6 +109,7 @@ def test_get_replay_schema(self): segment_id=1, trace_ids=[trace_id_2], urls=["http://www.sentry.io/"], + error_ids=[], ) ) self.store_replays( @@ -119,6 +120,7 @@ def test_get_replay_schema(self): segment_id=2, trace_ids=[trace_id_2], urls=["http://localhost:3000/"], + error_ids=[], ) ) @@ -144,8 +146,8 @@ def test_get_replay_schema(self): "http://localhost:3000/", ], count_segments=3, - count_errors=3, - activity=9, + count_errors=1, + activity=4, ) assert_expected_response(response_data["data"], expected_response) From b57c44da8b8b22300972e08ba5d5aefa51e0ab9e Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Tue, 28 Feb 2023 12:29:05 -0600 Subject: [PATCH 7/8] Fix bug where array fields were not querying aggregated fields --- src/sentry/replays/query.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/sentry/replays/query.py b/src/sentry/replays/query.py index 3466b77eaf25f6..176d81716ed500 100644 --- a/src/sentry/replays/query.py +++ b/src/sentry/replays/query.py @@ -389,11 +389,11 @@ class ReplayQueryConfig(QueryConfig): releases = ListField() release = ListField(query_alias="releases") dist = String() - error_ids = ListField(query_alias="error_ids", is_uuid=True) - error_id = ListField(query_alias="error_ids", is_uuid=True) - trace_ids = ListField(query_alias="trace_ids", is_uuid=True) - trace_id = ListField(query_alias="trace_ids", is_uuid=True) - trace = ListField(query_alias="trace_ids", is_uuid=True) + error_ids = ListField(query_alias="errorIds") + error_id = ListField(query_alias="errorIds") + trace_ids = ListField(query_alias="traceIds") + trace_id = ListField(query_alias="traceIds") + trace = ListField(query_alias="traceIds") urls = ListField(query_alias="urls_sorted") url = ListField(query_alias="urls_sorted") user_id = String(field_alias="user.id", query_alias="user_id") From e5b4027b95b2218557b880765c3cd8e1a5099517 Mon Sep 17 00:00:00 2001 From: Colton Allen Date: Tue, 28 Feb 2023 12:53:14 -0600 Subject: [PATCH 8/8] Re-add id marshaling --- tests/sentry/replays/test_organization_replay_index.py | 3 +-- tests/sentry/replays/test_project_replay_details.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/sentry/replays/test_organization_replay_index.py b/tests/sentry/replays/test_organization_replay_index.py index 13019e5e0c443b..69fe424424ad22 100644 --- a/tests/sentry/replays/test_organization_replay_index.py +++ b/tests/sentry/replays/test_organization_replay_index.py @@ -428,7 +428,6 @@ def test_get_replays_user_filters(self): device_model="10", tags={"a": "m", "b": "q", "c": "test"}, urls=["example.com"], - error_ids=["a3a62ef6-ac86-415b-83c2-416fc2f76db1"], ) ) self.store_replays( @@ -514,7 +513,7 @@ def test_get_replays_user_filters(self): ] for query in queries: - response = self.client.get(self.url + f"?query={query}") + response = self.client.get(self.url + f"?field=id&query={query}") assert response.status_code == 200, query response_data = response.json() assert len(response_data["data"]) == 1, query diff --git a/tests/sentry/replays/test_project_replay_details.py b/tests/sentry/replays/test_project_replay_details.py index d99dee50d5f3e2..11d62f780010aa 100644 --- a/tests/sentry/replays/test_project_replay_details.py +++ b/tests/sentry/replays/test_project_replay_details.py @@ -146,7 +146,6 @@ def test_get_replay_schema(self): "http://localhost:3000/", ], count_segments=3, - count_errors=1, activity=4, ) assert_expected_response(response_data["data"], expected_response)