From 74e9c0804021e87276f9519ccd94a86e9d51242b Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Tue, 15 Nov 2022 15:24:49 +0100 Subject: [PATCH 01/22] Strip sensitive data from URLs. refs 1742 --- sentry_sdk/integrations/httpx.py | 8 +++-- sentry_sdk/integrations/stdlib.py | 10 ++++-- sentry_sdk/utils.py | 38 ++++++++++++++++++++++ tests/test_utils.py | 53 +++++++++++++++++++++++++++++++ 4 files changed, 104 insertions(+), 5 deletions(-) create mode 100644 tests/test_utils.py diff --git a/sentry_sdk/integrations/httpx.py b/sentry_sdk/integrations/httpx.py index 2e9142d2b8..aec080a9d8 100644 --- a/sentry_sdk/integrations/httpx.py +++ b/sentry_sdk/integrations/httpx.py @@ -1,7 +1,7 @@ from sentry_sdk import Hub from sentry_sdk.consts import OP from sentry_sdk.integrations import Integration, DidNotEnable -from sentry_sdk.utils import logger +from sentry_sdk.utils import logger, parameterize_url from sentry_sdk._types import MYPY @@ -42,7 +42,8 @@ def send(self, request, **kwargs): return real_send(self, request, **kwargs) with hub.start_span( - op=OP.HTTP_CLIENT, description="%s %s" % (request.method, request.url) + op=OP.HTTP_CLIENT, + description="%s %s" % (request.method, parameterize_url(request.url)), ) as span: span.set_data("method", request.method) span.set_data("url", str(request.url)) @@ -74,7 +75,8 @@ async def send(self, request, **kwargs): return await real_send(self, request, **kwargs) with hub.start_span( - op=OP.HTTP_CLIENT, description="%s %s" % (request.method, request.url) + op=OP.HTTP_CLIENT, + description="%s %s" % (request.method, parameterize_url(request.url)), ) as span: span.set_data("method", request.method) span.set_data("url", str(request.url)) diff --git a/sentry_sdk/integrations/stdlib.py b/sentry_sdk/integrations/stdlib.py index 3b81b6c2c5..2d77f79716 100644 --- a/sentry_sdk/integrations/stdlib.py +++ b/sentry_sdk/integrations/stdlib.py @@ -8,7 +8,12 @@ from sentry_sdk.integrations import Integration from sentry_sdk.scope import 
add_global_event_processor from sentry_sdk.tracing_utils import EnvironHeaders -from sentry_sdk.utils import capture_internal_exceptions, logger, safe_repr +from sentry_sdk.utils import ( + capture_internal_exceptions, + logger, + parameterize_url, + safe_repr, +) from sentry_sdk._types import MYPY @@ -80,7 +85,8 @@ def putrequest(self, method, url, *args, **kwargs): ) span = hub.start_span( - op=OP.HTTP_CLIENT, description="%s %s" % (method, real_url) + op=OP.HTTP_CLIENT, + description="%s %s" % (method, parameterize_url(real_url)), ) span.set_data("method", method) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index c000a3bd2c..eb1fce51c8 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -8,6 +8,8 @@ import subprocess import re import time +from collections import namedtuple +from urllib import parse from datetime import datetime @@ -37,6 +39,11 @@ epoch = datetime(1970, 1, 1) +Components = namedtuple( + typename="Components", field_names=["scheme", "netloc", "path", "query", "fragment"] +) + + # The logger is created here but initialized in the debug support module logger = logging.getLogger("sentry_sdk.errors") @@ -1086,6 +1093,37 @@ def from_base64(base64_string): return utf8_string +def parameterize_url(url): + # type: (str) -> str + """ + Removes all query parameter values and username:password from a given URL. 
+ """ + parsed_url = parse.urlsplit(url) + query_params = parse.parse_qs(parsed_url.query, keep_blank_values=True) + + # strip username:password (netloc can be usr:pwd@example.com) + netloc_parts = parsed_url.netloc.split("@") + if len(netloc_parts) > 1: + netloc = "%s:%s@" + netloc_parts[-1] + else: + netloc = parsed_url.netloc + + # strip values from query string + query_string = parse.unquote(parse.urlencode({key: "%s" for key in query_params})) + + safe_url = parse.urlunsplit( + Components( + scheme=parsed_url.scheme, + netloc=netloc, + query=query_string, + path=parsed_url.path, + fragment=parsed_url.fragment, + ) + ) + + return safe_url + + if PY37: def nanosecond_time(): diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000000..62b3185d3c --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,53 @@ +import pytest + +from sentry_sdk.utils import parameterize_url + + +@pytest.mark.parametrize( + ("url", "expected_result"), + [ + ("http://localhost:8000", "http://localhost:8000"), + ("http://example.com", "http://example.com"), + ("https://example.com", "https://example.com"), + ( + "example.com?token=abc&sessionid=123&save=true", + "example.com?token=%s&sessionid=%s&save=%s", + ), + ( + "http://example.com?token=abc&sessionid=123&save=true", + "http://example.com?token=%s&sessionid=%s&save=%s", + ), + ( + "https://example.com?token=abc&sessionid=123&save=true", + "https://example.com?token=%s&sessionid=%s&save=%s", + ), + ( + "http://localhost:8000/?token=abc&sessionid=123&save=true", + "http://localhost:8000/?token=%s&sessionid=%s&save=%s", + ), + ( + "ftp://username:password@ftp.example.com:9876/bla/blub#foo", + "ftp://%s:%s@ftp.example.com:9876/bla/blub#foo", + ), + ( + "https://username:password@example.com/bla/blub?token=abc&sessionid=123&save=true#fragment", + "https://%s:%s@example.com/bla/blub?token=%s&sessionid=%s&save=%s#fragment", + ), + ( + "http://example.com/bla?üsername=ada&pwd=häöüß", + 
"http://example.com/bla?üsername=%s&pwd=%s", + ), + ("bla/blub/foo", "bla/blub/foo"), + ("/bla/blub/foo/", "/bla/blub/foo/"), + ( + "bla/blub/foo?token=abc&sessionid=123&save=true", + "bla/blub/foo?token=%s&sessionid=%s&save=%s", + ), + ( + "/bla/blub/foo/?token=abc&sessionid=123&save=true", + "/bla/blub/foo/?token=%s&sessionid=%s&save=%s", + ), + ], +) +def test_parameterize_url(url, expected_result): + assert parameterize_url(url) == expected_result From 71d5e4a48477c49ce16d79cdea231dc9fc2a908c Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Tue, 15 Nov 2022 15:44:49 +0100 Subject: [PATCH 02/22] Better function name --- sentry_sdk/integrations/httpx.py | 6 +++--- sentry_sdk/integrations/stdlib.py | 4 ++-- sentry_sdk/utils.py | 2 +- tests/test_utils.py | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sentry_sdk/integrations/httpx.py b/sentry_sdk/integrations/httpx.py index aec080a9d8..c5b77fb5a4 100644 --- a/sentry_sdk/integrations/httpx.py +++ b/sentry_sdk/integrations/httpx.py @@ -1,7 +1,7 @@ from sentry_sdk import Hub from sentry_sdk.consts import OP from sentry_sdk.integrations import Integration, DidNotEnable -from sentry_sdk.utils import logger, parameterize_url +from sentry_sdk.utils import logger, sanitize_url from sentry_sdk._types import MYPY @@ -43,7 +43,7 @@ def send(self, request, **kwargs): with hub.start_span( op=OP.HTTP_CLIENT, - description="%s %s" % (request.method, parameterize_url(request.url)), + description="%s %s" % (request.method, sanitize_url(request.url)), ) as span: span.set_data("method", request.method) span.set_data("url", str(request.url)) @@ -76,7 +76,7 @@ async def send(self, request, **kwargs): with hub.start_span( op=OP.HTTP_CLIENT, - description="%s %s" % (request.method, parameterize_url(request.url)), + description="%s %s" % (request.method, sanitize_url(request.url)), ) as span: span.set_data("method", request.method) span.set_data("url", str(request.url)) diff --git 
a/sentry_sdk/integrations/stdlib.py b/sentry_sdk/integrations/stdlib.py index 2d77f79716..366fee80ec 100644 --- a/sentry_sdk/integrations/stdlib.py +++ b/sentry_sdk/integrations/stdlib.py @@ -11,7 +11,7 @@ from sentry_sdk.utils import ( capture_internal_exceptions, logger, - parameterize_url, + sanitize_url, safe_repr, ) @@ -86,7 +86,7 @@ def putrequest(self, method, url, *args, **kwargs): span = hub.start_span( op=OP.HTTP_CLIENT, - description="%s %s" % (method, parameterize_url(real_url)), + description="%s %s" % (method, sanitize_url(real_url)), ) span.set_data("method", method) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index eb1fce51c8..7e4de50c15 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -1093,7 +1093,7 @@ def from_base64(base64_string): return utf8_string -def parameterize_url(url): +def sanitize_url(url): # type: (str) -> str """ Removes all query parameter values and username:password from a given URL. diff --git a/tests/test_utils.py b/tests/test_utils.py index 62b3185d3c..e06848ef2f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,6 @@ import pytest -from sentry_sdk.utils import parameterize_url +from sentry_sdk.utils import sanitize_url @pytest.mark.parametrize( @@ -49,5 +49,5 @@ ), ], ) -def test_parameterize_url(url, expected_result): - assert parameterize_url(url) == expected_result +def test_sanitize_url(url, expected_result): + assert sanitize_url(url) == expected_result From 743c3d1acc68a94278d4d7c9e74ecae7fa206869 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Tue, 15 Nov 2022 15:48:31 +0100 Subject: [PATCH 03/22] Check send_default_pii before sanitizing url. 
--- sentry_sdk/integrations/httpx.py | 17 +++++++++++++++-- sentry_sdk/integrations/stdlib.py | 8 ++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/sentry_sdk/integrations/httpx.py b/sentry_sdk/integrations/httpx.py index c5b77fb5a4..4365a77d8a 100644 --- a/sentry_sdk/integrations/httpx.py +++ b/sentry_sdk/integrations/httpx.py @@ -1,5 +1,6 @@ from sentry_sdk import Hub from sentry_sdk.consts import OP +from sentry_sdk.hub import _should_send_default_pii from sentry_sdk.integrations import Integration, DidNotEnable from sentry_sdk.utils import logger, sanitize_url @@ -43,7 +44,13 @@ def send(self, request, **kwargs): with hub.start_span( op=OP.HTTP_CLIENT, - description="%s %s" % (request.method, sanitize_url(request.url)), + description="%s %s" + % ( + request.method, + request.url + if _should_send_default_pii() + else sanitize_url(request.url), + ), ) as span: span.set_data("method", request.method) span.set_data("url", str(request.url)) @@ -76,7 +83,13 @@ async def send(self, request, **kwargs): with hub.start_span( op=OP.HTTP_CLIENT, - description="%s %s" % (request.method, sanitize_url(request.url)), + description="%s %s" + % ( + request.method, + request.url + if _should_send_default_pii() + else sanitize_url(request.url), + ), ) as span: span.set_data("method", request.method) span.set_data("url", str(request.url)) diff --git a/sentry_sdk/integrations/stdlib.py b/sentry_sdk/integrations/stdlib.py index 366fee80ec..1d90933acc 100644 --- a/sentry_sdk/integrations/stdlib.py +++ b/sentry_sdk/integrations/stdlib.py @@ -4,7 +4,7 @@ import platform from sentry_sdk.consts import OP -from sentry_sdk.hub import Hub +from sentry_sdk.hub import Hub, _should_send_default_pii from sentry_sdk.integrations import Integration from sentry_sdk.scope import add_global_event_processor from sentry_sdk.tracing_utils import EnvironHeaders @@ -86,7 +86,11 @@ def putrequest(self, method, url, *args, **kwargs): span = hub.start_span( op=OP.HTTP_CLIENT, - 
description="%s %s" % (method, sanitize_url(real_url)), + description="%s %s" + % ( + method, + real_url if _should_send_default_pii() else sanitize_url(real_url), + ), ) span.set_data("method", method) From 056089150feb07707ca5f631acba096c2a9dec50 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Tue, 15 Nov 2022 16:38:54 +0100 Subject: [PATCH 04/22] Ignore typing on named tuples --- sentry_sdk/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index 7e4de50c15..795e6423a2 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -39,7 +39,7 @@ epoch = datetime(1970, 1, 1) -Components = namedtuple( +Components = namedtuple( # type: ignore typename="Components", field_names=["scheme", "netloc", "path", "query", "fragment"] ) @@ -1112,7 +1112,7 @@ def sanitize_url(url): query_string = parse.unquote(parse.urlencode({key: "%s" for key in query_params})) safe_url = parse.urlunsplit( - Components( + Components( # type: ignore scheme=parsed_url.scheme, netloc=netloc, query=query_string, From 5d250637f05b6879056473c90d78996eef128648 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Tue, 15 Nov 2022 16:56:37 +0100 Subject: [PATCH 05/22] Make it run in Python 2 --- sentry_sdk/utils.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index 795e6423a2..ea2ca34fb5 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -9,7 +9,23 @@ import re import time from collections import namedtuple -from urllib import parse + +try: + # Python 3 + from urllib.parse import parse_qs + from urllib.parse import unquote + from urllib.parse import urlencode + from urllib.parse import urlsplit + from urllib.parse import urlunsplit + +except ImportError: + # Python 2 + from cgi import parse_qs + from urllib import unquote + from urllib import urlencode + from urlparse import urlsplit + from urlparse import urlunsplit + from 
datetime import datetime @@ -1098,8 +1114,8 @@ def sanitize_url(url): """ Removes all query parameter values and username:password from a given URL. """ - parsed_url = parse.urlsplit(url) - query_params = parse.parse_qs(parsed_url.query, keep_blank_values=True) + parsed_url = urlsplit(url) + query_params = parse_qs(parsed_url.query, keep_blank_values=True) # strip username:password (netloc can be usr:pwd@example.com) netloc_parts = parsed_url.netloc.split("@") @@ -1109,9 +1125,9 @@ def sanitize_url(url): netloc = parsed_url.netloc # strip values from query string - query_string = parse.unquote(parse.urlencode({key: "%s" for key in query_params})) + query_string = unquote(urlencode({key: "%s" for key in query_params})) - safe_url = parse.urlunsplit( + safe_url = urlunsplit( Components( # type: ignore scheme=parsed_url.scheme, netloc=netloc, From 63df6762a76448039f6dcc1a8d48e8123f419f76 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Mon, 30 Jan 2023 11:22:18 +0100 Subject: [PATCH 06/22] Split url into url, query and fragment --- sentry_sdk/integrations/httpx.py | 36 +++---- sentry_sdk/integrations/stdlib.py | 15 +-- sentry_sdk/utils.py | 53 ++++++++-- tests/test_utils.py | 159 ++++++++++++++++++++++++++++-- 4 files changed, 219 insertions(+), 44 deletions(-) diff --git a/sentry_sdk/integrations/httpx.py b/sentry_sdk/integrations/httpx.py index 4365a77d8a..fe189a77fe 100644 --- a/sentry_sdk/integrations/httpx.py +++ b/sentry_sdk/integrations/httpx.py @@ -2,7 +2,7 @@ from sentry_sdk.consts import OP from sentry_sdk.hub import _should_send_default_pii from sentry_sdk.integrations import Integration, DidNotEnable -from sentry_sdk.utils import logger, sanitize_url +from sentry_sdk.utils import logger, parse_url from sentry_sdk._types import MYPY @@ -42,18 +42,18 @@ def send(self, request, **kwargs): if hub.get_integration(HttpxIntegration) is None: return real_send(self, request, **kwargs) + sanitize = not _should_send_default_pii() + parsed_url = 
parse_url(request.url, sanitize=sanitize) + with hub.start_span( op=OP.HTTP_CLIENT, - description="%s %s" - % ( - request.method, - request.url - if _should_send_default_pii() - else sanitize_url(request.url), - ), + description="%s %s" % (request.method, parsed_url.url), ) as span: span.set_data("method", request.method) - span.set_data("url", str(request.url)) + span.set_data("url", parsed_url.url) + span.set_data("http.query", parsed_url.query) + span.set_data("http.fragment", parsed_url.fragment) + for key, value in hub.iter_trace_propagation_headers(): logger.debug( "[Tracing] Adding `{key}` header {value} to outgoing request to {url}.".format( @@ -66,6 +66,7 @@ def send(self, request, **kwargs): span.set_data("status_code", rv.status_code) span.set_http_status(rv.status_code) span.set_data("reason", rv.reason_phrase) + return rv Client.send = send @@ -81,18 +82,18 @@ async def send(self, request, **kwargs): if hub.get_integration(HttpxIntegration) is None: return await real_send(self, request, **kwargs) + sanitize = not _should_send_default_pii() + parsed_url = parse_url(request.url, sanitize=sanitize) + with hub.start_span( op=OP.HTTP_CLIENT, - description="%s %s" - % ( - request.method, - request.url - if _should_send_default_pii() - else sanitize_url(request.url), - ), + description="%s %s" % (request.method, parsed_url.url), ) as span: span.set_data("method", request.method) - span.set_data("url", str(request.url)) + span.set_data("url", parsed_url.url) + span.set_data("http.query", parsed_url.query) + span.set_data("http.fragment", parsed_url.fragment) + for key, value in hub.iter_trace_propagation_headers(): logger.debug( "[Tracing] Adding `{key}` header {value} to outgoing request to {url}.".format( @@ -105,6 +106,7 @@ async def send(self, request, **kwargs): span.set_data("status_code", rv.status_code) span.set_http_status(rv.status_code) span.set_data("reason", rv.reason_phrase) + return rv AsyncClient.send = send diff --git 
a/sentry_sdk/integrations/stdlib.py b/sentry_sdk/integrations/stdlib.py index 6444e5c22d..69db8436b6 100644 --- a/sentry_sdk/integrations/stdlib.py +++ b/sentry_sdk/integrations/stdlib.py @@ -11,8 +11,8 @@ from sentry_sdk.utils import ( capture_internal_exceptions, logger, - sanitize_url, safe_repr, + parse_url, ) from sentry_sdk._types import MYPY @@ -84,17 +84,18 @@ def putrequest(self, method, url, *args, **kwargs): url, ) + sanitize = not _should_send_default_pii() + parsed_url = parse_url(real_url, sanitize=sanitize) + span = hub.start_span( op=OP.HTTP_CLIENT, - description="%s %s" - % ( - method, - real_url if _should_send_default_pii() else sanitize_url(real_url), - ), + description="%s %s" % (method, parsed_url.url), ) span.set_data("method", method) - span.set_data("url", real_url) + span.set_data("url", parsed_url.url) + span.set_data("http.query", parsed_url.query) + span.set_data("http.fragment", parsed_url.fragment) rv = real_putrequest(self, method, url, *args, **kwargs) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index a9c07ed580..4ead48a468 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -10,6 +10,8 @@ import time from collections import namedtuple +from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE + try: # Python 3 from urllib.parse import parse_qs @@ -62,12 +64,6 @@ epoch = datetime(1970, 1, 1) - -Components = namedtuple( # type: ignore - typename="Components", field_names=["scheme", "netloc", "path", "query", "fragment"] -) - - # The logger is created here but initialized in the debug support module logger = logging.getLogger("sentry_sdk.errors") @@ -1150,6 +1146,11 @@ def from_base64(base64_string): return utf8_string +Components = namedtuple( # type: ignore + typename="Components", field_names=["scheme", "netloc", "path", "query", "fragment"] +) + + def sanitize_url(url): # type: (str) -> str """ @@ -1161,12 +1162,18 @@ def sanitize_url(url): # strip username:password (netloc can be usr:pwd@example.com) 
netloc_parts = parsed_url.netloc.split("@") if len(netloc_parts) > 1: - netloc = "%s:%s@" + netloc_parts[-1] + netloc = "%s:%s@%s" % ( + SENSITIVE_DATA_SUBSTITUTE, + SENSITIVE_DATA_SUBSTITUTE, + netloc_parts[-1], + ) else: netloc = parsed_url.netloc # strip values from query string - query_string = unquote(urlencode({key: "%s" for key in query_params})) + query_string = unquote( + urlencode({key: SENSITIVE_DATA_SUBSTITUTE for key in query_params}) + ) safe_url = urlunsplit( Components( # type: ignore @@ -1181,6 +1188,34 @@ def sanitize_url(url): return safe_url +ParsedUrl = namedtuple( # type: ignore + typename="ParsedUrl", field_names=["url", "query", "fragment"] +) + + +def parse_url(url, sanitize=True): + + # type: (str, bool) -> ParsedUrl + """ + Splits a URL into a url (including path), query and fragment. If sanitize is True, the url will be + sanitized to remove sensitive data. + """ + url = sanitize_url(url) if sanitize else url + + parsed_url = urlsplit(url) + base_url = urlunsplit( + Components( # type: ignore + scheme=parsed_url.scheme, + netloc=parsed_url.netloc, + query="", + path=parsed_url.path, + fragment="", + ) + ) + + return ParsedUrl(url=base_url, query=parsed_url.query, fragment=parsed_url.fragment) + + if PY37: def nanosecond_time(): @@ -1191,12 +1226,10 @@ def nanosecond_time(): def nanosecond_time(): # type: () -> int - return int(time.perf_counter() * 1e9) else: def nanosecond_time(): # type: () -> int - raise AttributeError diff --git a/tests/test_utils.py b/tests/test_utils.py index e06848ef2f..802d8623f8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,6 @@ import pytest -from sentry_sdk.utils import sanitize_url +from sentry_sdk.utils import parse_url, sanitize_url @pytest.mark.parametrize( @@ -11,43 +11,182 @@ ("https://example.com", "https://example.com"), ( "example.com?token=abc&sessionid=123&save=true", - "example.com?token=%s&sessionid=%s&save=%s", + 
"example.com?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", ), ( "http://example.com?token=abc&sessionid=123&save=true", - "http://example.com?token=%s&sessionid=%s&save=%s", + "http://example.com?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", ), ( "https://example.com?token=abc&sessionid=123&save=true", - "https://example.com?token=%s&sessionid=%s&save=%s", + "https://example.com?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", ), ( "http://localhost:8000/?token=abc&sessionid=123&save=true", - "http://localhost:8000/?token=%s&sessionid=%s&save=%s", + "http://localhost:8000/?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", ), ( "ftp://username:password@ftp.example.com:9876/bla/blub#foo", - "ftp://%s:%s@ftp.example.com:9876/bla/blub#foo", + "ftp://[Filtered]:[Filtered]@ftp.example.com:9876/bla/blub#foo", ), ( "https://username:password@example.com/bla/blub?token=abc&sessionid=123&save=true#fragment", - "https://%s:%s@example.com/bla/blub?token=%s&sessionid=%s&save=%s#fragment", + "https://[Filtered]:[Filtered]@example.com/bla/blub?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]#fragment", ), ( "http://example.com/bla?üsername=ada&pwd=häöüß", - "http://example.com/bla?üsername=%s&pwd=%s", + "http://example.com/bla?üsername=[Filtered]&pwd=[Filtered]", ), ("bla/blub/foo", "bla/blub/foo"), ("/bla/blub/foo/", "/bla/blub/foo/"), ( "bla/blub/foo?token=abc&sessionid=123&save=true", - "bla/blub/foo?token=%s&sessionid=%s&save=%s", + "bla/blub/foo?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", ), ( "/bla/blub/foo/?token=abc&sessionid=123&save=true", - "/bla/blub/foo/?token=%s&sessionid=%s&save=%s", + "/bla/blub/foo/?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", ), ], ) def test_sanitize_url(url, expected_result): assert sanitize_url(url) == expected_result + + +@pytest.mark.parametrize( + ("url", "sanitize", "expected_url", "expected_query", "expected_fragment"), + [ + # Test with sanitize=True + ( + 
"https://example.com", + True, + "https://example.com", + "", + "", + ), + ( + "example.com?token=abc&sessionid=123&save=true", + True, + "example.com", + "token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + "", + ), + ( + "https://example.com?token=abc&sessionid=123&save=true", + True, + "https://example.com", + "token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + "", + ), + ( + "https://username:password@example.com/bla/blub?token=abc&sessionid=123&save=true#fragment", + True, + "https://[Filtered]:[Filtered]@example.com/bla/blub", + "token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + "fragment", + ), + ( + "http://example.com/bla?üsername=ada&pwd=häöüß", + True, + "http://example.com/bla", + "üsername=[Filtered]&pwd=[Filtered]", + "", + ), + ( + "bla/blub/foo", + True, + "bla/blub/foo", + "", + "", + ), + ( + "/bla/blub/foo/#baz", + True, + "/bla/blub/foo/", + "", + "baz", + ), + ( + "bla/blub/foo?token=abc&sessionid=123&save=true", + True, + "bla/blub/foo", + "token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + "", + ), + ( + "/bla/blub/foo/?token=abc&sessionid=123&save=true", + True, + "/bla/blub/foo/", + "token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + "", + ), + # Test with sanitize=False + ( + "https://example.com", + False, + "https://example.com", + "", + "", + ), + ( + "example.com?token=abc&sessionid=123&save=true", + False, + "example.com", + "token=abc&sessionid=123&save=true", + "", + ), + ( + "https://example.com?token=abc&sessionid=123&save=true", + False, + "https://example.com", + "token=abc&sessionid=123&save=true", + "", + ), + ( + "https://username:password@example.com/bla/blub?token=abc&sessionid=123&save=true#fragment", + False, + "https://username:password@example.com/bla/blub", + "token=abc&sessionid=123&save=true", + "fragment", + ), + ( + "http://example.com/bla?üsername=ada&pwd=häöüß", + False, + "http://example.com/bla", + "üsername=ada&pwd=häöüß", + "", + ), + ( + "bla/blub/foo", + False, + 
"bla/blub/foo", + "", + "", + ), + ( + "/bla/blub/foo/#baz", + False, + "/bla/blub/foo/", + "", + "baz", + ), + ( + "bla/blub/foo?token=abc&sessionid=123&save=true", + False, + "bla/blub/foo", + "token=abc&sessionid=123&save=true", + "", + ), + ( + "/bla/blub/foo/?token=abc&sessionid=123&save=true", + False, + "/bla/blub/foo/", + "token=abc&sessionid=123&save=true", + "", + ), + ], +) +def test_parse_url(url, sanitize, expected_url, expected_query, expected_fragment): + assert parse_url(url, sanitize=sanitize).url == expected_url + assert parse_url(url, sanitize=sanitize).query == expected_query + assert parse_url(url, sanitize=sanitize).fragment == expected_fragment From 4db535fa7e4f2770caaadd1c7ee8b975ee2bdfea Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Mon, 30 Jan 2023 11:38:34 +0100 Subject: [PATCH 07/22] Some type fixes --- sentry_sdk/integrations/boto3.py | 10 +++++++++- sentry_sdk/utils.py | 22 +++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/sentry_sdk/integrations/boto3.py b/sentry_sdk/integrations/boto3.py index 2f2f6bbea9..74999eb964 100644 --- a/sentry_sdk/integrations/boto3.py +++ b/sentry_sdk/integrations/boto3.py @@ -2,11 +2,13 @@ from sentry_sdk import Hub from sentry_sdk.consts import OP +from sentry_sdk.hub import _should_send_default_pii from sentry_sdk.integrations import Integration, DidNotEnable from sentry_sdk.tracing import Span from sentry_sdk._functools import partial from sentry_sdk._types import MYPY +from sentry_sdk.utils import parse_url if MYPY: from typing import Any @@ -66,9 +68,15 @@ def _sentry_request_created(service_id, request, operation_name, **kwargs): op=OP.HTTP_CLIENT, description=description, ) + + sanitize = not _should_send_default_pii() + parsed_url = parse_url(request.url, sanitize=sanitize) + span.set_tag("aws.service_id", service_id) span.set_tag("aws.operation_name", operation_name) - span.set_data("aws.request.url", request.url) + span.set_data("aws.request.url", 
parsed_url.url) + span.set_data("http.query", parsed_url.query) + span.set_data("http.fragment", parsed_url.fragment) # We do it in order for subsequent http calls/retries be # attached to this span. diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index 4ead48a468..1139e24859 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -22,11 +22,11 @@ except ImportError: # Python 2 - from cgi import parse_qs - from urllib import unquote - from urllib import urlencode - from urlparse import urlsplit - from urlparse import urlunsplit + from cgi import parse_qs # type: ignore + from urllib import unquote # type: ignore + from urllib import urlencode # type: ignore + from urlparse import urlsplit # type: ignore + from urlparse import urlunsplit # type: ignore from datetime import datetime @@ -1146,9 +1146,7 @@ def from_base64(base64_string): return utf8_string -Components = namedtuple( # type: ignore - typename="Components", field_names=["scheme", "netloc", "path", "query", "fragment"] -) +Components = namedtuple("Components", ["scheme", "netloc", "path", "query", "fragment"]) def sanitize_url(url): @@ -1176,7 +1174,7 @@ def sanitize_url(url): ) safe_url = urlunsplit( - Components( # type: ignore + Components( scheme=parsed_url.scheme, netloc=netloc, query=query_string, @@ -1188,9 +1186,7 @@ def sanitize_url(url): return safe_url -ParsedUrl = namedtuple( # type: ignore - typename="ParsedUrl", field_names=["url", "query", "fragment"] -) +ParsedUrl = namedtuple("ParsedUrl", ["url", "query", "fragment"]) def parse_url(url, sanitize=True): @@ -1204,7 +1200,7 @@ def parse_url(url, sanitize=True): parsed_url = urlsplit(url) base_url = urlunsplit( - Components( # type: ignore + Components( scheme=parsed_url.scheme, netloc=parsed_url.netloc, query="", From ce56e93da521fbea0eeb02acaca0c19c7ce13b06 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Mon, 30 Jan 2023 11:55:27 +0100 Subject: [PATCH 08/22] Preventing circular import --- sentry_sdk/utils.py | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index 1139e24859..09732b78ee 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -10,8 +10,6 @@ import time from collections import namedtuple -from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE - try: # Python 3 from urllib.parse import parse_qs @@ -1157,6 +1155,8 @@ def sanitize_url(url): parsed_url = urlsplit(url) query_params = parse_qs(parsed_url.query, keep_blank_values=True) + from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE + # strip username:password (netloc can be usr:pwd@example.com) netloc_parts = parsed_url.netloc.split("@") if len(netloc_parts) > 1: From e41803303b0da073c8923910ed78c3fb7cfa320d Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Mon, 30 Jan 2023 12:04:53 +0100 Subject: [PATCH 09/22] Fixed some tests --- tests/integrations/requests/test_requests.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integrations/requests/test_requests.py b/tests/integrations/requests/test_requests.py index 02c6636853..f4c6b01db0 100644 --- a/tests/integrations/requests/test_requests.py +++ b/tests/integrations/requests/test_requests.py @@ -20,6 +20,8 @@ def test_crumb_capture(sentry_init, capture_events): assert crumb["data"] == { "url": "https://httpbin.org/status/418", "method": "GET", + "http.fragment": "", + "http.query": "", "status_code": response.status_code, "reason": response.reason, } From 5bbd781dfc25c9375b6852e4764f765abb85b75e Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Mon, 30 Jan 2023 12:59:53 +0100 Subject: [PATCH 10/22] Make url a string to fix tests --- sentry_sdk/integrations/httpx.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sentry_sdk/integrations/httpx.py b/sentry_sdk/integrations/httpx.py index fe189a77fe..1f28a12e4a 100644 --- a/sentry_sdk/integrations/httpx.py +++ b/sentry_sdk/integrations/httpx.py @@ -43,7 +43,7 @@ def send(self, request, **kwargs): return real_send(self, 
request, **kwargs) sanitize = not _should_send_default_pii() - parsed_url = parse_url(request.url, sanitize=sanitize) + parsed_url = parse_url(str(request.url, sanitize=sanitize)) with hub.start_span( op=OP.HTTP_CLIENT, @@ -57,7 +57,7 @@ def send(self, request, **kwargs): for key, value in hub.iter_trace_propagation_headers(): logger.debug( "[Tracing] Adding `{key}` header {value} to outgoing request to {url}.".format( - key=key, value=value, url=request.url + key=key, value=value, url=str(request.url) ) ) request.headers[key] = value From fcbd8d73e7d275c7a3aea63a3f9887a539c5c1a2 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Mon, 30 Jan 2023 14:47:12 +0100 Subject: [PATCH 11/22] Fixing httpx tests again --- sentry_sdk/integrations/httpx.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sentry_sdk/integrations/httpx.py b/sentry_sdk/integrations/httpx.py index 1f28a12e4a..b30a29ee0a 100644 --- a/sentry_sdk/integrations/httpx.py +++ b/sentry_sdk/integrations/httpx.py @@ -43,7 +43,7 @@ def send(self, request, **kwargs): return real_send(self, request, **kwargs) sanitize = not _should_send_default_pii() - parsed_url = parse_url(str(request.url, sanitize=sanitize)) + parsed_url = parse_url(str(request.url), sanitize=sanitize) with hub.start_span( op=OP.HTTP_CLIENT, @@ -83,7 +83,7 @@ async def send(self, request, **kwargs): return await real_send(self, request, **kwargs) sanitize = not _should_send_default_pii() - parsed_url = parse_url(request.url, sanitize=sanitize) + parsed_url = parse_url(str(request.url), sanitize=sanitize) with hub.start_span( op=OP.HTTP_CLIENT, From 2bd870c84b79dc1f0698a6cbf4472f23767c0e82 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Mon, 30 Jan 2023 14:56:03 +0100 Subject: [PATCH 12/22] Fixing tests --- tests/integrations/httpx/test_httpx.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integrations/httpx/test_httpx.py b/tests/integrations/httpx/test_httpx.py index 4623f13348..0597d10988 100644 --- 
a/tests/integrations/httpx/test_httpx.py +++ b/tests/integrations/httpx/test_httpx.py @@ -34,6 +34,8 @@ def before_breadcrumb(crumb, hint): assert crumb["data"] == { "url": url, "method": "GET", + "http.fragment": "", + "http.query": "", "status_code": 200, "reason": "OK", "extra": "foo", From 72a4675ba3dcf989df5c61c2b81059049a7ff586 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Mon, 30 Jan 2023 15:05:24 +0100 Subject: [PATCH 13/22] Fix tests for old Python versions --- tests/test_utils.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 802d8623f8..34e33efa06 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,5 @@ import pytest +import re from sentry_sdk.utils import parse_url, sanitize_url @@ -50,7 +51,12 @@ ], ) def test_sanitize_url(url, expected_result): - assert sanitize_url(url) == expected_result + # sort parts because old Python versions (<3.6) don't preserve order + sanitized_url = sanitize_url(url) + parts = sorted(re.split(r"\&|\?", sanitized_url)) + expected_parts = sorted(re.split(r"\&|\?", expected_result)) + + assert parts == expected_parts @pytest.mark.parametrize( @@ -188,5 +194,11 @@ def test_sanitize_url(url, expected_result): ) def test_parse_url(url, sanitize, expected_url, expected_query, expected_fragment): assert parse_url(url, sanitize=sanitize).url == expected_url - assert parse_url(url, sanitize=sanitize).query == expected_query assert parse_url(url, sanitize=sanitize).fragment == expected_fragment + + # sort parts because old Python versions (<3.6) don't preserve order + sanitized_query = parse_url(url, sanitize=sanitize).query + query_parts = sorted(re.split(r"\&|\?", sanitized_query)) + expected_query_parts = sorted(re.split(r"\&|\?", expected_query)) + + assert query_parts == expected_query_parts From 1639cc45c0ab54af0abb86b4f2e84a28ba400272 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Mon, 30 Jan 2023 15:20:20 +0100 
Subject: [PATCH 14/22] Fix tests with fragments in old Python versions --- tests/test_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 34e33efa06..7f89bc979d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -53,8 +53,8 @@ def test_sanitize_url(url, expected_result): # sort parts because old Python versions (<3.6) don't preserve order sanitized_url = sanitize_url(url) - parts = sorted(re.split(r"\&|\?", sanitized_url)) - expected_parts = sorted(re.split(r"\&|\?", expected_result)) + parts = sorted(re.split(r"\&|\?|\#", sanitized_url)) + expected_parts = sorted(re.split(r"\&|\?|\#", expected_result)) assert parts == expected_parts @@ -198,7 +198,7 @@ def test_parse_url(url, sanitize, expected_url, expected_query, expected_fragmen # sort parts because old Python versions (<3.6) don't preserve order sanitized_query = parse_url(url, sanitize=sanitize).query - query_parts = sorted(re.split(r"\&|\?", sanitized_query)) - expected_query_parts = sorted(re.split(r"\&|\?", expected_query)) + query_parts = sorted(re.split(r"\&|\?|\#", sanitized_query)) + expected_query_parts = sorted(re.split(r"\&|\?|\#", expected_query)) assert query_parts == expected_query_parts From 90eb4dbec21702c438e1084da32ca94491469f26 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Mon, 30 Jan 2023 15:44:43 +0100 Subject: [PATCH 15/22] Fixed utf8 chars in Python 2.7 --- tests/test_utils.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 7f89bc979d..0dca64f86f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -34,10 +34,6 @@ "https://username:password@example.com/bla/blub?token=abc&sessionid=123&save=true#fragment", "https://[Filtered]:[Filtered]@example.com/bla/blub?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]#fragment", ), - ( - "http://example.com/bla?üsername=ada&pwd=häöüß", - 
"http://example.com/bla?üsername=[Filtered]&pwd=[Filtered]", - ), ("bla/blub/foo", "bla/blub/foo"), ("/bla/blub/foo/", "/bla/blub/foo/"), ( @@ -91,13 +87,6 @@ def test_sanitize_url(url, expected_result): "token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", "fragment", ), - ( - "http://example.com/bla?üsername=ada&pwd=häöüß", - True, - "http://example.com/bla", - "üsername=[Filtered]&pwd=[Filtered]", - "", - ), ( "bla/blub/foo", True, @@ -155,13 +144,6 @@ def test_sanitize_url(url, expected_result): "token=abc&sessionid=123&save=true", "fragment", ), - ( - "http://example.com/bla?üsername=ada&pwd=häöüß", - False, - "http://example.com/bla", - "üsername=ada&pwd=häöüß", - "", - ), ( "bla/blub/foo", False, From 9215f4538b3174cf895aab096744e0a71da84da8 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Mon, 30 Jan 2023 16:28:28 +0100 Subject: [PATCH 16/22] Cleanup --- sentry_sdk/integrations/httpx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sentry_sdk/integrations/httpx.py b/sentry_sdk/integrations/httpx.py index b30a29ee0a..920c2f29bd 100644 --- a/sentry_sdk/integrations/httpx.py +++ b/sentry_sdk/integrations/httpx.py @@ -57,7 +57,7 @@ def send(self, request, **kwargs): for key, value in hub.iter_trace_propagation_headers(): logger.debug( "[Tracing] Adding `{key}` header {value} to outgoing request to {url}.".format( - key=key, value=value, url=str(request.url) + key=key, value=value, url=request.url ) ) request.headers[key] = value From 1482ac6fa7459df06954615fe4a0a9dd8b8680e9 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Wed, 15 Feb 2023 12:35:26 +0100 Subject: [PATCH 17/22] Moved import outside of function --- sentry_sdk/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index 7c1f4b9d3a..1edf3bb90c 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -40,6 +40,7 @@ import sentry_sdk from sentry_sdk._compat import PY2, PY33, PY37, implements_str, text_type, 
urlparse from sentry_sdk._types import MYPY +from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE if MYPY: from types import FrameType, TracebackType @@ -392,8 +393,6 @@ def removed_because_over_size_limit(cls): def substituted_because_contains_sensitive_data(cls): # type: () -> AnnotatedValue """The actual value was removed because it contained sensitive information.""" - from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE - return AnnotatedValue( value=SENSITIVE_DATA_SUBSTITUTE, metadata={ @@ -1192,8 +1191,6 @@ def sanitize_url(url): parsed_url = urlsplit(url) query_params = parse_qs(parsed_url.query, keep_blank_values=True) - from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE - # strip username:password (netloc can be usr:pwd@example.com) netloc_parts = parsed_url.netloc.split("@") if len(netloc_parts) > 1: From 6a82959cbb281dc47d7450f8c7948d12a1e7d6c3 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Wed, 15 Feb 2023 12:38:28 +0100 Subject: [PATCH 18/22] Revert "Moved import outside of function" This reverts commit 1482ac6fa7459df06954615fe4a0a9dd8b8680e9. 
--- sentry_sdk/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index 1edf3bb90c..7c1f4b9d3a 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -40,7 +40,6 @@ import sentry_sdk from sentry_sdk._compat import PY2, PY33, PY37, implements_str, text_type, urlparse from sentry_sdk._types import MYPY -from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE if MYPY: from types import FrameType, TracebackType @@ -393,6 +392,8 @@ def removed_because_over_size_limit(cls): def substituted_because_contains_sensitive_data(cls): # type: () -> AnnotatedValue """The actual value was removed because it contained sensitive information.""" + from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE + return AnnotatedValue( value=SENSITIVE_DATA_SUBSTITUTE, metadata={ @@ -1191,6 +1192,8 @@ def sanitize_url(url): parsed_url = urlsplit(url) query_params = parse_qs(parsed_url.query, keep_blank_values=True) + from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE + # strip username:password (netloc can be usr:pwd@example.com) netloc_parts = parsed_url.netloc.split("@") if len(netloc_parts) > 1: From 773ed80832bb7fda222a97a99df9959a3878947e Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Wed, 15 Feb 2023 13:35:58 +0100 Subject: [PATCH 19/22] Always remove authority, but for now to not filter query values --- sentry_sdk/integrations/boto3.py | 4 +--- sentry_sdk/integrations/httpx.py | 7 ++---- sentry_sdk/integrations/stdlib.py | 5 ++-- sentry_sdk/utils.py | 39 ++++++++++++++++++------------- tests/test_utils.py | 2 +- 5 files changed, 29 insertions(+), 28 deletions(-) diff --git a/sentry_sdk/integrations/boto3.py b/sentry_sdk/integrations/boto3.py index 74999eb964..d86628402e 100644 --- a/sentry_sdk/integrations/boto3.py +++ b/sentry_sdk/integrations/boto3.py @@ -2,7 +2,6 @@ from sentry_sdk import Hub from sentry_sdk.consts import OP -from sentry_sdk.hub import _should_send_default_pii from 
sentry_sdk.integrations import Integration, DidNotEnable from sentry_sdk.tracing import Span @@ -69,8 +68,7 @@ def _sentry_request_created(service_id, request, operation_name, **kwargs): description=description, ) - sanitize = not _should_send_default_pii() - parsed_url = parse_url(request.url, sanitize=sanitize) + parsed_url = parse_url(request.url, sanitize=False) span.set_tag("aws.service_id", service_id) span.set_tag("aws.operation_name", operation_name) diff --git a/sentry_sdk/integrations/httpx.py b/sentry_sdk/integrations/httpx.py index 920c2f29bd..963fb64741 100644 --- a/sentry_sdk/integrations/httpx.py +++ b/sentry_sdk/integrations/httpx.py @@ -1,6 +1,5 @@ from sentry_sdk import Hub from sentry_sdk.consts import OP -from sentry_sdk.hub import _should_send_default_pii from sentry_sdk.integrations import Integration, DidNotEnable from sentry_sdk.utils import logger, parse_url @@ -42,8 +41,7 @@ def send(self, request, **kwargs): if hub.get_integration(HttpxIntegration) is None: return real_send(self, request, **kwargs) - sanitize = not _should_send_default_pii() - parsed_url = parse_url(str(request.url), sanitize=sanitize) + parsed_url = parse_url(str(request.url), sanitize=False) with hub.start_span( op=OP.HTTP_CLIENT, @@ -82,8 +80,7 @@ async def send(self, request, **kwargs): if hub.get_integration(HttpxIntegration) is None: return await real_send(self, request, **kwargs) - sanitize = not _should_send_default_pii() - parsed_url = parse_url(str(request.url), sanitize=sanitize) + parsed_url = parse_url(str(request.url), sanitize=False) with hub.start_span( op=OP.HTTP_CLIENT, diff --git a/sentry_sdk/integrations/stdlib.py b/sentry_sdk/integrations/stdlib.py index 69db8436b6..8da3b95d49 100644 --- a/sentry_sdk/integrations/stdlib.py +++ b/sentry_sdk/integrations/stdlib.py @@ -4,7 +4,7 @@ import platform from sentry_sdk.consts import OP -from sentry_sdk.hub import Hub, _should_send_default_pii +from sentry_sdk.hub import Hub from sentry_sdk.integrations import 
Integration from sentry_sdk.scope import add_global_event_processor from sentry_sdk.tracing_utils import EnvironHeaders @@ -84,8 +84,7 @@ def putrequest(self, method, url, *args, **kwargs): url, ) - sanitize = not _should_send_default_pii() - parsed_url = parse_url(real_url, sanitize=sanitize) + parsed_url = parse_url(real_url, sanitize=False) span = hub.start_span( op=OP.HTTP_CLIENT, diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index 7c1f4b9d3a..f39653176f 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -1184,10 +1184,10 @@ def from_base64(base64_string): Components = namedtuple("Components", ["scheme", "netloc", "path", "query", "fragment"]) -def sanitize_url(url): - # type: (str) -> str +def sanitize_url(url, remove_authority=True, remove_query_values=True): + # type: (str, bool, bool) -> str """ - Removes all query parameter values and username:password from a given URL. + Removes the authority and query parameter values from a given URL. """ parsed_url = urlsplit(url) query_params = parse_qs(parsed_url.query, keep_blank_values=True) @@ -1195,20 +1195,26 @@ def sanitize_url(url): from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE # strip username:password (netloc can be usr:pwd@example.com) - netloc_parts = parsed_url.netloc.split("@") - if len(netloc_parts) > 1: - netloc = "%s:%s@%s" % ( - SENSITIVE_DATA_SUBSTITUTE, - SENSITIVE_DATA_SUBSTITUTE, - netloc_parts[-1], - ) + if remove_authority: + netloc_parts = parsed_url.netloc.split("@") + if len(netloc_parts) > 1: + netloc = "%s:%s@%s" % ( + SENSITIVE_DATA_SUBSTITUTE, + SENSITIVE_DATA_SUBSTITUTE, + netloc_parts[-1], + ) + else: + netloc = parsed_url.netloc else: netloc = parsed_url.netloc # strip values from query string - query_string = unquote( - urlencode({key: SENSITIVE_DATA_SUBSTITUTE for key in query_params}) - ) + if remove_query_values: + query_string = unquote( + urlencode({key: SENSITIVE_DATA_SUBSTITUTE for key in query_params}) + ) + else: + query_string = 
parsed_url.query safe_url = urlunsplit( Components( @@ -1230,10 +1236,11 @@ def parse_url(url, sanitize=True): # type: (str, bool) -> ParsedUrl """ - Splits a URL into a url (including path), query and fragment. If sanitize is True, the url will be - sanitized to remove sensitive data. + Splits a URL into a url (including path), query and fragment. If sanitize is True, the query + parameters will be sanitized to remove sensitive data. The authority (username and password) + in the URL will always be removed. """ - url = sanitize_url(url) if sanitize else url + url = sanitize_url(url, remove_authority=True, remove_query_values=sanitize) parsed_url = urlsplit(url) base_url = urlunsplit( diff --git a/tests/test_utils.py b/tests/test_utils.py index 0dca64f86f..2e266c7600 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -140,7 +140,7 @@ def test_sanitize_url(url, expected_result): ( "https://username:password@example.com/bla/blub?token=abc&sessionid=123&save=true#fragment", False, - "https://username:password@example.com/bla/blub", + "https://[Filtered]:[Filtered]@example.com/bla/blub", "token=abc&sessionid=123&save=true", "fragment", ), From 51ab32d37483ef2dcd4121dfe5be00d3a69b9384 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Wed, 15 Feb 2023 13:40:18 +0100 Subject: [PATCH 20/22] Moved import to the bottom of file to prevent circular import --- sentry_sdk/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index f39653176f..e5ce6fa949 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -392,7 +392,6 @@ def removed_because_over_size_limit(cls): def substituted_because_contains_sensitive_data(cls): # type: () -> AnnotatedValue """The actual value was removed because it contained sensitive information.""" - from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE return AnnotatedValue( value=SENSITIVE_DATA_SUBSTITUTE, @@ -1192,8 +1191,6 @@ def sanitize_url(url,
remove_authority=True, remove_query_values=True): parsed_url = urlsplit(url) query_params = parse_qs(parsed_url.query, keep_blank_values=True) - from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE - # strip username:password (netloc can be usr:pwd@example.com) if remove_authority: netloc_parts = parsed_url.netloc.split("@") @@ -1273,3 +1270,7 @@ def nanosecond_time(): def nanosecond_time(): # type: () -> int raise AttributeError + + +# prevent circular import +from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE From 4eaafc0898e85030b8afdc80248c9e3ff91e68a3 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Wed, 15 Feb 2023 13:44:06 +0100 Subject: [PATCH 21/22] Revert "Moved import to the bottom of file to prevent circular import" This reverts commit 51ab32d37483ef2dcd4121dfe5be00d3a69b9384. --- sentry_sdk/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index e5ce6fa949..f39653176f 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -392,6 +392,7 @@ def removed_because_over_size_limit(cls): def substituted_because_contains_sensitive_data(cls): # type: () -> AnnotatedValue """The actual value was removed because it contained sensitive information.""" + from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE return AnnotatedValue( value=SENSITIVE_DATA_SUBSTITUTE, @@ -1191,6 +1192,8 @@ def sanitize_url(url, remove_authority=True, remove_query_values=True): parsed_url = urlsplit(url) query_params = parse_qs(parsed_url.query, keep_blank_values=True) + from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE + # strip username:password (netloc can be usr:pwd@example.com) if remove_authority: netloc_parts = parsed_url.netloc.split("@") @@ -1270,7 +1273,3 @@ def nanosecond_time(): def nanosecond_time(): # type: () -> int raise AttributeError - - -# prevent circular import -from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE From 302214316574d6193a122051cde3d04305935e3d Mon Sep 17 
00:00:00 2001 From: Anton Pirker Date: Wed, 15 Feb 2023 14:02:56 +0100 Subject: [PATCH 22/22] Moved SENSITIVE_DATA_SUBSTITUTE to utils.py to prevent circular imports --- sentry_sdk/consts.py | 2 -- sentry_sdk/integrations/django/__init__.py | 3 ++- sentry_sdk/integrations/huey.py | 8 ++++++-- sentry_sdk/utils.py | 6 ++---- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/sentry_sdk/consts.py b/sentry_sdk/consts.py index bc25213add..743e869af7 100644 --- a/sentry_sdk/consts.py +++ b/sentry_sdk/consts.py @@ -44,8 +44,6 @@ DEFAULT_QUEUE_SIZE = 100 DEFAULT_MAX_BREADCRUMBS = 100 -SENSITIVE_DATA_SUBSTITUTE = "[Filtered]" - class INSTRUMENTER: SENTRY = "sentry" diff --git a/sentry_sdk/integrations/django/__init__.py b/sentry_sdk/integrations/django/__init__.py index 697ab484e3..45dad780ff 100644 --- a/sentry_sdk/integrations/django/__init__.py +++ b/sentry_sdk/integrations/django/__init__.py @@ -6,7 +6,7 @@ import weakref from sentry_sdk._types import MYPY -from sentry_sdk.consts import OP, SENSITIVE_DATA_SUBSTITUTE +from sentry_sdk.consts import OP from sentry_sdk.hub import Hub, _should_send_default_pii from sentry_sdk.scope import add_global_event_processor from sentry_sdk.serializer import add_global_repr_processor @@ -16,6 +16,7 @@ AnnotatedValue, HAS_REAL_CONTEXTVARS, CONTEXTVARS_ERROR_MESSAGE, + SENSITIVE_DATA_SUBSTITUTE, logger, capture_internal_exceptions, event_from_exception, diff --git a/sentry_sdk/integrations/huey.py b/sentry_sdk/integrations/huey.py index 8f5f26133c..74ce4d35d5 100644 --- a/sentry_sdk/integrations/huey.py +++ b/sentry_sdk/integrations/huey.py @@ -6,11 +6,15 @@ from sentry_sdk._compat import reraise from sentry_sdk._types import MYPY from sentry_sdk import Hub -from sentry_sdk.consts import OP, SENSITIVE_DATA_SUBSTITUTE +from sentry_sdk.consts import OP from sentry_sdk.hub import _should_send_default_pii from sentry_sdk.integrations import DidNotEnable, Integration from sentry_sdk.tracing import Transaction, 
TRANSACTION_SOURCE_TASK -from sentry_sdk.utils import capture_internal_exceptions, event_from_exception +from sentry_sdk.utils import ( + capture_internal_exceptions, + event_from_exception, + SENSITIVE_DATA_SUBSTITUTE, +) if MYPY: from typing import Any, Callable, Optional, Union, TypeVar diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index f39653176f..93301ccbf3 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -68,6 +68,8 @@ MAX_STRING_LENGTH = 1024 BASE64_ALPHABET = re.compile(r"^[a-zA-Z0-9/+=]*$") +SENSITIVE_DATA_SUBSTITUTE = "[Filtered]" + def json_dumps(data): # type: (Any) -> bytes @@ -392,8 +394,6 @@ def removed_because_over_size_limit(cls): def substituted_because_contains_sensitive_data(cls): # type: () -> AnnotatedValue """The actual value was removed because it contained sensitive information.""" - from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE - return AnnotatedValue( value=SENSITIVE_DATA_SUBSTITUTE, metadata={ @@ -1192,8 +1192,6 @@ def sanitize_url(url, remove_authority=True, remove_query_values=True): parsed_url = urlsplit(url) query_params = parse_qs(parsed_url.query, keep_blank_values=True) - from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE - # strip username:password (netloc can be usr:pwd@example.com) if remove_authority: netloc_parts = parsed_url.netloc.split("@")