diff --git a/sentry_sdk/consts.py b/sentry_sdk/consts.py index bc25213add..743e869af7 100644 --- a/sentry_sdk/consts.py +++ b/sentry_sdk/consts.py @@ -44,8 +44,6 @@ DEFAULT_QUEUE_SIZE = 100 DEFAULT_MAX_BREADCRUMBS = 100 -SENSITIVE_DATA_SUBSTITUTE = "[Filtered]" - class INSTRUMENTER: SENTRY = "sentry" diff --git a/sentry_sdk/integrations/boto3.py b/sentry_sdk/integrations/boto3.py index 2f2f6bbea9..d86628402e 100644 --- a/sentry_sdk/integrations/boto3.py +++ b/sentry_sdk/integrations/boto3.py @@ -7,6 +7,7 @@ from sentry_sdk._functools import partial from sentry_sdk._types import MYPY +from sentry_sdk.utils import parse_url if MYPY: from typing import Any @@ -66,9 +67,14 @@ def _sentry_request_created(service_id, request, operation_name, **kwargs): op=OP.HTTP_CLIENT, description=description, ) + + parsed_url = parse_url(request.url, sanitize=False) + span.set_tag("aws.service_id", service_id) span.set_tag("aws.operation_name", operation_name) - span.set_data("aws.request.url", request.url) + span.set_data("aws.request.url", parsed_url.url) + span.set_data("http.query", parsed_url.query) + span.set_data("http.fragment", parsed_url.fragment) # We do it in order for subsequent http calls/retries be # attached to this span. 
diff --git a/sentry_sdk/integrations/django/__init__.py b/sentry_sdk/integrations/django/__init__.py index 697ab484e3..45dad780ff 100644 --- a/sentry_sdk/integrations/django/__init__.py +++ b/sentry_sdk/integrations/django/__init__.py @@ -6,7 +6,7 @@ import weakref from sentry_sdk._types import MYPY -from sentry_sdk.consts import OP, SENSITIVE_DATA_SUBSTITUTE +from sentry_sdk.consts import OP from sentry_sdk.hub import Hub, _should_send_default_pii from sentry_sdk.scope import add_global_event_processor from sentry_sdk.serializer import add_global_repr_processor @@ -16,6 +16,7 @@ AnnotatedValue, HAS_REAL_CONTEXTVARS, CONTEXTVARS_ERROR_MESSAGE, + SENSITIVE_DATA_SUBSTITUTE, logger, capture_internal_exceptions, event_from_exception, diff --git a/sentry_sdk/integrations/httpx.py b/sentry_sdk/integrations/httpx.py index 2e9142d2b8..963fb64741 100644 --- a/sentry_sdk/integrations/httpx.py +++ b/sentry_sdk/integrations/httpx.py @@ -1,7 +1,7 @@ from sentry_sdk import Hub from sentry_sdk.consts import OP from sentry_sdk.integrations import Integration, DidNotEnable -from sentry_sdk.utils import logger +from sentry_sdk.utils import logger, parse_url from sentry_sdk._types import MYPY @@ -41,11 +41,17 @@ def send(self, request, **kwargs): if hub.get_integration(HttpxIntegration) is None: return real_send(self, request, **kwargs) + parsed_url = parse_url(str(request.url), sanitize=False) + with hub.start_span( - op=OP.HTTP_CLIENT, description="%s %s" % (request.method, request.url) + op=OP.HTTP_CLIENT, + description="%s %s" % (request.method, parsed_url.url), ) as span: span.set_data("method", request.method) - span.set_data("url", str(request.url)) + span.set_data("url", parsed_url.url) + span.set_data("http.query", parsed_url.query) + span.set_data("http.fragment", parsed_url.fragment) + for key, value in hub.iter_trace_propagation_headers(): logger.debug( "[Tracing] Adding `{key}` header {value} to outgoing request to {url}.".format( @@ -58,6 +64,7 @@ def send(self, 
request, **kwargs): span.set_data("status_code", rv.status_code) span.set_http_status(rv.status_code) span.set_data("reason", rv.reason_phrase) + return rv Client.send = send @@ -73,11 +80,17 @@ async def send(self, request, **kwargs): if hub.get_integration(HttpxIntegration) is None: return await real_send(self, request, **kwargs) + parsed_url = parse_url(str(request.url), sanitize=False) + with hub.start_span( - op=OP.HTTP_CLIENT, description="%s %s" % (request.method, request.url) + op=OP.HTTP_CLIENT, + description="%s %s" % (request.method, parsed_url.url), ) as span: span.set_data("method", request.method) - span.set_data("url", str(request.url)) + span.set_data("url", parsed_url.url) + span.set_data("http.query", parsed_url.query) + span.set_data("http.fragment", parsed_url.fragment) + for key, value in hub.iter_trace_propagation_headers(): logger.debug( "[Tracing] Adding `{key}` header {value} to outgoing request to {url}.".format( @@ -90,6 +103,7 @@ async def send(self, request, **kwargs): span.set_data("status_code", rv.status_code) span.set_http_status(rv.status_code) span.set_data("reason", rv.reason_phrase) + return rv AsyncClient.send = send diff --git a/sentry_sdk/integrations/huey.py b/sentry_sdk/integrations/huey.py index 8f5f26133c..74ce4d35d5 100644 --- a/sentry_sdk/integrations/huey.py +++ b/sentry_sdk/integrations/huey.py @@ -6,11 +6,15 @@ from sentry_sdk._compat import reraise from sentry_sdk._types import MYPY from sentry_sdk import Hub -from sentry_sdk.consts import OP, SENSITIVE_DATA_SUBSTITUTE +from sentry_sdk.consts import OP from sentry_sdk.hub import _should_send_default_pii from sentry_sdk.integrations import DidNotEnable, Integration from sentry_sdk.tracing import Transaction, TRANSACTION_SOURCE_TASK -from sentry_sdk.utils import capture_internal_exceptions, event_from_exception +from sentry_sdk.utils import ( + capture_internal_exceptions, + event_from_exception, + SENSITIVE_DATA_SUBSTITUTE, +) if MYPY: from typing import Any, 
Callable, Optional, Union, TypeVar diff --git a/sentry_sdk/integrations/stdlib.py b/sentry_sdk/integrations/stdlib.py index 687d9dd2c1..8da3b95d49 100644 --- a/sentry_sdk/integrations/stdlib.py +++ b/sentry_sdk/integrations/stdlib.py @@ -8,7 +8,12 @@ from sentry_sdk.integrations import Integration from sentry_sdk.scope import add_global_event_processor from sentry_sdk.tracing_utils import EnvironHeaders -from sentry_sdk.utils import capture_internal_exceptions, logger, safe_repr +from sentry_sdk.utils import ( + capture_internal_exceptions, + logger, + safe_repr, + parse_url, +) from sentry_sdk._types import MYPY @@ -79,12 +84,17 @@ def putrequest(self, method, url, *args, **kwargs): url, ) + parsed_url = parse_url(real_url, sanitize=False) + span = hub.start_span( - op=OP.HTTP_CLIENT, description="%s %s" % (method, real_url) + op=OP.HTTP_CLIENT, + description="%s %s" % (method, parsed_url.url), ) span.set_data("method", method) - span.set_data("url", real_url) + span.set_data("url", parsed_url.url) + span.set_data("http.query", parsed_url.query) + span.set_data("http.fragment", parsed_url.fragment) rv = real_putrequest(self, method, url, *args, **kwargs) diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index 542a4901e8..93301ccbf3 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -8,6 +8,25 @@ import sys import threading import time +from collections import namedtuple + +try: + # Python 3 + from urllib.parse import parse_qs + from urllib.parse import unquote + from urllib.parse import urlencode + from urllib.parse import urlsplit + from urllib.parse import urlunsplit + +except ImportError: + # Python 2 + from cgi import parse_qs # type: ignore + from urllib import unquote # type: ignore + from urllib import urlencode # type: ignore + from urlparse import urlsplit # type: ignore + from urlparse import urlunsplit # type: ignore + + from datetime import datetime from functools import partial @@ -43,13 +62,14 @@ epoch = datetime(1970, 1, 1) - # The 
logger is created here but initialized in the debug support module logger = logging.getLogger("sentry_sdk.errors") MAX_STRING_LENGTH = 1024 BASE64_ALPHABET = re.compile(r"^[a-zA-Z0-9/+=]*$") +SENSITIVE_DATA_SUBSTITUTE = "[Filtered]" + def json_dumps(data): # type: (Any) -> bytes @@ -374,8 +394,6 @@ def removed_because_over_size_limit(cls): def substituted_because_contains_sensitive_data(cls): # type: () -> AnnotatedValue """The actual value was removed because it contained sensitive information.""" - from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE - return AnnotatedValue( value=SENSITIVE_DATA_SUBSTITUTE, metadata={ @@ -1163,6 +1181,79 @@ def from_base64(base64_string): return utf8_string +Components = namedtuple("Components", ["scheme", "netloc", "path", "query", "fragment"]) + + +def sanitize_url(url, remove_authority=True, remove_query_values=True): + # type: (str, bool, bool) -> str + """ + Removes the authority and query parameter values from a given URL. + """ + parsed_url = urlsplit(url) + query_params = parse_qs(parsed_url.query, keep_blank_values=True) + + # strip username:password (netloc can be usr:pwd@example.com) + if remove_authority: + netloc_parts = parsed_url.netloc.split("@") + if len(netloc_parts) > 1: + netloc = "%s:%s@%s" % ( + SENSITIVE_DATA_SUBSTITUTE, + SENSITIVE_DATA_SUBSTITUTE, + netloc_parts[-1], + ) + else: + netloc = parsed_url.netloc + else: + netloc = parsed_url.netloc + + # strip values from query string + if remove_query_values: + query_string = unquote( + urlencode({key: SENSITIVE_DATA_SUBSTITUTE for key in query_params}) + ) + else: + query_string = parsed_url.query + + safe_url = urlunsplit( + Components( + scheme=parsed_url.scheme, + netloc=netloc, + query=query_string, + path=parsed_url.path, + fragment=parsed_url.fragment, + ) + ) + + return safe_url + + +ParsedUrl = namedtuple("ParsedUrl", ["url", "query", "fragment"]) + + +def parse_url(url, sanitize=True): + + # type: (str, bool) -> ParsedUrl + """ + Splits a URL 
into a url (including path), query and fragment. If sanitize is True, the query + parameters will be sanitized to remove sensitive data. The authority (username and password) + in the URL will always be removed. + """ + url = sanitize_url(url, remove_authority=True, remove_query_values=sanitize) + + parsed_url = urlsplit(url) + base_url = urlunsplit( + Components( + scheme=parsed_url.scheme, + netloc=parsed_url.netloc, + query="", + path=parsed_url.path, + fragment="", + ) + ) + + return ParsedUrl(url=base_url, query=parsed_url.query, fragment=parsed_url.fragment) + + if PY37: def nanosecond_time(): diff --git a/tests/integrations/httpx/test_httpx.py b/tests/integrations/httpx/test_httpx.py index 4623f13348..0597d10988 100644 --- a/tests/integrations/httpx/test_httpx.py +++ b/tests/integrations/httpx/test_httpx.py @@ -34,6 +34,8 @@ def before_breadcrumb(crumb, hint): assert crumb["data"] == { "url": url, "method": "GET", + "http.fragment": "", + "http.query": "", "status_code": 200, "reason": "OK", "extra": "foo", diff --git a/tests/integrations/requests/test_requests.py b/tests/integrations/requests/test_requests.py index 02c6636853..f4c6b01db0 100644 --- a/tests/integrations/requests/test_requests.py +++ b/tests/integrations/requests/test_requests.py @@ -20,6 +20,8 @@ def test_crumb_capture(sentry_init, capture_events): assert crumb["data"] == { "url": "https://httpbin.org/status/418", "method": "GET", + "http.fragment": "", + "http.query": "", "status_code": response.status_code, "reason": response.reason, } diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000000..2e266c7600 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,186 @@ +import pytest +import re + +from sentry_sdk.utils import parse_url, sanitize_url + + +@pytest.mark.parametrize( + ("url", "expected_result"), + [ + ("http://localhost:8000", "http://localhost:8000"), + ("http://example.com", "http://example.com"), + ("https://example.com", "https://example.com"), + 
( + "example.com?token=abc&sessionid=123&save=true", + "example.com?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + ), + ( + "http://example.com?token=abc&sessionid=123&save=true", + "http://example.com?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + ), + ( + "https://example.com?token=abc&sessionid=123&save=true", + "https://example.com?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + ), + ( + "http://localhost:8000/?token=abc&sessionid=123&save=true", + "http://localhost:8000/?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + ), + ( + "ftp://username:password@ftp.example.com:9876/bla/blub#foo", + "ftp://[Filtered]:[Filtered]@ftp.example.com:9876/bla/blub#foo", + ), + ( + "https://username:password@example.com/bla/blub?token=abc&sessionid=123&save=true#fragment", + "https://[Filtered]:[Filtered]@example.com/bla/blub?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]#fragment", + ), + ("bla/blub/foo", "bla/blub/foo"), + ("/bla/blub/foo/", "/bla/blub/foo/"), + ( + "bla/blub/foo?token=abc&sessionid=123&save=true", + "bla/blub/foo?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + ), + ( + "/bla/blub/foo/?token=abc&sessionid=123&save=true", + "/bla/blub/foo/?token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + ), + ], +) +def test_sanitize_url(url, expected_result): + # sort parts because old Python versions (<3.6) don't preserve order + sanitized_url = sanitize_url(url) + parts = sorted(re.split(r"\&|\?|\#", sanitized_url)) + expected_parts = sorted(re.split(r"\&|\?|\#", expected_result)) + + assert parts == expected_parts + + +@pytest.mark.parametrize( + ("url", "sanitize", "expected_url", "expected_query", "expected_fragment"), + [ + # Test with sanitize=True + ( + "https://example.com", + True, + "https://example.com", + "", + "", + ), + ( + "example.com?token=abc&sessionid=123&save=true", + True, + "example.com", + "token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + "", + ), + ( + 
"https://example.com?token=abc&sessionid=123&save=true", + True, + "https://example.com", + "token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + "", + ), + ( + "https://username:password@example.com/bla/blub?token=abc&sessionid=123&save=true#fragment", + True, + "https://[Filtered]:[Filtered]@example.com/bla/blub", + "token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + "fragment", + ), + ( + "bla/blub/foo", + True, + "bla/blub/foo", + "", + "", + ), + ( + "/bla/blub/foo/#baz", + True, + "/bla/blub/foo/", + "", + "baz", + ), + ( + "bla/blub/foo?token=abc&sessionid=123&save=true", + True, + "bla/blub/foo", + "token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + "", + ), + ( + "/bla/blub/foo/?token=abc&sessionid=123&save=true", + True, + "/bla/blub/foo/", + "token=[Filtered]&sessionid=[Filtered]&save=[Filtered]", + "", + ), + # Test with sanitize=False + ( + "https://example.com", + False, + "https://example.com", + "", + "", + ), + ( + "example.com?token=abc&sessionid=123&save=true", + False, + "example.com", + "token=abc&sessionid=123&save=true", + "", + ), + ( + "https://example.com?token=abc&sessionid=123&save=true", + False, + "https://example.com", + "token=abc&sessionid=123&save=true", + "", + ), + ( + "https://username:password@example.com/bla/blub?token=abc&sessionid=123&save=true#fragment", + False, + "https://[Filtered]:[Filtered]@example.com/bla/blub", + "token=abc&sessionid=123&save=true", + "fragment", + ), + ( + "bla/blub/foo", + False, + "bla/blub/foo", + "", + "", + ), + ( + "/bla/blub/foo/#baz", + False, + "/bla/blub/foo/", + "", + "baz", + ), + ( + "bla/blub/foo?token=abc&sessionid=123&save=true", + False, + "bla/blub/foo", + "token=abc&sessionid=123&save=true", + "", + ), + ( + "/bla/blub/foo/?token=abc&sessionid=123&save=true", + False, + "/bla/blub/foo/", + "token=abc&sessionid=123&save=true", + "", + ), + ], +) +def test_parse_url(url, sanitize, expected_url, expected_query, expected_fragment): + assert parse_url(url, 
sanitize=sanitize).url == expected_url + assert parse_url(url, sanitize=sanitize).fragment == expected_fragment + + # sort parts because old Python versions (<3.6) don't preserve order + sanitized_query = parse_url(url, sanitize=sanitize).query + query_parts = sorted(re.split(r"\&|\?|\#", sanitized_query)) + expected_query_parts = sorted(re.split(r"\&|\?|\#", expected_query)) + + assert query_parts == expected_query_parts