From a51b3082eebe0148d8b4c55e2ebcc67f8ae55920 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Tue, 14 Feb 2023 14:03:08 +0100 Subject: [PATCH 01/28] feat(source-maps): Implement new tables --- src/sentry/models/artifactbundle.py | 55 +++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 src/sentry/models/artifactbundle.py diff --git a/src/sentry/models/artifactbundle.py b/src/sentry/models/artifactbundle.py new file mode 100644 index 00000000000000..b98ff7c5e60896 --- /dev/null +++ b/src/sentry/models/artifactbundle.py @@ -0,0 +1,55 @@ +from django.db import models + +from sentry.db.models import ( + BoundedBigIntegerField, + BoundedPositiveIntegerField, + FlexibleForeignKey, + Model, + region_silo_only_model, +) + + +@region_silo_only_model +class ArtifactBundle(Model): + __include_in_export__ = False + + bundle_id = models.CharField(max_length=32, null=True) + organization_id = BoundedBigIntegerField(db_index=True) + # TODO: define max length. + release_name = models.CharField(null=True) + dist_id = BoundedBigIntegerField(null=True) + file = FlexibleForeignKey("sentry.File") + artifact_count = BoundedPositiveIntegerField() + + class Meta: + app_label = "sentry" + db_table = "sentry_artifactbundlefile" + + unique_together = (("organization_id", "bundle_id"),) + index_together = (("organization_id", "release_name"),) + + +@region_silo_only_model +class DebugIdIndex(Model): + __include_in_export__ = False + + debug_id = models.CharField(max_length=32, unique=True, db_index=True) + artifact_bundle = FlexibleForeignKey("sentry.ArtifactBundle") + + class Meta: + app_label = "sentry" + db_table = "sentry_debugidindex" + + +@region_silo_only_model +class ProjectArtifactBundleIndex(Model): + __include_in_export__ = False + + project_id = BoundedBigIntegerField(db_index=True) + artifact_bundle_file = FlexibleForeignKey("sentry.ArtifactBundle") + + class Meta: + app_label = "sentry" + db_table = "sentry_projectartifactbundleindex" + + unique_together = (("project_id", "artifact_bundle"),) From 2e417801e4296c98ca0ab94d38c56b18da26a2aa Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Tue, 14 Feb 2023 15:39:58 +0100 Subject: [PATCH 02/28] Improve models --- src/sentry/models/artifactbundle.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/sentry/models/artifactbundle.py b/src/sentry/models/artifactbundle.py index b98ff7c5e60896..33ba17d932f0c6 100644 --- a/src/sentry/models/artifactbundle.py +++ b/src/sentry/models/artifactbundle.py @@ -7,16 +7,16 @@ Model, region_silo_only_model, ) +from sentry.models import DB_VERSION_LENGTH @region_silo_only_model class ArtifactBundle(Model): __include_in_export__ = False - bundle_id = models.CharField(max_length=32, null=True) + bundle_id = models.UUIDField(null=True) organization_id = BoundedBigIntegerField(db_index=True) - # TODO: define max length. 
- release_name = models.CharField(null=True) + release_name = models.CharField(max_length=DB_VERSION_LENGTH, null=True) dist_id = BoundedBigIntegerField(null=True) file = FlexibleForeignKey("sentry.File") artifact_count = BoundedPositiveIntegerField() @@ -25,31 +25,34 @@ class Meta: app_label = "sentry" db_table = "sentry_artifactbundlefile" - unique_together = (("organization_id", "bundle_id"),) + unique_together = ( + ("organization_id", "bundle_id"), + ("organization_id", "release_name"), + ) index_together = (("organization_id", "release_name"),) @region_silo_only_model -class DebugIdIndex(Model): +class DebugIdArtifactBundle(Model): __include_in_export__ = False - debug_id = models.CharField(max_length=32, unique=True, db_index=True) + debug_id = models.UUIDField(unique=True, db_index=True) artifact_bundle = FlexibleForeignKey("sentry.ArtifactBundle") class Meta: app_label = "sentry" - db_table = "sentry_debugidindex" + db_table = "sentry_debugidartifactbundle" @region_silo_only_model -class ProjectArtifactBundleIndex(Model): +class ProjectArtifactBundle(Model): __include_in_export__ = False project_id = BoundedBigIntegerField(db_index=True) - artifact_bundle_file = FlexibleForeignKey("sentry.ArtifactBundle") + artifact_bundle = FlexibleForeignKey("sentry.ArtifactBundle") class Meta: app_label = "sentry" - db_table = "sentry_projectartifactbundleindex" + db_table = "sentry_projectartifactbundle" unique_together = (("project_id", "artifact_bundle"),) From 64f7794b539478ed51e867f8ff464c3ad26d142a Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 16 Feb 2023 13:52:20 +0100 Subject: [PATCH 03/28] Add date information --- src/sentry/models/artifactbundle.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/sentry/models/artifactbundle.py b/src/sentry/models/artifactbundle.py index 33ba17d932f0c6..54081529d3b312 100644 --- a/src/sentry/models/artifactbundle.py +++ b/src/sentry/models/artifactbundle.py @@ -1,4 +1,5 @@ from django.db import models +from django.utils import timezone from sentry.db.models import ( BoundedBigIntegerField, @@ -29,20 +30,29 @@ class Meta: ("organization_id", "bundle_id"), ("organization_id", "release_name"), ) - index_together = (("organization_id", "release_name"),) @region_silo_only_model class DebugIdArtifactBundle(Model): __include_in_export__ = False - debug_id = models.UUIDField(unique=True, db_index=True) + debug_id = models.UUIDField() artifact_bundle = FlexibleForeignKey("sentry.ArtifactBundle") + upload_date = models.DateTimeField(default=timezone.now) class Meta: app_label = "sentry" db_table = "sentry_debugidartifactbundle" + # We can have the same debug_id pointing to different artifact_bundle(s) because the user might upload + # the same artifacts twice, or they might have certain build files that don't change across builds. 
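+        # For that reason, uniqueness is enforced on the (debug_id, artifact_bundle) pair rather
+        # than on the debug_id alone.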
+ unique_together = ( + ( + "debug_id", + "artifact_bundle", + ), + ) + @region_silo_only_model class ProjectArtifactBundle(Model): From 8bd08e6d57b5d4428037c512877c30cdc1793a62 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Mon, 20 Feb 2023 14:36:53 +0100 Subject: [PATCH 04/28] Change table name --- src/sentry/models/artifactbundle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentry/models/artifactbundle.py b/src/sentry/models/artifactbundle.py index 54081529d3b312..fa1587ab7cc68a 100644 --- a/src/sentry/models/artifactbundle.py +++ b/src/sentry/models/artifactbundle.py @@ -24,7 +24,7 @@ class ArtifactBundle(Model): class Meta: app_label = "sentry" - db_table = "sentry_artifactbundlefile" + db_table = "sentry_artifactbundle" unique_together = ( ("organization_id", "bundle_id"), From 301820529b8b41522900694875415fb43e8c4348 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Mon, 20 Feb 2023 15:49:37 +0100 Subject: [PATCH 05/28] Add new datetime fields --- src/sentry/models/artifactbundle.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sentry/models/artifactbundle.py b/src/sentry/models/artifactbundle.py index fa1587ab7cc68a..bb2c02040cfabc 100644 --- a/src/sentry/models/artifactbundle.py +++ b/src/sentry/models/artifactbundle.py @@ -38,7 +38,8 @@ class DebugIdArtifactBundle(Model): debug_id = models.UUIDField() artifact_bundle = FlexibleForeignKey("sentry.ArtifactBundle") - upload_date = models.DateTimeField(default=timezone.now) + date_added = models.DateTimeField(default=timezone.now) + date_last_accessed = models.DateTimeField(default=timezone.now) class Meta: app_label = "sentry" From 0c2702c7bcb404262b44bf944a3f340773f0e0e7 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Tue, 21 Feb 2023 15:19:17 +0100 Subject: [PATCH 06/28] Update tables --- src/sentry/models/artifactbundle.py | 47 +++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/src/sentry/models/artifactbundle.py b/src/sentry/models/artifactbundle.py index bb2c02040cfabc..1c5f0822d9d273 100644 --- a/src/sentry/models/artifactbundle.py +++ b/src/sentry/models/artifactbundle.py @@ -1,3 +1,5 @@ +from enum import Enum + from django.db import models from django.utils import timezone @@ -11,14 +13,23 @@ from sentry.models import DB_VERSION_LENGTH +class SourceFileType(Enum): + SOURCE = 1 + MINIFIED_SOURCE = 2 + SOURCE_MAP = 3 + INDEXED_RAM_BUNDLE = 4 + + @classmethod + def choices(cls): + return [(key.value, key.name) for key in cls] + + @region_silo_only_model class ArtifactBundle(Model): __include_in_export__ = False - bundle_id = models.UUIDField(null=True) organization_id = BoundedBigIntegerField(db_index=True) - release_name = models.CharField(max_length=DB_VERSION_LENGTH, null=True) - dist_id = BoundedBigIntegerField(null=True) + bundle_id = models.UUIDField(null=True) file = FlexibleForeignKey("sentry.File") artifact_count = BoundedPositiveIntegerField() @@ -26,18 +37,33 @@ class Meta: app_label = "sentry" db_table = "sentry_artifactbundle" - unique_together = ( - ("organization_id", "bundle_id"), - ("organization_id", "release_name"), - ) + unique_together = (("organization_id", "bundle_id"),) + + +@region_silo_only_model +class ReleaseArtifactBundle(Model): + __include_in_export__ = False + + organization_id = BoundedBigIntegerField(db_index=True) + release_name = models.CharField(max_length=DB_VERSION_LENGTH) + dist_id = BoundedBigIntegerField(null=True) + artifact_bundle = 
FlexibleForeignKey("sentry.ArtifactBundle") + + class Meta: + app_label = "sentry" + db_table = "sentry_releaseartifactbundle" + + unique_together = (("organization_id", "release_name", "dist_id", "artifact_bundle"),) @region_silo_only_model class DebugIdArtifactBundle(Model): __include_in_export__ = False + organization_id = BoundedBigIntegerField(db_index=True) debug_id = models.UUIDField() artifact_bundle = FlexibleForeignKey("sentry.ArtifactBundle") + source_file_type = models.IntegerField(choices=SourceFileType.choices()) date_added = models.DateTimeField(default=timezone.now) date_last_accessed = models.DateTimeField(default=timezone.now) @@ -47,12 +73,7 @@ class Meta: # We can have the same debug_id pointing to different artifact_bundle(s) because the user might upload # the same artifacts twice, or they might have certain build files that don't change across builds. - unique_together = ( - ( - "debug_id", - "artifact_bundle", - ), - ) + unique_together = (("debug_id", "artifact_bundle", "source_file_type"),) @region_silo_only_model From 71be64b31591cb2046b0b26cb9690b7711df6851 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 23 Feb 2023 13:14:01 +0100 Subject: [PATCH 07/28] Improve --- src/sentry/lang/javascript/processor.py | 492 +++++++++++------- .../sentry/lang/javascript/test_processor.py | 8 +- .../sentry/lang/javascript/test_sourcemaps.py | 6 +- 3 files changed, 313 insertions(+), 193 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index ec28d017bfd65d..96a9be4c8451a7 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -18,6 +18,7 @@ from django.utils.encoding import force_bytes, force_text from requests.utils import get_encoding_from_headers from symbolic import SourceMapCache as SmCache +from symbolic import SourceView from sentry import features, http, options from sentry.event_manager import set_tag @@ -38,7 +39,7 @@ from sentry.utils.safe import get_path from sentry.utils.urls import non_standard_url_join -from .cache import SourceCache, SourceMapCache +from .cache import SourceMapCache __all__ = ["JavaScriptStacktraceProcessor"] @@ -576,127 +577,146 @@ def fetch_release_artifact(url, release, dist): return result -def fetch_file(url, project=None, release=None, dist=None, allow_scraping=True): - """ - Pull down a URL, returning a UrlResult object. - - Attempts to fetch from the database first (assuming there's a release on the - event), then the internet. Caches the result of each of those two attempts - separately, whether or not those attempts are successful. Used for both - source files and source maps. 
- """ - # If our url has been truncated, it'd be impossible to fetch - # so we check for this early and bail - if url[-3:] == "...": - raise http.CannotFetch({"type": EventError.JS_MISSING_SOURCE, "url": http.expose_url(url)}) - - # if we've got a release to look on, try that first (incl associated cache) - if release: - with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.fetch_file.fetch_release_artifact" - ): - result = fetch_release_artifact(url, release, dist) - else: - result = None +class Fetcher: + def __init__(self, project, release, dist, allow_scraping): + self.project = project + self.release = release + self.dist = dist + self.allow_scraping = allow_scraping - # otherwise, try the web-scraping cache and then the web itself + self.sourcemap_cache = SourceMapCache() - cache_key = f"source:cache:v4:{md5_text(url).hexdigest()}" + def fetch_by_debug_id(self, debug_id, source_file_type): + pass - if result is None: - if not allow_scraping or not url.startswith(("http:", "https:")): - error = {"type": EventError.JS_MISSING_SOURCE, "url": http.expose_url(url)} - raise http.CannotFetch(error) + def fetch_by_url(self, url): + """ + Pull down a URL, returning a UrlResult object. - logger.debug("Checking cache for url %r", url) - result = cache.get(cache_key) - if result is not None: - # Previous caches would be a 3-tuple instead of a 4-tuple, - # so this is being maintained for backwards compatibility - try: - encoding = result[4] - except IndexError: - encoding = None - # We got a cache hit, but the body is compressed, so we - # need to decompress it before handing it off - result = http.UrlResult( - result[0], result[1], zlib.decompress(result[2]), result[3], encoding + Attempts to fetch from the database first (assuming there's a release on the + event), then the internet. Caches the result of each of those two attempts + separately, whether or not those attempts are successful. Used for both + source files and source maps. 
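+
+        Raises http.CannotFetch when the file cannot be retrieved or is unusable: a truncated
+        url, a url that may not be scraped, a response too large to cache, a non-200 status,
+        a body that cannot be coerced to utf8 bytes, or HTML where JavaScript was expected.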
+ """ + # If our url has been truncated, it'd be impossible to fetch + # so we check for this early and bail + if url[-3:] == "...": + raise http.CannotFetch( + {"type": EventError.JS_MISSING_SOURCE, "url": http.expose_url(url)} ) - if result is None: - headers = {} - verify_ssl = False - if project and is_valid_origin(url, project=project): - verify_ssl = bool(project.get_option("sentry:verify_ssl", False)) - token = project.get_option("sentry:token") - if token: - token_header = project.get_option("sentry:token_header") or "X-Sentry-Token" - headers[token_header] = token - - with metrics.timer("sourcemaps.fetch"): - with sentry_sdk.start_span(op="JavaScriptStacktraceProcessor.fetch_file.http"): - result = http.fetch_file(url, headers=headers, verify_ssl=verify_ssl) + # if we've got a release to look on, try that first (incl associated cache) + if self.release: with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.fetch_file.compress_for_cache" + op="JavaScriptStacktraceProcessor.fetch_file.fetch_release_artifact" ): - z_body = zlib.compress(result.body) - cache.set( - cache_key, - (url, result.headers, z_body, result.status, result.encoding), - get_max_age(result.headers), + result = fetch_release_artifact(url, self.release, self.dist) + else: + result = None + + # otherwise, try the web-scraping cache and then the web itself + + cache_key = f"source:cache:v4:{md5_text(url).hexdigest()}" + + if result is None: + if not self.allow_scraping or not url.startswith(("http:", "https:")): + error = {"type": EventError.JS_MISSING_SOURCE, "url": http.expose_url(url)} + raise http.CannotFetch(error) + + logger.debug("Checking cache for url %r", url) + result = cache.get(cache_key) + if result is not None: + # Previous caches would be a 3-tuple instead of a 4-tuple, + # so this is being maintained for backwards compatibility + try: + encoding = result[4] + except IndexError: + encoding = None + # We got a cache hit, but the body is compressed, so we + # need to decompress it before handing it off + result = http.UrlResult( + result[0], result[1], zlib.decompress(result[2]), result[3], encoding + ) + + if result is None: + headers = {} + verify_ssl = False + if self.project and is_valid_origin(url, project=self.project): + verify_ssl = bool(self.project.get_option("sentry:verify_ssl", False)) + token = self.project.get_option("sentry:token") + if token: + token_header = ( + self.project.get_option("sentry:token_header") or "X-Sentry-Token" + ) + headers[token_header] = token + + with metrics.timer("sourcemaps.fetch"): + with sentry_sdk.start_span(op="JavaScriptStacktraceProcessor.fetch_file.http"): + result = http.fetch_file(url, headers=headers, verify_ssl=verify_ssl) + with sentry_sdk.start_span( + op="JavaScriptStacktraceProcessor.fetch_file.compress_for_cache" + ): + z_body = zlib.compress(result.body) + cache.set( + cache_key, + (url, result.headers, z_body, result.status, result.encoding), + get_max_age(result.headers), + ) + + # since the cache.set above can fail we can end up in a situation + # where the file is too large for the cache. In that case we abort + # the fetch and cache a failure and lock the domain for future + # http fetches. + if cache.get(cache_key) is None: + error = { + "type": EventError.TOO_LARGE_FOR_CACHE, + "url": http.expose_url(url), + } + http.lock_domain(url, error=error) + raise http.CannotFetch(error) + + # If we did not get a 200 OK we just raise a cannot fetch here. 
+ if result.status != 200: + raise http.CannotFetch( + { + "type": EventError.FETCH_INVALID_HTTP_CODE, + "value": result.status, + "url": http.expose_url(url), + } ) - # since the cache.set above can fail we can end up in a situation - # where the file is too large for the cache. In that case we abort - # the fetch and cache a failure and lock the domain for future - # http fetches. - if cache.get(cache_key) is None: + # Make sure the file we're getting back is bytes. The only + # reason it'd not be binary would be from old cached blobs, so + # for compatibility with current cached files, let's coerce back to + # binary and say utf8 encoding. + if not isinstance(result.body, bytes): + try: + result = http.UrlResult( + result.url, + result.headers, + result.body.encode("utf8"), + result.status, + result.encoding, + ) + except UnicodeEncodeError: error = { - "type": EventError.TOO_LARGE_FOR_CACHE, + "type": EventError.FETCH_INVALID_ENCODING, + "value": "utf8", "url": http.expose_url(url), } - http.lock_domain(url, error=error) raise http.CannotFetch(error) - # If we did not get a 200 OK we just raise a cannot fetch here. - if result.status != 200: - raise http.CannotFetch( - { - "type": EventError.FETCH_INVALID_HTTP_CODE, - "value": result.status, - "url": http.expose_url(url), - } - ) - - # Make sure the file we're getting back is bytes. The only - # reason it'd not be binary would be from old cached blobs, so - # for compatibility with current cached files, let's coerce back to - # binary and say utf8 encoding. - if not isinstance(result.body, bytes): - try: - result = http.UrlResult( - result.url, - result.headers, - result.body.encode("utf8"), - result.status, - result.encoding, - ) - except UnicodeEncodeError: - error = { - "type": EventError.FETCH_INVALID_ENCODING, - "value": "utf8", - "url": http.expose_url(url), - } + # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML) + # NOTE: possible to have JS files that don't actually end w/ ".js", but + # this should catch 99% of cases + if urlsplit(url).path.endswith(".js") and is_html_response(result): + error = {"type": EventError.JS_INVALID_CONTENT, "url": url} raise http.CannotFetch(error) - # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. 
HTML) - # NOTE: possible to have JS files that don't actually end w/ ".js", but - # this should catch 99% of cases - if urlsplit(url).path.endswith(".js") and is_html_response(result): - error = {"type": EventError.JS_INVALID_CONTENT, "url": url} - raise http.CannotFetch(error) + return result - return result + def _fetch_sourcemap_by_debug_id(self): + pass def is_html_response(result): @@ -737,13 +757,14 @@ def fetch_sourcemap(url, source=b"", project=None, release=None, dist=None, allo op="JavaScriptStacktraceProcessor.fetch_sourcemap.fetch_file" ) as span: span.set_data("url", url) - result = fetch_file( - url, - project=project, - release=release, - dist=dist, - allow_scraping=allow_scraping, - ) + # result = fetch_file( + # url, + # project=project, + # release=release, + # dist=dist, + # allow_scraping=allow_scraping, + # ) + result = None body = result.body try: with sentry_sdk.start_span( @@ -889,7 +910,21 @@ def __init__(self, *args, **kwargs): # cache holding mangled code, original code, and errors associated with # each abs_path in the stacktrace - self.cache = SourceCache() + # self.cache = SourceCache() + self.fetch_by_url_cache = {} + self.fetch_by_url_result = {} + self.fetch_by_url_errors = {} + + self.fetch_by_debug_id_result = {} + self.fetch_by_debug_id_cache = {} + + self.debug_id_sourcemap_cache = {} + self.url_sourcemap_cache = {} + + self.url_aliases = {} + self.minified_source_url_to_sourcemap_url = {} + + self.fetcher = None # cache holding source URLs, corresponding source map URLs, and source map contents self.sourcemaps = SourceMapCache() @@ -920,6 +955,8 @@ def preprocess_step(self, processing_task): date = timestamp and datetime.fromtimestamp(timestamp).replace(tzinfo=timezone.utc) self.dist = self.release.add_dist(self.data["dist"], date) + self.fetcher = Fetcher(self.project, self.release, self.dist, self.allow_scraping) + with sentry_sdk.start_span( op="JavaScriptStacktraceProcessor.preprocess_step.populate_source_cache" ): @@ -967,7 +1004,7 @@ def process_frame(self, processable_frame, processing_task): # `source` is used for pre/post and `context_line` frame expansion. # Here it's pointing to minified source, however the variable can be shadowed with the original sourceview # (or `None` if the token doesnt provide us with the `context_line`) down the road. - source = self.get_sourceview(frame["abs_path"]) + source = self.get_or_fetch_sourceview(frame["abs_path"]) source_context = None in_app = None @@ -1026,7 +1063,7 @@ def process_frame(self, processable_frame, processing_task): if token.context_line is not None: source_context = token.pre_context, token.context_line, token.post_context else: - source = self.get_sourceview(abs_path) + source = self.get_or_fetch_sourceview(abs_path) if source is None: errors = cache.get_errors(abs_path) @@ -1180,7 +1217,7 @@ def expand_frame(self, frame, source_context=None, source=None): return False if source_context is None: - source = source or self.get_sourceview(frame["abs_path"]) + source = source or self.get_or_fetch_sourceview(frame["abs_path"]) if source is None: logger.debug("No source found for %s", frame["abs_path"]) return False @@ -1198,89 +1235,172 @@ def expand_frame(self, frame, source_context=None, source=None): return True - def get_sourceview(self, filename): - if filename not in self.cache: - self.cache_source(filename) - return self.cache.get(filename) + # If we return None it means that we weren't able to find the file. 
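+    # The lookup first goes through the in-memory caches (_get_cached_sourceview) and only then
+    # falls back to an actual fetch by debug_id or url (_fetch_and_cache_sourceview).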
+    def get_or_fetch_sourceview(self, url=None, debug_id=None, source_file_type=None):
+        # We require that artifact bundles with debug ids use embedded sources in the source map.
+        if debug_id is not None and source_file_type != "source":
+            return None
 
-    def cache_source(self, filename):
-        """
-        Look for and (if found) cache a source file and its associated source
-        map (if any).
-        """
+        rv = self._get_cached_sourceview(url, debug_id, source_file_type)
+        if rv is not None:
+            return rv
 
-        sourcemaps = self.sourcemaps
-        cache = self.cache
+        return self._fetch_and_cache_sourceview(url, debug_id, source_file_type)
+
+    def _get_cached_sourceview(self, url=None, debug_id=None, source_file_type=None):
+        if debug_id is not None:
+            rv = self.fetch_by_debug_id_cache.get((debug_id, source_file_type))
+            if rv is not None:
+                return rv
 
+        if url is not None:
+            real_url = self.url_aliases.get(url, url)
+            rv = self.fetch_by_url_cache.get(real_url)
+            if rv is not None:
+                return rv
+
+        return None
+
+    def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type):
         self.fetch_count += 1
 
+        # TODO: this branch should only apply to http fetches.
        if self.fetch_count > self.max_fetches:
-            cache.add_error(filename, {"type": EventError.JS_TOO_MANY_REMOTE_SOURCES})
-            return
+            cache.add_error(url, {"type": EventError.JS_TOO_MANY_REMOTE_SOURCES})
+            return None
 
         # TODO: respect cache-control/max-age headers to some extent
-        logger.debug("Attempting to cache source %r", filename)
-        try:
-            # this both looks in the database and tries to scrape the internet
-            with sentry_sdk.start_span(
-                op="JavaScriptStacktraceProcessor.cache_source.fetch_file"
-            ) as span:
-                span.set_data("filename", filename)
-                result = fetch_file(
-                    filename,
-                    project=self.project,
-                    release=self.release,
-                    dist=self.dist,
-                    allow_scraping=self.allow_scraping,
-                )
-        except http.BadSource as exc:
-            if not fetch_error_should_be_silienced(exc.data, filename):
-                cache.add_error(filename, exc.data)
-            # either way, there's no more for us to do here, since we don't have
-            # a valid file to cache
-            return
 
-        cache.add(filename, result.body, result.encoding)
-        cache.alias(result.url, filename)
+        logger.debug("Attempting to cache source %r", url)
+        # this both looks in the database and tries to scrape the internet
+        with sentry_sdk.start_span(
+            op="JavaScriptStacktraceProcessor.cache_source.fetch_file"
+        ) as span:
+            span.set_data("filename", url)
+            if debug_id is not None:
+                result = self.fetcher.fetch_by_debug_id(debug_id, source_file_type)
+                if result is not None:
+                    # We temporarily store the raw result as well; if we ever compute the
+                    # sourceview on demand instead, this lets us delete the other dictionary.
+                    self.fetch_by_debug_id_result[debug_id, source_file_type] = result
+                    sourceview = SourceView.from_bytes(result.body)
+                    self.fetch_by_debug_id_cache[debug_id, source_file_type] = sourceview
+
+                    return sourceview
 
-        sourcemap_url = discover_sourcemap(result)
-        if not sourcemap_url:
-            return
+        try:
+            result = self.fetcher.fetch_by_url(url)
+        except http.BadSource as exc:
+            if not fetch_error_should_be_silienced(exc.data, url):
+                self.fetch_by_url_errors[url] = exc.data
+            # either way, there's no more for us to do here, since we don't have
+            # a valid file to cache
+            return None
+        else:
+            # TODO: it should do something but we don't know.
+            if result.url != url:
+                self.url_aliases[result.url] = url
+
+            # We temporarily store the raw result as well; if we ever compute the
+            # sourceview on demand instead, this lets us delete the other dictionary.
+ self.fetch_by_url_result[url] = result + sourceview = SourceView.from_bytes(result.body) + self.fetch_by_url_cache[url] = sourceview + + sourcemap_url = discover_sourcemap(result) + if sourcemap_url: + self.minified_source_url_to_sourcemap_url[url] = sourcemap_url + + return sourceview + + # get_or_fetch_sourceview("test.js") + # get_or_fetch_sourcemap("test.js") + # + # The url is of the minified file. + # If this function returns None, it means that we didn't find the sourcemap, maybe because the file at url isn't + # a minified file. + def get_or_fetch_sourcemap(self, url=None, debug_id=None): + rv = self._get_cached_sourcemap(url, debug_id) + if rv is not None: + return rv + + return self._fetch_and_cache_sourcemap(url, debug_id) + + def _get_cached_sourcemap(self, url, debug_id): + if debug_id is not None: + rv = self.debug_id_sourcemap_cache.get(debug_id) + if rv is not None: + return rv + + if url is not None: + real_url = self.url_aliases.get(url, url) + sourcemap_url = self.minified_source_url_to_sourcemap_url.get(real_url) + rv = self.url_sourcemap_cache.get(sourcemap_url) + if rv is not None: + return rv - logger.debug( - "Found sourcemap URL %r for minified script %r", sourcemap_url[:256], result.url - ) - sourcemaps.link(filename, sourcemap_url) - if sourcemap_url in sourcemaps: - return + return None - # pull down sourcemap - try: - with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.cache_source.fetch_sourcemap" - ) as span: - span.set_data("sourcemap_url", sourcemap_url) - sourcemap_view = fetch_sourcemap( - sourcemap_url, - source=result.body, - project=self.project, - release=self.release, - dist=self.dist, - allow_scraping=self.allow_scraping, + def _fetch_and_cache_sourcemap(self, url, debug_id): + if debug_id is not None: + + minified_result = self.fetch_by_debug_id_result.get(debug_id) + if minified_result is None: + return None + + # TODO: use enum for source file type. + result = self.fetcher.fetch_by_debug_id(debug_id, "sourcemap") + if result is not None: + self.debug_id_sourcemap_cache = SmCache.from_bytes( + result.body, minified_result.body ) - except http.BadSource as exc: - # we don't perform the same check here as above, because if someone has - # uploaded a node_modules file, which has a sourceMappingURL, they - # presumably would like it mapped (and would like to know why it's not - # working, if that's the case). If they're not looking for it to be - # mapped, then they shouldn't be uploading the source file in the - # first place. - cache.add_error(filename, exc.data) - return + return result - with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.cache_source.cache_sourcemap_view" - ) as span: - sourcemaps.add(sourcemap_url, sourcemap_view) + if url is not None: + minified_result = self.fetch_by_url_result.get(url) + if minified_result is None: + return None + + sourcemap_url = self.minified_source_url_to_sourcemap_url.get(url) + if sourcemap_url is None: + return None + + # TODO: find a way to get the original url of the request. 
+ # logger.debug( + # "Found sourcemap URL %r for minified script %r", sourcemap_url[:256], url + # ) + self.sourcemaps.link(url, sourcemap_url) + if sourcemap_url in self.sourcemaps: + return + + # pull down sourcemap + try: + with sentry_sdk.start_span( + op="JavaScriptStacktraceProcessor.cache_source.fetch_sourcemap" + ) as span: + span.set_data("sourcemap_url", sourcemap_url) + sourcemap_view = fetch_sourcemap( + sourcemap_url, + # How do we get the body of the response if we stored the sourceview in the code. + source=minified_result.body, + project=self.project, + release=self.release, + dist=self.dist, + allow_scraping=self.allow_scraping, + ) + except http.BadSource as exc: + # we don't perform the same check here as above, because if someone has + # uploaded a node_modules file, which has a sourceMappingURL, they + # presumably would like it mapped (and would like to know why it's not + # working, if that's the case). If they're not looking for it to be + # mapped, then they shouldn't be uploading the source file in the + # first place. + cache.add_error(url, exc.data) + return + + with sentry_sdk.start_span( + op="JavaScriptStacktraceProcessor.cache_source.cache_sourcemap_view" + ): + self.sourcemaps.add(sourcemap_url, sourcemap_view) def populate_source_cache(self, frames): """ @@ -1308,7 +1428,7 @@ def populate_source_cache(self, frames): op="JavaScriptStacktraceProcessor.populate_source_cache.cache_source" ) as span: span.set_data("filename", filename) - self.cache_source(filename=filename) + self._fetch_and_cache_sourceview(filename=filename) def close(self): StacktraceProcessor.close(self) diff --git a/tests/sentry/lang/javascript/test_processor.py b/tests/sentry/lang/javascript/test_processor.py index 08611de646d90d..d29a0101204f9e 100644 --- a/tests/sentry/lang/javascript/test_processor.py +++ b/tests/sentry/lang/javascript/test_processor.py @@ -1453,7 +1453,7 @@ def test_file_no_source_records_error(self): # before caching, no errors assert len(processor.cache.get_errors(abs_path)) == 0 - processor.cache_source(abs_path) + processor._fetch_and_cache_sourceview(abs_path) # now we have an error assert len(processor.cache.get_errors(abs_path)) == 1 @@ -1474,7 +1474,7 @@ def test_node_modules_file_no_source_no_error(self): # not a real url, so won't find file on the internet abs_path = "app:///../node_modules/i/dont/exist.js" - processor.cache_source(abs_path) + processor._fetch_and_cache_sourceview(abs_path) # no errors, even though the file can't have been found assert len(processor.cache.get_errors(abs_path)) == 0 @@ -1499,7 +1499,7 @@ def test_node_modules_file_with_source_is_used(self): # but since this is just a unit test, we have to set it manually processor.release = release - processor.cache_source(abs_path) + processor._fetch_and_cache_sourceview(abs_path) # file is cached, no errors are generated assert processor.cache.get(abs_path) @@ -1532,7 +1532,7 @@ def test_node_modules_file_with_source_but_no_map_records_error(self, mock_disco # before caching, no errors assert len(processor.cache.get_errors(abs_path)) == 0 - processor.cache_source(abs_path) + processor._fetch_and_cache_sourceview(abs_path) # now we have an error assert len(processor.cache.get_errors(abs_path)) == 1 diff --git a/tests/sentry/lang/javascript/test_sourcemaps.py b/tests/sentry/lang/javascript/test_sourcemaps.py index a5b343ad8a0f8a..2d167db82421b3 100644 --- a/tests/sentry/lang/javascript/test_sourcemaps.py +++ b/tests/sentry/lang/javascript/test_sourcemaps.py @@ -102,16 +102,16 @@ def 
test_no_inline(self): # basic sourcemap fixture has no inlined sources, so expect None smap_view = SourceMapView.from_json_bytes(sourcemap) - source = smap_view.get_sourceview(0) + source = smap_view.get_or_fetch_sourceview(0) assert source is None def test_indexed_inline(self): smap_view = SourceMapView.from_json_bytes(indexed_sourcemap_example) - assert smap_view.get_sourceview(0).get_source() == ( + assert smap_view.get_or_fetch_sourceview(0).get_source() == ( " ONE.foo = function (bar) {\n" + " return baz(bar);\n" + " };" ) - assert smap_view.get_sourceview(1).get_source() == ( + assert smap_view.get_or_fetch_sourceview(1).get_source() == ( " TWO.inc = function (n) {\n" + " return n + 1;\n" + " };" ) From 2312970b7bfb37cbd12d2c0436deee1f19606885 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 23 Feb 2023 13:29:26 +0100 Subject: [PATCH 08/28] Improve --- src/sentry/lang/javascript/processor.py | 157 +++++++++++------------- 1 file changed, 72 insertions(+), 85 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index 96a9be4c8451a7..852ac7028c54d1 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -742,42 +742,6 @@ def get_max_age(headers): return min(max_age, CACHE_CONTROL_MAX) -def fetch_sourcemap(url, source=b"", project=None, release=None, dist=None, allow_scraping=True): - if is_data_uri(url): - try: - body = base64.b64decode( - force_bytes(url[BASE64_PREAMBLE_LENGTH:]) - + (b"=" * (-(len(url) - BASE64_PREAMBLE_LENGTH) % 4)) - ) - except TypeError as e: - raise UnparseableSourcemap({"url": "", "reason": str(e)}) - else: - # look in the database and, if not found, optionally try to scrape the web - with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.fetch_sourcemap.fetch_file" - ) as span: - span.set_data("url", url) - # result = fetch_file( - # url, - # project=project, - # release=release, - # dist=dist, - # allow_scraping=allow_scraping, - # ) - result = None - body = result.body - try: - with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.fetch_sourcemap.SmCache.from_bytes" - ): - return SmCache.from_bytes(source, body) - - except Exception as exc: - # This is in debug because the product shows an error already. - logger.debug(str(exc), exc_info=True) - raise UnparseableSourcemap({"url": http.expose_url(url)}) - - def is_data_uri(url): return url[:BASE64_PREAMBLE_LENGTH] == BASE64_SOURCEMAP_PREAMBLE @@ -1342,65 +1306,88 @@ def _get_cached_sourcemap(self, url, debug_id): def _fetch_and_cache_sourcemap(self, url, debug_id): if debug_id is not None: + sourcemap_view = self._handle_debug_id_sourcemap_lookup(debug_id) + if sourcemap_view is not None: + return sourcemap_view - minified_result = self.fetch_by_debug_id_result.get(debug_id) - if minified_result is None: - return None + if url is not None: + sourcemap_view = self._handle_url_sourcemap_lookup(url) + if sourcemap_view is not None: + return sourcemap_view - # TODO: use enum for source file type. 
- result = self.fetcher.fetch_by_debug_id(debug_id, "sourcemap") - if result is not None: - self.debug_id_sourcemap_cache = SmCache.from_bytes( - result.body, minified_result.body - ) - return result + return None - if url is not None: - minified_result = self.fetch_by_url_result.get(url) - if minified_result is None: - return None + def _handle_debug_id_sourcemap_lookup(self, debug_id): + minified_result = self.fetch_by_debug_id_result.get(debug_id) + if minified_result is None: + return None - sourcemap_url = self.minified_source_url_to_sourcemap_url.get(url) - if sourcemap_url is None: - return None + # TODO: use enum for source file type. + result = self.fetcher.fetch_by_debug_id(debug_id, "sourcemap") + if result is not None: + sourcemap_view = SmCache.from_bytes(minified_result.body, result.body) + self.debug_id_sourcemap_cache[debug_id] = sourcemap_view + return sourcemap_view + + def _handle_url_sourcemap_lookup(self, url): + minified_result = self.fetch_by_url_result.get(url) + if minified_result is None: + return None + + sourcemap_url = self.minified_source_url_to_sourcemap_url.get(url) + if sourcemap_url is None: + return None - # TODO: find a way to get the original url of the request. - # logger.debug( - # "Found sourcemap URL %r for minified script %r", sourcemap_url[:256], url - # ) - self.sourcemaps.link(url, sourcemap_url) - if sourcemap_url in self.sourcemaps: - return + try: + with sentry_sdk.start_span( + op="JavaScriptStacktraceProcessor.cache_source.fetch_sourcemap" + ) as span: + span.set_data("sourcemap_url", sourcemap_url) + sourcemap_view = self._fetch_sourcemap(sourcemap_url, source=minified_result.body) + except http.BadSource as exc: + # we don't perform the same check here as above, because if someone has + # uploaded a node_modules file, which has a sourceMappingURL, they + # presumably would like it mapped (and would like to know why it's not + # working, if that's the case). If they're not looking for it to be + # mapped, then they shouldn't be uploading the source file in the + # first place. + self.fetch_by_url_errors[url] = exc.data + return None + else: + with sentry_sdk.start_span( + op="JavaScriptStacktraceProcessor.cache_source.cache_sourcemap_view" + ): + self.url_sourcemap_cache[sourcemap_url] = sourcemap_view + return sourcemap_view - # pull down sourcemap + def _fetch_sourcemap(self, url, source=b""): + if is_data_uri(url): try: - with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.cache_source.fetch_sourcemap" - ) as span: - span.set_data("sourcemap_url", sourcemap_url) - sourcemap_view = fetch_sourcemap( - sourcemap_url, - # How do we get the body of the response if we stored the sourceview in the code. - source=minified_result.body, - project=self.project, - release=self.release, - dist=self.dist, - allow_scraping=self.allow_scraping, - ) - except http.BadSource as exc: - # we don't perform the same check here as above, because if someone has - # uploaded a node_modules file, which has a sourceMappingURL, they - # presumably would like it mapped (and would like to know why it's not - # working, if that's the case). If they're not looking for it to be - # mapped, then they shouldn't be uploading the source file in the - # first place. 
- cache.add_error(url, exc.data) - return + body = base64.b64decode( + force_bytes(url[BASE64_PREAMBLE_LENGTH:]) + + (b"=" * (-(len(url) - BASE64_PREAMBLE_LENGTH) % 4)) + ) + except TypeError as e: + raise UnparseableSourcemap({"url": "", "reason": str(e)}) + else: + # look in the database and, if not found, optionally try to scrape the web + with sentry_sdk.start_span( + op="JavaScriptStacktraceProcessor.fetch_sourcemap.fetch_file" + ) as span: + span.set_data("url", url) + result = self.fetcher.fetch_by_url(url) + body = result.body + try: with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.cache_source.cache_sourcemap_view" + op="JavaScriptStacktraceProcessor.fetch_sourcemap.SmCache.from_bytes" ): - self.sourcemaps.add(sourcemap_url, sourcemap_view) + return SmCache.from_bytes(source, body) + + except Exception as exc: + # This is in debug because the product shows an error already. + logger.debug(str(exc), exc_info=True) + raise UnparseableSourcemap({"url": http.expose_url(url)}) def populate_source_cache(self, frames): """ From 43cd8dd487233b35e491efc8d636af26fb5f0192 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 23 Feb 2023 13:53:20 +0100 Subject: [PATCH 09/28] Improve --- src/sentry/lang/javascript/processor.py | 76 ++++++++++--------- .../lang/javascript/test_plugin.py | 18 ++--- 2 files changed, 48 insertions(+), 46 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index 852ac7028c54d1..6b3c5bb730e991 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -610,6 +610,7 @@ def fetch_by_url(self, url): with sentry_sdk.start_span( op="JavaScriptStacktraceProcessor.fetch_file.fetch_release_artifact" ): + # TODO: move this method internally in the class. result = fetch_release_artifact(url, self.release, self.dist) else: result = None @@ -875,15 +876,13 @@ def __init__(self, *args, **kwargs): # cache holding mangled code, original code, and errors associated with # each abs_path in the stacktrace # self.cache = SourceCache() - self.fetch_by_url_cache = {} self.fetch_by_url_result = {} self.fetch_by_url_errors = {} self.fetch_by_debug_id_result = {} - self.fetch_by_debug_id_cache = {} self.debug_id_sourcemap_cache = {} - self.url_sourcemap_cache = {} + self.sourcemap_url_sourcemap_cache = {} self.url_aliases = {} self.minified_source_url_to_sourcemap_url = {} @@ -924,7 +923,7 @@ def preprocess_step(self, processing_task): with sentry_sdk.start_span( op="JavaScriptStacktraceProcessor.preprocess_step.populate_source_cache" ): - self.populate_source_cache(frames) + self.populate_sources_cache(frames) return True @@ -944,8 +943,6 @@ def process_frame(self, processable_frame, processing_task): frame = processable_frame.frame token = None - cache = self.cache - sourcemaps = self.sourcemaps all_errors = [] sourcemap_applied = False @@ -968,21 +965,25 @@ def process_frame(self, processable_frame, processing_task): # `source` is used for pre/post and `context_line` frame expansion. # Here it's pointing to minified source, however the variable can be shadowed with the original sourceview # (or `None` if the token doesnt provide us with the `context_line`) down the road. - source = self.get_or_fetch_sourceview(frame["abs_path"]) + # TODO: add support for debug ids. + # We fetch the sourcemap url for the given source. 
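+            # Fetching the sourceview also discovers the sourcemap url of the minified file and
+            # records it, which is what lets the get_or_fetch_sourcemap call below resolve it.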
+ sourceview = self.get_or_fetch_sourceview(url=frame["abs_path"]) + sourcemap_view = self.get_or_fetch_sourcemap(url=frame["abs_path"]) + + sourcemap_url = self.minified_source_url_to_sourcemap_url.get(frame["abs_path"]) + self.sourcemaps_touched.add(sourcemap_url) + source_context = None in_app = None new_frame = dict(frame) raw_frame = dict(frame) - sourcemap_url, sourcemap_cache = sourcemaps.get_link(frame["abs_path"]) - self.sourcemaps_touched.add(sourcemap_url) - - if sourcemap_cache and frame.get("colno") is None: + if sourcemap_view and frame.get("colno") is None: all_errors.append( {"type": EventError.JS_NO_COLUMN, "url": http.expose_url(frame["abs_path"])} ) - elif sourcemap_cache: + elif sourcemap_view: if is_data_uri(sourcemap_url): sourcemap_label = frame["abs_path"] else: @@ -993,7 +994,7 @@ def process_frame(self, processable_frame, processing_task): try: # Errors are 1-indexed in the frames. assert frame["lineno"] > 0, "line numbers are 1-indexed" - token = sourcemap_cache.lookup(frame["lineno"], frame["colno"], LINES_OF_CONTEXT) + token = sourcemap_view.lookup(frame["lineno"], frame["colno"], LINES_OF_CONTEXT) except Exception: token = None all_errors.append( @@ -1027,9 +1028,9 @@ def process_frame(self, processable_frame, processing_task): if token.context_line is not None: source_context = token.pre_context, token.context_line, token.post_context else: - source = self.get_or_fetch_sourceview(abs_path) + sourceview = self.get_or_fetch_sourceview(url=abs_path) - if source is None: + if sourceview is None: errors = cache.get_errors(abs_path) if errors: all_errors.extend(errors) @@ -1102,11 +1103,13 @@ def process_frame(self, processable_frame, processing_task): new_frame.get("data") or {}, sourcemap=http.expose_url(sourcemap_url) ) - changed_frame = self.expand_frame(new_frame, source_context=source_context, source=source) + changed_frame = self.expand_frame( + new_frame, source_context=source_context, source=sourceview + ) # If we did not manage to match but we do have a line or column # we want to report an error here. - if not new_frame.get("context_line") and source and new_frame.get("colno") is not None: + if not new_frame.get("context_line") and sourceview and new_frame.get("colno") is not None: all_errors.append( { "type": EventError.JS_INVALID_SOURCEMAP_LOCATION, @@ -1142,6 +1145,7 @@ def process_frame(self, processable_frame, processing_task): self.tag_suspected_console_errors(new_frames) except Exception as exc: logger.exception("Failed to tag suspected console errors", exc_info=exc) + return new_frames, raw_frames, all_errors def tag_suspected_console_errors(self, new_frames): @@ -1181,7 +1185,8 @@ def expand_frame(self, frame, source_context=None, source=None): return False if source_context is None: - source = source or self.get_or_fetch_sourceview(frame["abs_path"]) + # TODO: add lookup by debug id in case abs_path has it. 
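+            # Until that happens, frame expansion falls back to the url-keyed sourceview.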
+ source = source or self.get_or_fetch_sourceview(url=frame["abs_path"]) if source is None: logger.debug("No source found for %s", frame["abs_path"]) return False @@ -1213,15 +1218,17 @@ def get_or_fetch_sourceview(self, url=None, debug_id=None, source_file_type=None def _get_cached_sourceview(self, url=None, debug_id=None, source_file_type=None): if debug_id is not None: - rv = self.fetch_by_debug_id_cache.get((debug_id, source_file_type)) - if rv is not None: - return rv + source_result = self.fetch_by_debug_id_result.get((debug_id, source_file_type)) + if source_result is not None: + sourceview = SourceView.from_bytes(source_result.body) + return sourceview if url is not None: real_url = self.url_aliases.get(url, url) - rv = self.fetch_by_url_cache.get(real_url) - if rv is not None: - return rv + source_result = self.fetch_by_url_result.get(real_url) + if source_result is not None: + sourceview = SourceView.from_bytes(source_result.body) + return sourceview return None @@ -1247,8 +1254,6 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): # we can delete the other dictionary. self.fetch_by_debug_id_result[debug_id, source_file_type] = result sourceview = SourceView.from_bytes(result.body) - self.fetch_by_debug_id_cache[debug_id, source_file_type] = sourceview - return sourceview try: @@ -1268,17 +1273,15 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): # we can delete the other dictionary. self.fetch_by_url_result[url] = result sourceview = SourceView.from_bytes(result.body) - self.fetch_by_url_cache[url] = sourceview + # TODO: the discovery of sourcemap urls here is very opaque, consider of putting it closer to the + # function's callsite. sourcemap_url = discover_sourcemap(result) if sourcemap_url: self.minified_source_url_to_sourcemap_url[url] = sourcemap_url return sourceview - # get_or_fetch_sourceview("test.js") - # get_or_fetch_sourcemap("test.js") - # # The url is of the minified file. # If this function returns None, it means that we didn't find the sourcemap, maybe because the file at url isn't # a minified file. @@ -1298,7 +1301,7 @@ def _get_cached_sourcemap(self, url, debug_id): if url is not None: real_url = self.url_aliases.get(url, url) sourcemap_url = self.minified_source_url_to_sourcemap_url.get(real_url) - rv = self.url_sourcemap_cache.get(sourcemap_url) + rv = self.sourcemap_url_sourcemap_cache.get(sourcemap_url) if rv is not None: return rv @@ -1357,7 +1360,7 @@ def _handle_url_sourcemap_lookup(self, url): with sentry_sdk.start_span( op="JavaScriptStacktraceProcessor.cache_source.cache_sourcemap_view" ): - self.url_sourcemap_cache[sourcemap_url] = sourcemap_view + self.sourcemap_url_sourcemap_cache[sourcemap_url] = sourcemap_view return sourcemap_view def _fetch_sourcemap(self, url, source=b""): @@ -1389,7 +1392,7 @@ def _fetch_sourcemap(self, url, source=b""): logger.debug(str(exc), exc_info=True) raise UnparseableSourcemap({"url": http.expose_url(url)}) - def populate_source_cache(self, frames): + def populate_sources_cache(self, frames): """ Fetch all sources that we know are required (being referenced directly in frames). @@ -1415,7 +1418,12 @@ def populate_source_cache(self, frames): op="JavaScriptStacktraceProcessor.populate_source_cache.cache_source" ) as span: span.set_data("filename", filename) - self._fetch_and_cache_sourceview(filename=filename) + # TODO: we want to implement the construction of the debug + # We first want to fetch the sourceview of the file. 
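+                # As a side effect this also records the sourceMappingURL of minified files.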
+ self.get_or_fetch_sourceview(url=filename) + # We then want to fetch the sourcemap of the file, but this call can fail in case filename points to a + # non minified file. + self.get_or_fetch_sourcemap(url=filename) def close(self): StacktraceProcessor.close(self) diff --git a/tests/relay_integration/lang/javascript/test_plugin.py b/tests/relay_integration/lang/javascript/test_plugin.py index 76488093a8806e..08b4991cf8c54c 100644 --- a/tests/relay_integration/lang/javascript/test_plugin.py +++ b/tests/relay_integration/lang/javascript/test_plugin.py @@ -177,9 +177,9 @@ def test_source_expansion(self, mock_fetch_file): # no source map means no raw_stacktrace assert exception.values[0].raw_stacktrace is None - @patch("sentry.lang.javascript.processor.fetch_file") + @patch("sentry.lang.javascript.processor.Fetcher.fetch_by_url") @patch("sentry.lang.javascript.processor.discover_sourcemap") - def test_inlined_sources(self, mock_discover_sourcemap, mock_fetch_file): + def test_inlined_sources(self, mock_discover_sourcemap, mock_fetch_by_url): data = { "timestamp": self.min_ago, "message": "hello", @@ -205,19 +205,13 @@ def test_inlined_sources(self, mock_discover_sourcemap, mock_fetch_file): mock_discover_sourcemap.return_value = BASE64_SOURCEMAP - mock_fetch_file.return_value.url = "http://example.com/test.min.js" - mock_fetch_file.return_value.body = force_bytes("\n".join("")) - mock_fetch_file.return_value.encoding = None + mock_fetch_by_url.return_value.url = "http://example.com/test.min.js" + mock_fetch_by_url.return_value.body = force_bytes("\n".join("")) + mock_fetch_by_url.return_value.encoding = None event = self.post_and_retrieve_event(data) - mock_fetch_file.assert_called_once_with( - "http://example.com/test.min.js", - project=self.project, - release=None, - dist=None, - allow_scraping=True, - ) + mock_fetch_by_url.assert_called_once_with("http://example.com/test.min.js") exception = event.interfaces["exception"] frame_list = exception.values[0].stacktrace.frames From 595ddd9d96e0062d39a46043db9ae707984fc986 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 23 Feb 2023 14:02:47 +0100 Subject: [PATCH 10/28] Improve --- src/sentry/lang/javascript/processor.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index 6b3c5bb730e991..d37f05b9fc9efd 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -873,25 +873,24 @@ def __init__(self, *args, **kwargs): self.fetch_count = 0 self.sourcemaps_touched = set() - # cache holding mangled code, original code, and errors associated with - # each abs_path in the stacktrace - # self.cache = SourceCache() + # Cache holding the results of the fetching by url. self.fetch_by_url_result = {} self.fetch_by_url_errors = {} + # Cache holding the results of the fetching by debug id. self.fetch_by_debug_id_result = {} + # Cache holding the sourcemaps views indexed by either debug id or sourcemap url. self.debug_id_sourcemap_cache = {} self.sourcemap_url_sourcemap_cache = {} + # Additional caches for mapping purposes. self.url_aliases = {} self.minified_source_url_to_sourcemap_url = {} + # Component responsible for fetching the files. 
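        # It is instantiated in preprocess_step, once the event's release and dist are known.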
self.fetcher = None - # cache holding source URLs, corresponding source map URLs, and source map contents - self.sourcemaps = SourceMapCache() - self.release = None self.dist = None @@ -958,8 +957,8 @@ def process_frame(self, processable_frame, processing_task): ): return - errors = cache.get_errors(frame["abs_path"]) - if errors: + errors = self.fetch_by_url_errors.get(frame["abs_path"]) + if errors is not None: all_errors.extend(errors) # `source` is used for pre/post and `context_line` frame expansion. @@ -1031,7 +1030,7 @@ def process_frame(self, processable_frame, processing_task): sourceview = self.get_or_fetch_sourceview(url=abs_path) if sourceview is None: - errors = cache.get_errors(abs_path) + errors = self.fetch_by_url_errors[abs_path] if errors: all_errors.extend(errors) else: @@ -1260,7 +1259,8 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): result = self.fetcher.fetch_by_url(url) except http.BadSource as exc: if not fetch_error_should_be_silienced(exc.data, url): - self.fetch_by_url_errors[url] = exc.data + self.fetch_by_url_errors.setdefault(url, []) + self.fetch_by_url_errors[url].append(exc.data) # either way, there's no more for us to do here, since we don't have # a valid file to cache return None @@ -1354,7 +1354,8 @@ def _handle_url_sourcemap_lookup(self, url): # working, if that's the case). If they're not looking for it to be # mapped, then they shouldn't be uploading the source file in the # first place. - self.fetch_by_url_errors[url] = exc.data + self.fetch_by_url_errors.setdefault(url, []) + self.fetch_by_url_errors[url].append(exc.data) return None else: with sentry_sdk.start_span( From 6a6ff675989731667756b2ad575bfdc04a1dec2a Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 23 Feb 2023 14:07:09 +0100 Subject: [PATCH 11/28] Improve --- .../lang/javascript/test_plugin.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/tests/relay_integration/lang/javascript/test_plugin.py b/tests/relay_integration/lang/javascript/test_plugin.py index 08b4991cf8c54c..6dc0e8d3c1f17e 100644 --- a/tests/relay_integration/lang/javascript/test_plugin.py +++ b/tests/relay_integration/lang/javascript/test_plugin.py @@ -116,8 +116,8 @@ def test_adds_contexts_with_ps4_device(self): "brand": "Sony", } - @patch("sentry.lang.javascript.processor.fetch_file") - def test_source_expansion(self, mock_fetch_file): + @patch("sentry.lang.javascript.processor.Fetcher.fetch_by_url") + def test_source_expansion(self, mock_fetch_by_url): data = { "timestamp": self.min_ago, "message": "hello", @@ -147,19 +147,13 @@ def test_source_expansion(self, mock_fetch_file): }, } - mock_fetch_file.return_value.body = force_bytes("\n".join("hello world")) - mock_fetch_file.return_value.encoding = None - mock_fetch_file.return_value.headers = {} + mock_fetch_by_url.return_value.body = force_bytes("\n".join("hello world")) + mock_fetch_by_url.return_value.encoding = None + mock_fetch_by_url.return_value.headers = {} event = self.post_and_retrieve_event(data) - mock_fetch_file.assert_called_once_with( - "http://example.com/foo.js", - project=self.project, - release=None, - dist=None, - allow_scraping=True, - ) + mock_fetch_by_url.assert_called_once_with("http://example.com/foo.js") exception = event.interfaces["exception"] frame_list = exception.values[0].stacktrace.frames From 5a2db201021c0ad3ba11e48f7933d218565fec52 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 23 Feb 2023 14:37:49 +0100 Subject: [PATCH 12/28] Improve --- 
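The `fetch_by_debug_id` and `_fetch_sourcemap_by_debug_id` methods below are still left as TODO
stubs. For context, here is a minimal sketch of the lookup they will eventually have to perform
against the tables introduced in the earlier commits of this series; the helper name and the
"most recent upload wins" policy are illustrative assumptions, not something this patch
implements:

    from sentry.models.artifactbundle import DebugIdArtifactBundle

    def resolve_artifact_bundle(organization_id, debug_id, source_file_type):
        # source_file_type is a SourceFileType enum member; the column stores its integer value.
        # A debug_id may map to several bundles (re-uploads, build files shared across builds),
        # so prefer the most recently uploaded one.
        entry = (
            DebugIdArtifactBundle.objects.filter(
                organization_id=organization_id,
                debug_id=debug_id,
                source_file_type=source_file_type.value,
            )
            .select_related("artifact_bundle__file")
            .order_by("-date_added")
            .first()
        )
        return entry.artifact_bundle if entry is not None else None

On a successful lookup, `date_last_accessed` would presumably be bumped here as well, which is
what that column from the earlier commit is for.
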
src/sentry/lang/javascript/processor.py | 374 ++++++++++++------------ 1 file changed, 195 insertions(+), 179 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index d37f05b9fc9efd..a2a88fd5076910 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -39,8 +39,6 @@ from sentry.utils.safe import get_path from sentry.utils.urls import non_standard_url_join -from .cache import SourceMapCache - __all__ = ["JavaScriptStacktraceProcessor"] @@ -577,16 +575,139 @@ def fetch_release_artifact(url, release, dist): return result +def is_html_response(result): + # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<'). + # This cannot parse as valid JS/JSON. + # NOTE: not relying on Content-Type header because apps often don't set this correctly + # Discard leading whitespace (often found before doctype) + body_start = result.body[:20].lstrip() + + if body_start[:1] == b"<": + return True + return False + + +def get_max_age(headers): + cache_control = headers.get("cache-control") + max_age = CACHE_CONTROL_MIN + + if cache_control: + match = CACHE_CONTROL_RE.search(cache_control) + if match: + max_age = max(CACHE_CONTROL_MIN, int(match.group(1))) + return min(max_age, CACHE_CONTROL_MAX) + + +def is_data_uri(url): + return url[:BASE64_PREAMBLE_LENGTH] == BASE64_SOURCEMAP_PREAMBLE + + +def generate_module(src): + """ + Converts a url into a made-up module name by doing the following: + * Extract just the path name ignoring querystrings + * Trimming off the initial / + * Trimming off the file extension + * Removes off useless folder prefixes + + e.g. http://google.com/js/v1.0/foo/bar/baz.js -> foo/bar/baz + """ + if not src: + return UNKNOWN_MODULE + + filename, ext = splitext(urlsplit(src).path) + if filename.endswith(".min"): + filename = filename[:-4] + + # TODO(dcramer): replace CLEAN_MODULE_RE with tokenizer completely + tokens = filename.split("/") + for idx, token in enumerate(tokens): + # a SHA + if VERSION_RE.match(token): + return "/".join(tokens[idx + 1 :]) + + return CLEAN_MODULE_RE.sub("", filename) or UNKNOWN_MODULE + + +def is_valid_frame(frame): + return frame is not None and frame.get("lineno") is not None + + +def get_function_for_token(frame, token, previous_frame=None): + """ + Get function name for a given frame based on the token resolved by symbolic. + It tries following paths in order: + - return token function name if we have a usable value (filtered through `USELESS_FN_NAMES` list), + - return mapped name of the caller (previous frame) token if it had, + - return token function name, including filtered values if it mapped to anything in the first place, + - return current frames function name as a fallback + """ + + frame_function_name = frame.get("function") + token_function_name = token.function_name + + # Try to use the function name we got from sourcemap-cache, filtering useless names. + if token_function_name not in USELESS_FN_NAMES: + return token_function_name + + # If not found, ask the callsite (previous token) for function name if possible. + if previous_frame is not None: + last_token = previous_frame.data.get("token") + if last_token is not None and last_token.name not in ("", None): + return last_token.name + + # If there was no minified name at all, return even useless, filtered one from the original token. + if not frame_function_name: + return token_function_name + + # Otherwise fallback to the old, minified name. 
+ return frame_function_name + + +def fold_function_name(function_name): + """ + Fold multiple consecutive occurences of the same property name into a single group, excluding the last component. + + foo | foo + foo.foo | foo.foo + foo.foo.foo | {foo#2}.foo + bar.foo.foo | bar.foo.foo + bar.foo.foo.foo | bar.{foo#2}.foo + bar.foo.foo.onError | bar.{foo#2}.onError + bar.bar.bar.foo.foo.onError | {bar#3}.{foo#2}.onError + bar.foo.foo.bar.bar.onError | bar.{foo#2}.{bar#2}.onError + """ + + parts = function_name.split(".") + + if len(parts) == 1: + return function_name + + tail = parts.pop() + grouped = [list(g) for _, g in groupby(parts)] + + def format_groups(p): + if len(p) == 1: + return p[0] + return f"\u007b{p[0]}#{len(p)}\u007d" + + return f'{".".join(map(format_groups, grouped))}.{tail}' + + class Fetcher: + """ + Components responsible for fetching files either by url or debug_id that aims at hiding the complexity involved + in file fetching. + """ + def __init__(self, project, release, dist, allow_scraping): self.project = project self.release = release self.dist = dist self.allow_scraping = allow_scraping - self.sourcemap_cache = SourceMapCache() - def fetch_by_debug_id(self, debug_id, source_file_type): + # TODO: implement debug id lookup. pass def fetch_by_url(self, url): @@ -717,128 +838,10 @@ def fetch_by_url(self, url): return result def _fetch_sourcemap_by_debug_id(self): + # TODO: implement debug id lookup. pass -def is_html_response(result): - # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<'). - # This cannot parse as valid JS/JSON. - # NOTE: not relying on Content-Type header because apps often don't set this correctly - # Discard leading whitespace (often found before doctype) - body_start = result.body[:20].lstrip() - - if body_start[:1] == b"<": - return True - return False - - -def get_max_age(headers): - cache_control = headers.get("cache-control") - max_age = CACHE_CONTROL_MIN - - if cache_control: - match = CACHE_CONTROL_RE.search(cache_control) - if match: - max_age = max(CACHE_CONTROL_MIN, int(match.group(1))) - return min(max_age, CACHE_CONTROL_MAX) - - -def is_data_uri(url): - return url[:BASE64_PREAMBLE_LENGTH] == BASE64_SOURCEMAP_PREAMBLE - - -def generate_module(src): - """ - Converts a url into a made-up module name by doing the following: - * Extract just the path name ignoring querystrings - * Trimming off the initial / - * Trimming off the file extension - * Removes off useless folder prefixes - - e.g. http://google.com/js/v1.0/foo/bar/baz.js -> foo/bar/baz - """ - if not src: - return UNKNOWN_MODULE - - filename, ext = splitext(urlsplit(src).path) - if filename.endswith(".min"): - filename = filename[:-4] - - # TODO(dcramer): replace CLEAN_MODULE_RE with tokenizer completely - tokens = filename.split("/") - for idx, token in enumerate(tokens): - # a SHA - if VERSION_RE.match(token): - return "/".join(tokens[idx + 1 :]) - - return CLEAN_MODULE_RE.sub("", filename) or UNKNOWN_MODULE - - -def is_valid_frame(frame): - return frame is not None and frame.get("lineno") is not None - - -def get_function_for_token(frame, token, previous_frame=None): - """ - Get function name for a given frame based on the token resolved by symbolic. 
- It tries following paths in order: - - return token function name if we have a usable value (filtered through `USELESS_FN_NAMES` list), - - return mapped name of the caller (previous frame) token if it had, - - return token function name, including filtered values if it mapped to anything in the first place, - - return current frames function name as a fallback - """ - - frame_function_name = frame.get("function") - token_function_name = token.function_name - - # Try to use the function name we got from sourcemap-cache, filtering useless names. - if token_function_name not in USELESS_FN_NAMES: - return token_function_name - - # If not found, ask the callsite (previous token) for function name if possible. - if previous_frame is not None: - last_token = previous_frame.data.get("token") - if last_token is not None and last_token.name not in ("", None): - return last_token.name - - # If there was no minified name at all, return even useless, filtered one from the original token. - if not frame_function_name: - return token_function_name - - # Otherwise fallback to the old, minified name. - return frame_function_name - - -def fold_function_name(function_name): - """ - Fold multiple consecutive occurences of the same property name into a single group, excluding the last component. - - foo | foo - foo.foo | foo.foo - foo.foo.foo | {foo#2}.foo - bar.foo.foo | bar.foo.foo - bar.foo.foo.foo | bar.{foo#2}.foo - bar.foo.foo.onError | bar.{foo#2}.onError - bar.bar.bar.foo.foo.onError | {bar#3}.{foo#2}.onError - bar.foo.foo.bar.bar.onError | bar.{foo#2}.{bar#2}.onError - """ - - parts = function_name.split(".") - - if len(parts) == 1: - return function_name - - tail = parts.pop() - grouped = [list(g) for _, g in groupby(parts)] - - def format_groups(p): - if len(p) == 1: - return p[0] - return f"\u007b{p[0]}#{len(p)}\u007d" - - return f'{".".join(map(format_groups, grouped))}.{tail}' - - class JavaScriptStacktraceProcessor(StacktraceProcessor): """ Modern SourceMap processor using symbolic-sourcemapcache. @@ -884,8 +887,11 @@ def __init__(self, *args, **kwargs): self.debug_id_sourcemap_cache = {} self.sourcemap_url_sourcemap_cache = {} - # Additional caches for mapping purposes. + # Contains a mapping between the 'result.url' of the request and the file url from 'abs_path'. self.url_aliases = {} + + # Contains a mapping between the minified source url ('abs_path' of the frame) and the sourcemap url + # which is discovered by looking at the minified source. self.minified_source_url_to_sourcemap_url = {} # Component responsible for fetching the files. @@ -961,14 +967,14 @@ def process_frame(self, processable_frame, processing_task): if errors is not None: all_errors.extend(errors) - # `source` is used for pre/post and `context_line` frame expansion. - # Here it's pointing to minified source, however the variable can be shadowed with the original sourceview + # `sourceview` is used for pre/post and `context_line` frame expansion. + # Here it's pointing to the minified source, however the variable can be shadowed with the original sourceview # (or `None` if the token doesnt provide us with the `context_line`) down the road. # TODO: add support for debug ids. - # We fetch the sourcemap url for the given source. 
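The two dictionaries documented above chain together: a frame's `abs_path` maps to the sourcemap URL discovered while fetching the minified source, and that URL in turn keys the cached sourcemap view. A sketch of the lookup with hypothetical entries:

    # abs_path of the minified source -> discovered sourcemap url
    minified_source_url_to_sourcemap_url = {
        "http://example.com/app.min.js": "http://example.com/app.min.js.map",
    }
    # sourcemap url -> parsed sourcemap view
    sourcemap_url_sourcemap_cache = {}

    def cached_sourcemap_view(abs_path):
        sourcemap_url = minified_source_url_to_sourcemap_url.get(abs_path)
        # an unknown abs_path yields sourcemap_url=None, which is simply a miss
        return sourcemap_url_sourcemap_cache.get(sourcemap_url)
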
sourceview = self.get_or_fetch_sourceview(url=frame["abs_path"]) - sourcemap_view = self.get_or_fetch_sourcemap(url=frame["abs_path"]) + sourcemap_view = self.get_or_fetch_sourcemap_view(url=frame["abs_path"]) + # We get the url that we discovered for the sourcemap which we use down in the processing pipeline. sourcemap_url = self.minified_source_url_to_sourcemap_url.get(frame["abs_path"]) self.sourcemaps_touched.add(sourcemap_url) @@ -1203,9 +1209,14 @@ def expand_frame(self, frame, source_context=None, source=None): return True - # If we return None it means that we weren't able to find the file. def get_or_fetch_sourceview(self, url=None, debug_id=None, source_file_type=None): + """ + Gets from the cache or fetches the file at 'url' or 'debug_id'. + + Returns None when a file with either 'url' or 'debug_id' cannot be found. + """ # We require that artifact bundles with debug ids used embedded source in the source map. + # TODO: use enum for source file type when available. if debug_id is not None and source_file_type != "source": return None @@ -1239,14 +1250,12 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): cache.add_error(url, {"type": EventError.JS_TOO_MANY_REMOTE_SOURCES}) return None - # TODO: respect cache-control/max-age headers to some extent - logger.debug("Attempting to cache source %r", url) - # this both looks in the database and tries to scrape the internet - with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.cache_source.fetch_file" - ) as span: - span.set_data("filename", url) - if debug_id is not None: + if debug_id is not None: + logger.debug("Attempting to cache source with debug id %r", debug_id) + with sentry_sdk.start_span( + op="JavaScriptStacktraceProcessor._fetch_and_cache_sourceview.fetch_by_debug_id" + ) as span: + span.set_data("debug_id", debug_id) result = self.fetcher.fetch_by_debug_id(debug_id, source_file_type) if result is not None: # We temporarily store the result directly, in case we want to compute the sourceview on demand @@ -1255,44 +1264,51 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): sourceview = SourceView.from_bytes(result.body) return sourceview - try: + try: + logger.debug("Attempting to cache source with url %r", url) + with sentry_sdk.start_span( + op="JavaScriptStacktraceProcessor._fetch_and_cache_sourceview.fetch_by_url" + ) as span: + span.set_data("url", url) result = self.fetcher.fetch_by_url(url) - except http.BadSource as exc: - if not fetch_error_should_be_silienced(exc.data, url): - self.fetch_by_url_errors.setdefault(url, []) - self.fetch_by_url_errors[url].append(exc.data) - # either way, there's no more for us to do here, since we don't have - # a valid file to cache - return None - else: - # TODO: it should do something but we don't know. - if result.url != url: - self.url_aliases[result.url] = url + except http.BadSource as exc: + if not fetch_error_should_be_silienced(exc.data, url): + self.fetch_by_url_errors.setdefault(url, []) + self.fetch_by_url_errors[url].append(exc.data) + # either way, there's no more for us to do here, since we don't have + # a valid file to cache + return None + else: + # TODO: it should do something but we don't know. + if result.url != url: + self.url_aliases[result.url] = url - # We temporarily store the result directly, in case we want to compute the sourceview on demand - # we can delete the other dictionary. 
- self.fetch_by_url_result[url] = result - sourceview = SourceView.from_bytes(result.body) + # We temporarily store the result directly, in case we want to compute the sourceview on demand + # we can delete the other dictionary. + self.fetch_by_url_result[url] = result + sourceview = SourceView.from_bytes(result.body) - # TODO: the discovery of sourcemap urls here is very opaque, consider of putting it closer to the - # function's callsite. - sourcemap_url = discover_sourcemap(result) - if sourcemap_url: - self.minified_source_url_to_sourcemap_url[url] = sourcemap_url + # TODO: the discovery of sourcemap urls here is very opaque, consider of putting it closer to the + # function's callsite. + sourcemap_url = discover_sourcemap(result) + if sourcemap_url: + self.minified_source_url_to_sourcemap_url[url] = sourcemap_url - return sourceview + return sourceview + + def get_or_fetch_sourcemap_view(self, url=None, debug_id=None): + """ + Gets from the cache or fetches the sourcemap view of the file at 'url' or 'debug_id'. - # The url is of the minified file. - # If this function returns None, it means that we didn't find the sourcemap, maybe because the file at url isn't - # a minified file. - def get_or_fetch_sourcemap(self, url=None, debug_id=None): - rv = self._get_cached_sourcemap(url, debug_id) + Returns None when a sourcemap corresponding to the file at 'url' is not found. + """ + rv = self._get_cached_sourcemap_view(url, debug_id) if rv is not None: return rv - return self._fetch_and_cache_sourcemap(url, debug_id) + return self._fetch_and_cache_sourcemap_view(url, debug_id) - def _get_cached_sourcemap(self, url, debug_id): + def _get_cached_sourcemap_view(self, url, debug_id): if debug_id is not None: rv = self.debug_id_sourcemap_cache.get(debug_id) if rv is not None: @@ -1307,7 +1323,7 @@ def _get_cached_sourcemap(self, url, debug_id): return None - def _fetch_and_cache_sourcemap(self, url, debug_id): + def _fetch_and_cache_sourcemap_view(self, url, debug_id): if debug_id is not None: sourcemap_view = self._handle_debug_id_sourcemap_lookup(debug_id) if sourcemap_view is not None: @@ -1325,7 +1341,7 @@ def _handle_debug_id_sourcemap_lookup(self, debug_id): if minified_result is None: return None - # TODO: use enum for source file type. + # TODO: use enum for source file type when available. 
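For context, the shape of the symbolic API these lookups feed: a sourcemap cache is built from the minified source plus the sourcemap bytes, then queried per frame. A hedged sketch (file paths are hypothetical, and the import is an assumption based on the `SmCache` alias used throughout this module):

    from symbolic import SourceMapCache as SmCache

    with open("app.min.js", "rb") as f:
        minified = f.read()
    with open("app.min.js.map", "rb") as f:
        sourcemap = f.read()

    smcache = SmCache.from_bytes(minified, sourcemap)
    try:
        # lineno, colno, lines of context; the processor wraps this call in
        # try/except because a lookup over malformed input can fail
        token = smcache.lookup(1, 39, 5)
    except Exception:
        token = None
    if token is not None:
        print(token.src, token.line, token.col, token.function_name)
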
result = self.fetcher.fetch_by_debug_id(debug_id, "sourcemap") if result is not None: sourcemap_view = SmCache.from_bytes(minified_result.body, result.body) @@ -1343,10 +1359,13 @@ def _handle_url_sourcemap_lookup(self, url): try: with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.cache_source.fetch_sourcemap" + op="JavaScriptStacktraceProcessor.handle_url_sourcemap_lookup.fetch_sourcemap_view_by_url" ) as span: + span.set_data("url", url) span.set_data("sourcemap_url", sourcemap_url) - sourcemap_view = self._fetch_sourcemap(sourcemap_url, source=minified_result.body) + sourcemap_view = self._fetch_sourcemap_view_by_url( + sourcemap_url, source=minified_result.body + ) except http.BadSource as exc: # we don't perform the same check here as above, because if someone has # uploaded a node_modules file, which has a sourceMappingURL, they @@ -1358,13 +1377,10 @@ def _handle_url_sourcemap_lookup(self, url): self.fetch_by_url_errors[url].append(exc.data) return None else: - with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.cache_source.cache_sourcemap_view" - ): - self.sourcemap_url_sourcemap_cache[sourcemap_url] = sourcemap_view - return sourcemap_view + self.sourcemap_url_sourcemap_cache[sourcemap_url] = sourcemap_view + return sourcemap_view - def _fetch_sourcemap(self, url, source=b""): + def _fetch_sourcemap_view_by_url(self, url, source=b""): if is_data_uri(url): try: body = base64.b64decode( @@ -1376,7 +1392,7 @@ def _fetch_sourcemap(self, url, source=b""): else: # look in the database and, if not found, optionally try to scrape the web with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.fetch_sourcemap.fetch_file" + op="JavaScriptStacktraceProcessor.fetch_sourcemap_view_by_url.fetch_by_url" ) as span: span.set_data("url", url) result = self.fetcher.fetch_by_url(url) @@ -1384,7 +1400,7 @@ def _fetch_sourcemap(self, url, source=b""): body = result.body try: with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor.fetch_sourcemap.SmCache.from_bytes" + op="JavaScriptStacktraceProcessor.fetch_sourcemap_view_by_url.SmCache.from_bytes" ): return SmCache.from_bytes(source, body) @@ -1424,7 +1440,7 @@ def populate_sources_cache(self, frames): self.get_or_fetch_sourceview(url=filename) # We then want to fetch the sourcemap of the file, but this call can fail in case filename points to a # non minified file. - self.get_or_fetch_sourcemap(url=filename) + self.get_or_fetch_sourcemap_view(url=filename) def close(self): StacktraceProcessor.close(self) From bca29af343d04657dc2e609bdb0eceaaf7612026 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 23 Feb 2023 15:20:27 +0100 Subject: [PATCH 13/28] Fix tests --- src/sentry/lang/javascript/processor.py | 20 ++-- .../sentry/lang/javascript/test_processor.py | 101 +++++++++++------- 2 files changed, 77 insertions(+), 44 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index a2a88fd5076910..d0b9e82e1716ff 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -700,7 +700,7 @@ class Fetcher: in file fetching. 
""" - def __init__(self, project, release, dist, allow_scraping): + def __init__(self, project=None, release=None, dist=None, allow_scraping=True): self.project = project self.release = release self.dist = dist @@ -923,15 +923,21 @@ def preprocess_step(self, processing_task): date = timestamp and datetime.fromtimestamp(timestamp).replace(tzinfo=timezone.utc) self.dist = self.release.add_dist(self.data["dist"], date) - self.fetcher = Fetcher(self.project, self.release, self.dist, self.allow_scraping) + # We initialize the fetcher during preprocessing, so that we can start fetching files during source cache + # population. + self._initialize_fetcher() with sentry_sdk.start_span( op="JavaScriptStacktraceProcessor.preprocess_step.populate_source_cache" ): - self.populate_sources_cache(frames) + self.populate_source_cache(frames) return True + def _initialize_fetcher(self): + # We initialize the fetcher once all data has been loaded. + self.fetcher = Fetcher(self.project, self.release, self.dist, self.allow_scraping) + def handles_frame(self, frame, stacktrace_info): platform = frame.get("platform") or self.data.get("platform") return platform in ("javascript", "node") @@ -1273,8 +1279,7 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): result = self.fetcher.fetch_by_url(url) except http.BadSource as exc: if not fetch_error_should_be_silienced(exc.data, url): - self.fetch_by_url_errors.setdefault(url, []) - self.fetch_by_url_errors[url].append(exc.data) + self.fetch_by_url_errors.setdefault(url, []).append(exc.data) # either way, there's no more for us to do here, since we don't have # a valid file to cache return None @@ -1373,8 +1378,7 @@ def _handle_url_sourcemap_lookup(self, url): # working, if that's the case). If they're not looking for it to be # mapped, then they shouldn't be uploading the source file in the # first place. - self.fetch_by_url_errors.setdefault(url, []) - self.fetch_by_url_errors[url].append(exc.data) + self.fetch_by_url_errors.setdefault(url, []).append(exc.data) return None else: self.sourcemap_url_sourcemap_cache[sourcemap_url] = sourcemap_view @@ -1409,7 +1413,7 @@ def _fetch_sourcemap_view_by_url(self, url, source=b""): logger.debug(str(exc), exc_info=True) raise UnparseableSourcemap({"url": http.expose_url(url)}) - def populate_sources_cache(self, frames): + def populate_source_cache(self, frames): """ Fetch all sources that we know are required (being referenced directly in frames). 
diff --git a/tests/sentry/lang/javascript/test_processor.py b/tests/sentry/lang/javascript/test_processor.py index d29a0101204f9e..e5dcec82bc56bd 100644 --- a/tests/sentry/lang/javascript/test_processor.py +++ b/tests/sentry/lang/javascript/test_processor.py @@ -16,14 +16,13 @@ from sentry.lang.javascript.processor import ( CACHE_CONTROL_MAX, CACHE_CONTROL_MIN, + Fetcher, JavaScriptStacktraceProcessor, UnparseableSourcemap, cache, discover_sourcemap, - fetch_file, fetch_release_archive_for_url, fetch_release_file, - fetch_sourcemap, fold_function_name, generate_module, get_function_for_token, @@ -556,7 +555,7 @@ def test_simple(self): responses.GET, "http://example.com", body="foo bar", content_type="application/json" ) - result = fetch_file("http://example.com") + result = Fetcher().fetch_by_url("http://example.com") assert len(responses.calls) == 1 @@ -565,7 +564,7 @@ def test_simple(self): assert result.headers == {"content-type": "application/json"} # ensure we use the cached result - result2 = fetch_file("http://example.com") + result2 = Fetcher().fetch_by_url("http://example.com") assert len(responses.calls) == 1 @@ -594,7 +593,7 @@ def test_with_token(self): self.project.update_option("sentry:token_header", header_name_option_value) url = f"http://example.com/{i}/" - result = fetch_file(url, project=self.project) + result = Fetcher(project=self.project).fetch_by_url(url) assert result.url == url assert result.body == b"foo bar" @@ -608,20 +607,20 @@ def test_connection_failure(self): responses.add(responses.GET, "http://example.com", body=RequestException()) with pytest.raises(http.BadSource): - fetch_file("http://example.com") + Fetcher().fetch_by_url("http://example.com") assert len(responses.calls) == 1 # ensure we use the cached domain-wide failure for the second call with pytest.raises(http.BadSource): - fetch_file("http://example.com/foo/bar") + Fetcher().fetch_by_url("http://example.com/foo/bar") assert len(responses.calls) == 1 @responses.activate def test_non_url_without_release(self): with pytest.raises(http.BadSource): - fetch_file("/example.js") + Fetcher().fetch_by_url("/example.js") @responses.activate @patch("sentry.lang.javascript.processor.fetch_release_file") @@ -634,7 +633,7 @@ def test_non_url_with_release(self, mock_fetch_release_file): release = Release.objects.create(version="1", organization_id=self.project.organization_id) release.add_project(self.project) - result = fetch_file("/example.js", release=release) + result = Fetcher(release=release).fetch_by_url("/example.js") assert result.url == "/example.js" assert result.body == b"foo" assert isinstance(result.body, bytes) @@ -670,13 +669,13 @@ def test_non_url_with_release_archive(self): # Attempt to fetch nonexisting with pytest.raises(http.BadSource): - fetch_file("does-not-exist.js", release=release) + Fetcher(release=release).fetch_by_url("does-not-exist.js") # Attempt to fetch nonexsting again (to check if cache works) with pytest.raises(http.BadSource): - result = fetch_file("does-not-exist.js", release=release) + result = Fetcher(release=release).fetch_by_url("does-not-exist.js") - result = fetch_file("/example.js", release=release) + result = Fetcher(release=release).fetch_by_url("/example.js") assert result.url == "/example.js" assert result.body == b"foo" assert isinstance(result.body, bytes) @@ -684,7 +683,7 @@ def test_non_url_with_release_archive(self): assert result.encoding == "utf-8" # Make sure cache loading works: - result2 = fetch_file("/example.js", release=release) + result2 = 
Fetcher(release=release).fetch_by_url("/example.js") assert result2 == result def _create_archive(self, release, url): @@ -842,7 +841,7 @@ def test_unicode_body(self): content_type="application/json; charset=utf-8", ) - result = fetch_file("http://example.com") + result = Fetcher().fetch_by_url("http://example.com") assert len(responses.calls) == 1 @@ -852,7 +851,7 @@ def test_unicode_body(self): assert result.encoding == "utf-8" # ensure we use the cached result - result2 = fetch_file("http://example.com") + result2 = Fetcher().fetch_by_url("http://example.com") assert len(responses.calls) == 1 @@ -878,7 +877,7 @@ def cache_get(key): ) with pytest.raises(http.CannotFetch) as exc: - fetch_file("http://example.com") + Fetcher().fetch_by_url("http://example.com") assert exc.value.data["type"] == EventError.TOO_LARGE_FOR_CACHE @@ -891,7 +890,7 @@ def cache_get(key): def test_truncated(self): url = truncatechars("http://example.com", 3) with pytest.raises(http.CannotFetch) as exc: - fetch_file(url) + Fetcher().fetch_by_url(url) assert exc.value.data["type"] == EventError.JS_MISSING_SOURCE assert exc.value.data["url"] == url @@ -1158,7 +1157,11 @@ def test_dedupe_properties(self): class FetchSourcemapTest(TestCase): def test_simple_base64(self): - smap_view = fetch_sourcemap(base64_sourcemap) + processor = JavaScriptStacktraceProcessor( + data={}, stacktrace_infos=None, project=self.create_project() + ) + processor._initialize_fetcher() + smap_view = processor._fetch_sourcemap_view_by_url(base64_sourcemap) token = smap_view.lookup(1, 1, 0) assert token.src == "/test.js" @@ -1167,7 +1170,11 @@ def test_simple_base64(self): assert token.context_line == 'console.log("hello, World!")' def test_base64_without_padding(self): - smap_view = fetch_sourcemap(base64_sourcemap.rstrip("=")) + processor = JavaScriptStacktraceProcessor( + data={}, stacktrace_infos=None, project=self.create_project() + ) + processor._initialize_fetcher() + smap_view = processor._fetch_sourcemap_view_by_url(base64_sourcemap.rstrip("=")) token = smap_view.lookup(1, 1, 0) assert token.src == "/test.js" @@ -1177,7 +1184,11 @@ def test_base64_without_padding(self): def test_broken_base64(self): with pytest.raises(UnparseableSourcemap): - fetch_sourcemap("data:application/json;base64,xxx") + processor = JavaScriptStacktraceProcessor( + data={}, stacktrace_infos=None, project=self.create_project() + ) + processor._initialize_fetcher() + processor._fetch_sourcemap_view_by_url("data:application/json;base64,xxx") @responses.activate def test_garbage_json(self): @@ -1186,7 +1197,11 @@ def test_garbage_json(self): ) with pytest.raises(UnparseableSourcemap): - fetch_sourcemap("http://example.com") + processor = JavaScriptStacktraceProcessor( + data={}, stacktrace_infos=None, project=self.create_project() + ) + processor._initialize_fetcher() + processor._fetch_sourcemap_view_by_url("http://example.com") class TrimLineTest(unittest.TestCase): @@ -1443,6 +1458,7 @@ def test_file_no_source_records_error(self): project = self.create_project() processor = JavaScriptStacktraceProcessor(data={}, stacktrace_infos=None, project=project) + processor._initialize_fetcher() # no release on the event, so won't find file in database assert processor.release is None @@ -1451,13 +1467,16 @@ def test_file_no_source_records_error(self): abs_path = "app:///i/dont/exist.js" # before caching, no errors - assert len(processor.cache.get_errors(abs_path)) == 0 + assert len(processor.fetch_by_url_errors.get(abs_path, [])) == 0 - 
processor._fetch_and_cache_sourceview(abs_path) + processor.get_or_fetch_sourceview(url=abs_path) # now we have an error - assert len(processor.cache.get_errors(abs_path)) == 1 - assert processor.cache.get_errors(abs_path)[0] == {"url": abs_path, "type": "js_no_source"} + assert len(processor.fetch_by_url_errors.get(abs_path, [])) == 1 + assert processor.fetch_by_url_errors.get(abs_path, [])[0] == { + "url": abs_path, + "type": "js_no_source", + } def test_node_modules_file_no_source_no_error(self): """ @@ -1467,6 +1486,8 @@ def test_node_modules_file_no_source_no_error(self): project = self.create_project() processor = JavaScriptStacktraceProcessor(data={}, stacktrace_infos=None, project=project) + # We need to initialize the fetcher so that it can capture the necessary context to execute file fetching. + processor._initialize_fetcher() # no release on the event, so won't find file in database assert processor.release is None @@ -1474,10 +1495,10 @@ def test_node_modules_file_no_source_no_error(self): # not a real url, so won't find file on the internet abs_path = "app:///../node_modules/i/dont/exist.js" - processor._fetch_and_cache_sourceview(abs_path) + processor.get_or_fetch_sourceview(url=abs_path) # no errors, even though the file can't have been found - assert len(processor.cache.get_errors(abs_path)) == 0 + assert len(processor.fetch_by_url_errors.get(abs_path, [])) == 0 def test_node_modules_file_with_source_is_used(self): """ @@ -1498,12 +1519,14 @@ def test_node_modules_file_with_source_is_used(self): # dictionary passed to the JavaScriptStacktraceProcessor constructor, # but since this is just a unit test, we have to set it manually processor.release = release + # We need to initialize the fetcher so that it can capture the necessary context to execute file fetching. + processor._initialize_fetcher() - processor._fetch_and_cache_sourceview(abs_path) - - # file is cached, no errors are generated - assert processor.cache.get(abs_path) - assert len(processor.cache.get_errors(abs_path)) == 0 + # We found the source view. + assert processor.get_or_fetch_sourceview(url=abs_path) + # Source view exists in cache. + assert processor.fetch_by_url_result.get(abs_path) + assert len(processor.fetch_by_url_errors.get(abs_path, [])) == 0 @patch("sentry.lang.javascript.processor.discover_sourcemap") def test_node_modules_file_with_source_but_no_map_records_error(self, mock_discover_sourcemap): @@ -1528,12 +1551,18 @@ def test_node_modules_file_with_source_but_no_map_records_error(self, mock_disco # dictionary passed to the JavaScriptStacktraceProcessor constructor, # but since this is just a unit test, we have to set it manually processor.release = release + # We need to initialize the fetcher so that it can capture the necessary context to execute file fetching. 
+ processor._initialize_fetcher() # before caching, no errors - assert len(processor.cache.get_errors(abs_path)) == 0 + assert len(processor.fetch_by_url_errors.get(abs_path, [])) == 0 - processor._fetch_and_cache_sourceview(abs_path) + processor.get_or_fetch_sourceview(url=abs_path) + processor.get_or_fetch_sourcemap_view(url=abs_path) # now we have an error - assert len(processor.cache.get_errors(abs_path)) == 1 - assert processor.cache.get_errors(abs_path)[0] == {"url": map_url, "type": "js_no_source"} + assert len(processor.fetch_by_url_errors.get(abs_path, [])) == 1 + assert processor.fetch_by_url_errors.get(abs_path, [])[0] == { + "url": map_url, + "type": "js_no_source", + } From 6929724a0804c23b6e71cf230fb85c785ae7b893 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 23 Feb 2023 16:10:20 +0100 Subject: [PATCH 14/28] Fix tests --- src/sentry/lang/javascript/processor.py | 19 ++++++++++--------- .../sentry/lang/javascript/test_sourcemaps.py | 6 +++--- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index d0b9e82e1716ff..d4fedfc2f06a41 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -876,6 +876,10 @@ def __init__(self, *args, **kwargs): self.fetch_count = 0 self.sourcemaps_touched = set() + # All the following dictionaries have been defined top level for simplicity reasons. Because this code will + # be ported to Symbolicator we wanted to keep it as simple and explicit as possible. This comment also + # applies to the many repetitions across the code. + # Cache holding the results of the fetching by url. self.fetch_by_url_result = {} self.fetch_by_url_errors = {} @@ -952,7 +956,6 @@ def process_frame(self, processable_frame, processing_task): Attempt to demangle the given frame. """ frame = processable_frame.frame - token = None all_errors = [] sourcemap_applied = False @@ -1036,6 +1039,7 @@ def process_frame(self, processable_frame, processing_task): "Mapping compressed source %r to mapping in %r", frame["abs_path"], abs_path ) + # This 'if' should always evaluate to True when debug_id is used. if token.context_line is not None: source_context = token.pre_context, token.context_line, token.post_context else: @@ -1253,13 +1257,15 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): # TODO: this branch should only apply to http fetches. 
if self.fetch_count > self.max_fetches: - cache.add_error(url, {"type": EventError.JS_TOO_MANY_REMOTE_SOURCES}) + self.fetch_by_url_errors.setdefault(url, []).append( + {"type": EventError.JS_TOO_MANY_REMOTE_SOURCES} + ) return None if debug_id is not None: logger.debug("Attempting to cache source with debug id %r", debug_id) with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor._fetch_and_cache_sourceview.fetch_by_debug_id" + op="JavaScriptStacktraceProcessor.fetch_and_cache_sourceview.fetch_by_debug_id" ) as span: span.set_data("debug_id", debug_id) result = self.fetcher.fetch_by_debug_id(debug_id, source_file_type) @@ -1273,7 +1279,7 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): try: logger.debug("Attempting to cache source with url %r", url) with sentry_sdk.start_span( - op="JavaScriptStacktraceProcessor._fetch_and_cache_sourceview.fetch_by_url" + op="JavaScriptStacktraceProcessor.fetch_and_cache_sourceview.fetch_by_url" ) as span: span.set_data("url", url) result = self.fetcher.fetch_by_url(url) @@ -1293,8 +1299,6 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): self.fetch_by_url_result[url] = result sourceview = SourceView.from_bytes(result.body) - # TODO: the discovery of sourcemap urls here is very opaque, consider of putting it closer to the - # function's callsite. sourcemap_url = discover_sourcemap(result) if sourcemap_url: self.minified_source_url_to_sourcemap_url[url] = sourcemap_url @@ -1442,9 +1446,6 @@ def populate_source_cache(self, frames): # TODO: we want to implement the construction of the debug # We first want to fetch the sourceview of the file. self.get_or_fetch_sourceview(url=filename) - # We then want to fetch the sourcemap of the file, but this call can fail in case filename points to a - # non minified file. 
- self.get_or_fetch_sourcemap_view(url=filename) def close(self): StacktraceProcessor.close(self) diff --git a/tests/sentry/lang/javascript/test_sourcemaps.py b/tests/sentry/lang/javascript/test_sourcemaps.py index 2d167db82421b3..a5b343ad8a0f8a 100644 --- a/tests/sentry/lang/javascript/test_sourcemaps.py +++ b/tests/sentry/lang/javascript/test_sourcemaps.py @@ -102,16 +102,16 @@ def test_no_inline(self): # basic sourcemap fixture has no inlined sources, so expect None smap_view = SourceMapView.from_json_bytes(sourcemap) - source = smap_view.get_or_fetch_sourceview(0) + source = smap_view.get_sourceview(0) assert source is None def test_indexed_inline(self): smap_view = SourceMapView.from_json_bytes(indexed_sourcemap_example) - assert smap_view.get_or_fetch_sourceview(0).get_source() == ( + assert smap_view.get_sourceview(0).get_source() == ( " ONE.foo = function (bar) {\n" + " return baz(bar);\n" + " };" ) - assert smap_view.get_or_fetch_sourceview(1).get_source() == ( + assert smap_view.get_sourceview(1).get_source() == ( " TWO.inc = function (n) {\n" + " return n + 1;\n" + " };" ) From d12a6d11d4514ee112ce3148778a1b1b6d7adf3e Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Fri, 24 Feb 2023 09:25:10 +0100 Subject: [PATCH 15/28] Fix errors --- src/sentry/lang/javascript/processor.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index d4fedfc2f06a41..30f2c4098c1a37 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -972,17 +972,19 @@ def process_frame(self, processable_frame, processing_task): ): return - errors = self.fetch_by_url_errors.get(frame["abs_path"]) - if errors is not None: - all_errors.extend(errors) - - # `sourceview` is used for pre/post and `context_line` frame expansion. + # The 'sourceview' is used for pre/post and 'context_line' frame expansion. # Here it's pointing to the minified source, however the variable can be shadowed with the original sourceview - # (or `None` if the token doesnt provide us with the `context_line`) down the road. + # (or 'None' if the token doesnt provide us with the 'context_line') down the road. # TODO: add support for debug ids. sourceview = self.get_or_fetch_sourceview(url=frame["abs_path"]) sourcemap_view = self.get_or_fetch_sourcemap_view(url=frame["abs_path"]) + # We check for errors once both the minified file and the corresponding sourcemaps are loaded. In case errors + # happen in one of the two we will detect them and add them to 'all_errors'. + errors = self.fetch_by_url_errors.get(frame["abs_path"]) + if errors is not None: + all_errors.extend(errors) + # We get the url that we discovered for the sourcemap which we use down in the processing pipeline. 
sourcemap_url = self.minified_source_url_to_sourcemap_url.get(frame["abs_path"]) self.sourcemaps_touched.add(sourcemap_url) From db55adeb67765716d5cf8f4cb5795e1e0f136c19 Mon Sep 17 00:00:00 2001 From: Armin Ronacher Date: Fri, 24 Feb 2023 11:14:17 +0100 Subject: [PATCH 16/28] ref: Small refactor for the fetcher --- src/sentry/lang/javascript/processor.py | 33 +++++++++--------- .../sentry/lang/javascript/test_processor.py | 34 +++++++------------ 2 files changed, 29 insertions(+), 38 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index 30f2c4098c1a37..09f9fb7f09adf7 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -706,6 +706,11 @@ def __init__(self, project=None, release=None, dist=None, allow_scraping=True): self.dist = dist self.allow_scraping = allow_scraping + def bind_release(self, release=None, dist=None): + """Updates the fetcher with a project, release and dist.""" + self.release = release + self.dist = dist + def fetch_by_debug_id(self, debug_id, source_file_type): # TODO: implement debug id lookup. pass @@ -869,9 +874,6 @@ def __init__(self, *args, **kwargs): self.organization = organization self.max_fetches = MAX_RESOURCE_FETCHES - self.allow_scraping = organization.get_option( - "sentry:scrape_javascript", True - ) is not False and self.project.get_option("sentry:scrape_javascript", True) self.fetch_count = 0 self.sourcemaps_touched = set() @@ -899,10 +901,10 @@ def __init__(self, *args, **kwargs): self.minified_source_url_to_sourcemap_url = {} # Component responsible for fetching the files. - self.fetcher = None - - self.release = None - self.dist = None + self.fetcher = Fetcher( + allow_scraping=organization.get_option("sentry:scrape_javascript", True) is not False + and self.project.get_option("sentry:scrape_javascript", True) + ) def get_valid_frames(self): # build list of frames that we can actually grab source for @@ -920,16 +922,17 @@ def preprocess_step(self, processing_task): ) return False + release = None + dist = None + with sentry_sdk.start_span(op="JavaScriptStacktraceProcessor.preprocess_step.get_release"): - self.release = self.get_release(create=True) - if self.data.get("dist") and self.release: + release = self.get_release(create=True) + if self.data.get("dist") and release: timestamp = self.data.get("timestamp") date = timestamp and datetime.fromtimestamp(timestamp).replace(tzinfo=timezone.utc) - self.dist = self.release.add_dist(self.data["dist"], date) + dist = release.add_dist(self.data["dist"], date) - # We initialize the fetcher during preprocessing, so that we can start fetching files during source cache - # population. - self._initialize_fetcher() + self.fetcher.bind_release(release=release, dist=dist) with sentry_sdk.start_span( op="JavaScriptStacktraceProcessor.preprocess_step.populate_source_cache" @@ -938,10 +941,6 @@ def preprocess_step(self, processing_task): return True - def _initialize_fetcher(self): - # We initialize the fetcher once all data has been loaded. 
- self.fetcher = Fetcher(self.project, self.release, self.dist, self.allow_scraping) - def handles_frame(self, frame, stacktrace_info): platform = frame.get("platform") or self.data.get("platform") return platform in ("javascript", "node") diff --git a/tests/sentry/lang/javascript/test_processor.py b/tests/sentry/lang/javascript/test_processor.py index e5dcec82bc56bd..fcda647b89e30a 100644 --- a/tests/sentry/lang/javascript/test_processor.py +++ b/tests/sentry/lang/javascript/test_processor.py @@ -56,18 +56,18 @@ def test_infers_allow_scraping(self): project = self.create_project() r = JavaScriptStacktraceProcessor({}, None, project) # defaults - assert r.allow_scraping + assert r.fetcher.allow_scraping # disabled for project project.update_option("sentry:scrape_javascript", False) r = JavaScriptStacktraceProcessor({}, None, project) - assert not r.allow_scraping + assert not r.fetcher.allow_scraping # disabled for org project.delete_option("sentry:scrape_javascript") project.organization.update_option("sentry:scrape_javascript", False) r = JavaScriptStacktraceProcessor({}, None, project) - assert not r.allow_scraping + assert not r.fetcher.allow_scraping @patch( "sentry.lang.javascript.processor.JavaScriptStacktraceProcessor.get_valid_frames", @@ -87,15 +87,15 @@ def test_missing_dist(self, _1, _2): project=project, ) - assert processor.release is None - assert processor.dist is None + assert processor.fetcher.release is None + assert processor.fetcher.dist is None processor.preprocess_step(None) - assert processor.release == release - assert processor.dist is not None - assert processor.dist.name == "foo" - assert processor.dist.date_added.timestamp() == processor.data["timestamp"] + assert processor.fetcher.release == release + assert processor.fetcher.dist is not None + assert processor.fetcher.dist.name == "foo" + assert processor.fetcher.dist.date_added.timestamp() == processor.data["timestamp"] @with_feature("organizations:javascript-console-error-tag") def test_tag_suspected_console_error(self): @@ -1160,7 +1160,6 @@ def test_simple_base64(self): processor = JavaScriptStacktraceProcessor( data={}, stacktrace_infos=None, project=self.create_project() ) - processor._initialize_fetcher() smap_view = processor._fetch_sourcemap_view_by_url(base64_sourcemap) token = smap_view.lookup(1, 1, 0) @@ -1173,7 +1172,6 @@ def test_base64_without_padding(self): processor = JavaScriptStacktraceProcessor( data={}, stacktrace_infos=None, project=self.create_project() ) - processor._initialize_fetcher() smap_view = processor._fetch_sourcemap_view_by_url(base64_sourcemap.rstrip("=")) token = smap_view.lookup(1, 1, 0) @@ -1187,7 +1185,6 @@ def test_broken_base64(self): processor = JavaScriptStacktraceProcessor( data={}, stacktrace_infos=None, project=self.create_project() ) - processor._initialize_fetcher() processor._fetch_sourcemap_view_by_url("data:application/json;base64,xxx") @responses.activate @@ -1200,7 +1197,6 @@ def test_garbage_json(self): processor = JavaScriptStacktraceProcessor( data={}, stacktrace_infos=None, project=self.create_project() ) - processor._initialize_fetcher() processor._fetch_sourcemap_view_by_url("http://example.com") @@ -1458,10 +1454,9 @@ def test_file_no_source_records_error(self): project = self.create_project() processor = JavaScriptStacktraceProcessor(data={}, stacktrace_infos=None, project=project) - processor._initialize_fetcher() # no release on the event, so won't find file in database - assert processor.release is None + assert processor.fetcher.release is None 
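The tests can now assert on `processor.fetcher.release` because release state lives on the fetcher: the processor constructs it eagerly with only the scraping flag and binds release/dist later from `preprocess_step`. A minimal sketch of that two-phase binding (release value is hypothetical):

    class Fetcher:
        def __init__(self, allow_scraping=True):
            self.allow_scraping = allow_scraping
            self.release = None
            self.dist = None

        def bind_release(self, release=None, dist=None):
            # called once preprocess_step has resolved the release/dist
            self.release = release
            self.dist = dist

    fetcher = Fetcher()
    assert fetcher.release is None
    fetcher.bind_release(release="1.0")
    assert fetcher.release == "1.0"
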
 # not a real url, so won't find file on the internet
         abs_path = "app:///i/dont/exist.js"
 
@@ -1487,10 +1482,9 @@ def test_node_modules_file_no_source_no_error(self):
 
         project = self.create_project()
         processor = JavaScriptStacktraceProcessor(data={}, stacktrace_infos=None, project=project)
         # We need to initialize the fetcher so that it can capture the necessary context to execute file fetching.
-        processor._initialize_fetcher()
 
         # no release on the event, so won't find file in database
-        assert processor.release is None
+        assert processor.fetcher.release is None
 
         # not a real url, so won't find file on the internet
         abs_path = "app:///../node_modules/i/dont/exist.js"
 
@@ -1518,9 +1512,8 @@ def test_node_modules_file_with_source_is_used(self):
         # in real life the preprocess step will pull release out of the data
         # dictionary passed to the JavaScriptStacktraceProcessor constructor,
         # but since this is just a unit test, we have to set it manually
-        processor.release = release
+        processor.fetcher.bind_release(release=release)
         # We need to initialize the fetcher so that it can capture the necessary context to execute file fetching.
-        processor._initialize_fetcher()
 
@@ -1550,9 +1543,8 @@ def test_node_modules_file_with_source_but_no_map_records_error(self, mock_disco
         # in real life the preprocess step will pull release out of the data
         # dictionary passed to the JavaScriptStacktraceProcessor constructor,
         # but since this is just a unit test, we have to set it manually
-        processor.release = release
+        processor.fetcher.bind_release(release=release)
         # We need to initialize the fetcher so that it can capture the necessary context to execute file fetching.
-        processor._initialize_fetcher()
 
From f8f9305d036bfbf0ba06b7f61b536a46e3a051e0 Mon Sep 17 00:00:00 2001
From: Armin Ronacher
Date: Fri, 24 Feb 2023 11:41:52 +0100
Subject: [PATCH 17/28] test: Added dummy test to ensure debug_ids are
 properly ignored for now

---
 .../lang/javascript/test_plugin.py            | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/tests/relay_integration/lang/javascript/test_plugin.py b/tests/relay_integration/lang/javascript/test_plugin.py
index 6dc0e8d3c1f17e..ad2d67c8460c2e 100644
--- a/tests/relay_integration/lang/javascript/test_plugin.py
+++ b/tests/relay_integration/lang/javascript/test_plugin.py
@@ -343,6 +343,107 @@ def test_sourcemap_source_expansion(self):
         # its raw and original form should be identical
         assert raw_frame_list[1] == frame_list[1]
 
+    @responses.activate
+    def test_sourcemap_ignore_debug_id(self):
+        # This tests the same scenario as the one above, but it ensures that the
+        # debug_id code, which is not yet implemented, does not cause any damage for now.
+ responses.add( + responses.GET, + "http://example.com/file.min.js", + body=load_fixture("file.min.js"), + content_type="application/javascript; charset=utf-8", + ) + responses.add( + responses.GET, + "http://example.com/file1.js", + body=load_fixture("file1.js"), + content_type="application/javascript; charset=utf-8", + ) + responses.add( + responses.GET, + "http://example.com/file2.js", + body=load_fixture("file2.js"), + content_type="application/javascript; charset=utf-8", + ) + responses.add( + responses.GET, + "http://example.com/file.sourcemap.js", + body=load_fixture("file.sourcemap.js"), + content_type="application/javascript; charset=utf-8", + ) + responses.add(responses.GET, "http://example.com/index.html", body="Not Found", status=404) + + data = { + "timestamp": self.min_ago, + "message": "hello", + "platform": "javascript", + "debug_meta": { + "images": [ + { + "type": "sourcemap", + "debug_id": "c941d872-af1f-4f0c-a7ff-ad3d295fe153", + "code_file": "http://example.com/file.min.js", + } + ] + }, + "exception": { + "values": [ + { + "type": "Error", + "stacktrace": { + "frames": [ + { + "abs_path": "http://example.com/file.min.js", + "filename": "file.min.js", + "lineno": 1, + "colno": 39, + }, + # NOTE: Intentionally source is not retrieved from this HTML file + { + "function": 'function: "HTMLDocument."', + "abs_path": "http//example.com/index.html", + "filename": "index.html", + "lineno": 283, + "colno": 17, + "in_app": False, + }, + ] + }, + } + ] + }, + } + + event = self.post_and_retrieve_event(data) + + assert event.data["errors"] == [ + {"type": "js_no_source", "url": "http//example.com/index.html"} + ] + + exception = event.interfaces["exception"] + frame_list = exception.values[0].stacktrace.frames + + frame = frame_list[0] + assert frame.pre_context == ["function add(a, b) {", '\t"use strict";'] + expected = "\treturn a + b; // fôo" + assert frame.context_line == expected + assert frame.post_context == ["}", ""] + + raw_frame_list = exception.values[0].raw_stacktrace.frames + raw_frame = raw_frame_list[0] + assert not raw_frame.pre_context + assert ( + raw_frame.context_line + == 'function add(a,b){"use strict";return a+b}function multiply(a,b){"use strict";return a*b}function ' + 'divide(a,b){"use strict";try{return multip {snip}' + ) + assert raw_frame.post_context == ["//@ sourceMappingURL=file.sourcemap.js", ""] + assert raw_frame.lineno == 1 + + # Since we couldn't expand source for the 2nd frame, both + # its raw and original form should be identical + assert raw_frame_list[1] == frame_list[1] + @responses.activate def test_sourcemap_embedded_source_expansion(self): responses.add( From 45cb57fa158071ffc855fdd01773d4fd932e211b Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Fri, 24 Feb 2023 11:48:31 +0100 Subject: [PATCH 18/28] Add invalid base64 sourcemap test --- src/sentry/lang/javascript/processor.py | 3 +- .../lang/javascript/test_plugin.py | 48 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index 09f9fb7f09adf7..ac44aa05df4347 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -1,4 +1,5 @@ import base64 +import binascii import errno import logging import re @@ -1396,7 +1397,7 @@ def _fetch_sourcemap_view_by_url(self, url, source=b""): force_bytes(url[BASE64_PREAMBLE_LENGTH:]) + (b"=" * (-(len(url) - BASE64_PREAMBLE_LENGTH) % 4)) ) - except TypeError as e: + except binascii.Error as e: 
raise UnparseableSourcemap({"url": "", "reason": str(e)}) else: # look in the database and, if not found, optionally try to scrape the web diff --git a/tests/relay_integration/lang/javascript/test_plugin.py b/tests/relay_integration/lang/javascript/test_plugin.py index ad2d67c8460c2e..e2769402fbbb2a 100644 --- a/tests/relay_integration/lang/javascript/test_plugin.py +++ b/tests/relay_integration/lang/javascript/test_plugin.py @@ -22,6 +22,8 @@ .replace("\n", "") ) +INVALID_BASE64_SOURCEMAP = "data:application/json;base64,A" + def get_fixture_path(name): return os.path.join(os.path.dirname(__file__), "fixtures", name) @@ -216,6 +218,52 @@ def test_inlined_sources(self, mock_discover_sourcemap, mock_fetch_by_url): assert not frame.post_context assert frame.data["sourcemap"] == "http://example.com/test.min.js" + @patch("sentry.lang.javascript.processor.Fetcher.fetch_by_url") + @patch("sentry.lang.javascript.processor.discover_sourcemap") + def test_invalid_base64_sourcemap_returns_an_error( + self, mock_discover_sourcemap, mock_fetch_by_url + ): + data = { + "timestamp": self.min_ago, + "message": "hello", + "platform": "javascript", + "exception": { + "values": [ + { + "type": "Error", + "stacktrace": { + "frames": [ + { + "abs_path": "http://example.com/test.min.js", + "filename": "test.js", + "lineno": 1, + "colno": 1, + } + ] + }, + } + ] + }, + } + + mock_discover_sourcemap.return_value = INVALID_BASE64_SOURCEMAP + + mock_fetch_by_url.return_value.url = "http://example.com/test.min.js" + mock_fetch_by_url.return_value.body = force_bytes("\n".join("")) + mock_fetch_by_url.return_value.encoding = None + + event = self.post_and_retrieve_event(data) + + mock_fetch_by_url.assert_called_once_with("http://example.com/test.min.js") + + assert len(event.data["errors"]) == 1 + assert event.data["errors"][0] == { + "url": "", + "reason": "Invalid base64-encoded string: " + "number of data characters (1) cannot be 1 more than a multiple of 4", + "type": "js_invalid_source", + } + @responses.activate def test_error_message_translations(self): data = { From daa99d7d82cc8fe415a8740863d1fcfd66380fa9 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Fri, 24 Feb 2023 11:52:09 +0100 Subject: [PATCH 19/28] Remove models --- src/sentry/models/artifactbundle.py | 90 ----------------------------- 1 file changed, 90 deletions(-) delete mode 100644 src/sentry/models/artifactbundle.py diff --git a/src/sentry/models/artifactbundle.py b/src/sentry/models/artifactbundle.py deleted file mode 100644 index 1c5f0822d9d273..00000000000000 --- a/src/sentry/models/artifactbundle.py +++ /dev/null @@ -1,90 +0,0 @@ -from enum import Enum - -from django.db import models -from django.utils import timezone - -from sentry.db.models import ( - BoundedBigIntegerField, - BoundedPositiveIntegerField, - FlexibleForeignKey, - Model, - region_silo_only_model, -) -from sentry.models import DB_VERSION_LENGTH - - -class SourceFileType(Enum): - SOURCE = 1 - MINIFIED_SOURCE = 2 - SOURCE_MAP = 3 - INDEXED_RAM_BUNDLE = 4 - - @classmethod - def choices(cls): - return [(key.value, key.name) for key in cls] - - -@region_silo_only_model -class ArtifactBundle(Model): - __include_in_export__ = False - - organization_id = BoundedBigIntegerField(db_index=True) - bundle_id = models.UUIDField(null=True) - file = FlexibleForeignKey("sentry.File") - artifact_count = BoundedPositiveIntegerField() - - class Meta: - app_label = "sentry" - db_table = "sentry_artifactbundle" - - unique_together = (("organization_id", "bundle_id"),) - - 
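The models deleted in this patch outline the schema that a future `fetch_by_debug_id` would query: the tables relate a debug_id and a source file type to the artifact bundle holding the file. A purely hypothetical sketch of such a lookup against the models defined just below, not part of this series:

    # Hypothetical: find the bundle carrying the sourcemap for a debug_id.
    # Field names follow the models in this (removed) module; the import path
    # is an assumption.
    from sentry.models.artifactbundle import DebugIdArtifactBundle, SourceFileType

    def lookup_sourcemap_bundle(org_id, debug_id):
        return (
            DebugIdArtifactBundle.objects.filter(
                organization_id=org_id,
                debug_id=debug_id,
                source_file_type=SourceFileType.SOURCE_MAP.value,
            )
            .select_related("artifact_bundle")
            .first()
        )
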
-@region_silo_only_model -class ReleaseArtifactBundle(Model): - __include_in_export__ = False - - organization_id = BoundedBigIntegerField(db_index=True) - release_name = models.CharField(max_length=DB_VERSION_LENGTH) - dist_id = BoundedBigIntegerField(null=True) - artifact_bundle = FlexibleForeignKey("sentry.ArtifactBundle") - - class Meta: - app_label = "sentry" - db_table = "sentry_releaseartifactbundle" - - unique_together = (("organization_id", "release_name", "dist_id", "artifact_bundle"),) - - -@region_silo_only_model -class DebugIdArtifactBundle(Model): - __include_in_export__ = False - - organization_id = BoundedBigIntegerField(db_index=True) - debug_id = models.UUIDField() - artifact_bundle = FlexibleForeignKey("sentry.ArtifactBundle") - source_file_type = models.IntegerField(choices=SourceFileType.choices()) - date_added = models.DateTimeField(default=timezone.now) - date_last_accessed = models.DateTimeField(default=timezone.now) - - class Meta: - app_label = "sentry" - db_table = "sentry_debugidartifactbundle" - - # We can have the same debug_id pointing to different artifact_bundle(s) because the user might upload - # the same artifacts twice, or they might have certain build files that don't change across builds. - unique_together = (("debug_id", "artifact_bundle", "source_file_type"),) - - -@region_silo_only_model -class ProjectArtifactBundle(Model): - __include_in_export__ = False - - project_id = BoundedBigIntegerField(db_index=True) - artifact_bundle = FlexibleForeignKey("sentry.ArtifactBundle") - - class Meta: - app_label = "sentry" - db_table = "sentry_projectartifactbundle" - - unique_together = (("project_id", "artifact_bundle"),) From 0c21f128c87b17dc2de7c0ed5a4bea5a260bc6b4 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Fri, 24 Feb 2023 12:03:53 +0100 Subject: [PATCH 20/28] Update comments --- src/sentry/lang/javascript/processor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index ac44aa05df4347..dd67c693042182 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -1292,7 +1292,8 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): # a valid file to cache return None else: - # TODO: it should do something but we don't know. + # TODO: aliasing is not used in the codebase as of now, we might want to remove it because we always call + # this function with url == frame.abs_path. if result.url != url: self.url_aliases[result.url] = url @@ -1445,8 +1446,8 @@ def populate_source_cache(self, frames): op="JavaScriptStacktraceProcessor.populate_source_cache.cache_source" ) as span: span.set_data("filename", filename) - # TODO: we want to implement the construction of the debug - # We first want to fetch the sourceview of the file. + # TODO: we want to implement the construction of the abs_path -> debug_id table. + # We first want to fetch the sourceview of the file but not the sourcemaps. 
self.get_or_fetch_sourceview(url=filename) def close(self): From 3e955cc8cfb794e2df01071f73fb47b793d5d30d Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Fri, 24 Feb 2023 12:05:25 +0100 Subject: [PATCH 21/28] Use return None --- src/sentry/lang/javascript/processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index dd67c693042182..9848328f018317 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -714,7 +714,7 @@ def bind_release(self, release=None, dist=None): def fetch_by_debug_id(self, debug_id, source_file_type): # TODO: implement debug id lookup. - pass + return None def fetch_by_url(self, url): """ @@ -845,7 +845,7 @@ def fetch_by_url(self, url): def _fetch_sourcemap_by_debug_id(self): # TODO: implement debug id lookup. - pass + return None class JavaScriptStacktraceProcessor(StacktraceProcessor): From 6d2b0a1eac820998222acb36aa16e0174fc766e3 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Mon, 27 Feb 2023 14:03:44 +0100 Subject: [PATCH 22/28] Fix error lookup --- src/sentry/lang/javascript/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index 9848328f018317..aedba381d0479e 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -1048,7 +1048,7 @@ def process_frame(self, processable_frame, processing_task): sourceview = self.get_or_fetch_sourceview(url=abs_path) if sourceview is None: - errors = self.fetch_by_url_errors[abs_path] + errors = self.fetch_by_url_errors.get(abs_path) if errors: all_errors.extend(errors) else: From 4032847a3ce1ed6c794aad637b3d83d17dccf244 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Mon, 27 Feb 2023 15:50:27 +0100 Subject: [PATCH 23/28] Fix performance issues --- src/sentry/lang/javascript/processor.py | 87 +++++++++---------- .../lang/javascript/test_plugin.py | 56 +++++++++++- .../sentry/lang/javascript/test_processor.py | 10 +-- 3 files changed, 101 insertions(+), 52 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index aedba381d0479e..26b9d78c07ac97 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -884,19 +884,17 @@ def __init__(self, *args, **kwargs): # applies to the many repetitions across the code. # Cache holding the results of the fetching by url. - self.fetch_by_url_result = {} + self.fetch_by_url_sourceviews = {} self.fetch_by_url_errors = {} + self.result_map = {} # Cache holding the results of the fetching by debug id. - self.fetch_by_debug_id_result = {} + self.fetch_by_debug_id_sourceviews = {} # Cache holding the sourcemaps views indexed by either debug id or sourcemap url. self.debug_id_sourcemap_cache = {} self.sourcemap_url_sourcemap_cache = {} - # Contains a mapping between the 'result.url' of the request and the file url from 'abs_path'. - self.url_aliases = {} - # Contains a mapping between the minified source url ('abs_path' of the frame) and the sourcemap url # which is discovered by looking at the minified source. self.minified_source_url_to_sourcemap_url = {} @@ -977,7 +975,7 @@ def process_frame(self, processable_frame, processing_task): # (or 'None' if the token doesnt provide us with the 'context_line') down the road. # TODO: add support for debug ids. 
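The renames in this patch are the core of the performance fix: the URL and debug-id caches now hold parsed `SourceView` objects (with the raw body kept separately in `result_map` for sourcemap construction), so the fetched bytes are parsed once per file instead of on every cache hit. A toy illustration with the parse step stubbed out:

    # Keep the parsed object, not the raw response: parse once per URL.
    parsed_cache = {}

    def get_parsed(url, raw_bytes):
        parsed = parsed_cache.get(url)
        if parsed is None:
            # stand-in for the expensive SourceView.from_bytes call
            parsed = raw_bytes.decode("utf-8").splitlines()
            parsed_cache[url] = parsed
        return parsed

    first = get_parsed("http://example.com/app.min.js", b"a;\nb;")
    # the second call reuses the parsed object instead of re-parsing
    assert get_parsed("http://example.com/app.min.js", b"a;\nb;") is first
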
sourceview = self.get_or_fetch_sourceview(url=frame["abs_path"]) - sourcemap_view = self.get_or_fetch_sourcemap_view(url=frame["abs_path"]) + sourcemap_cache = self.get_or_fetch_sourcemap_cache(url=frame["abs_path"]) # We check for errors once both the minified file and the corresponding sourcemaps are loaded. In case errors # happen in one of the two we will detect them and add them to 'all_errors'. @@ -995,11 +993,11 @@ def process_frame(self, processable_frame, processing_task): new_frame = dict(frame) raw_frame = dict(frame) - if sourcemap_view and frame.get("colno") is None: + if sourcemap_cache and frame.get("colno") is None: all_errors.append( {"type": EventError.JS_NO_COLUMN, "url": http.expose_url(frame["abs_path"])} ) - elif sourcemap_view: + elif sourcemap_cache: if is_data_uri(sourcemap_url): sourcemap_label = frame["abs_path"] else: @@ -1010,7 +1008,7 @@ def process_frame(self, processable_frame, processing_task): try: # Errors are 1-indexed in the frames. assert frame["lineno"] > 0, "line numbers are 1-indexed" - token = sourcemap_view.lookup(frame["lineno"], frame["colno"], LINES_OF_CONTEXT) + token = sourcemap_cache.lookup(frame["lineno"], frame["colno"], LINES_OF_CONTEXT) except Exception: token = None all_errors.append( @@ -1240,16 +1238,13 @@ def get_or_fetch_sourceview(self, url=None, debug_id=None, source_file_type=None def _get_cached_sourceview(self, url=None, debug_id=None, source_file_type=None): if debug_id is not None: - source_result = self.fetch_by_debug_id_result.get((debug_id, source_file_type)) - if source_result is not None: - sourceview = SourceView.from_bytes(source_result.body) + sourceview = self.fetch_by_debug_id_sourceviews.get((debug_id, source_file_type)) + if sourceview is not None: return sourceview if url is not None: - real_url = self.url_aliases.get(url, url) - source_result = self.fetch_by_url_result.get(real_url) - if source_result is not None: - sourceview = SourceView.from_bytes(source_result.body) + sourceview = self.fetch_by_url_sourceviews.get(url) + if sourceview is not None: return sourceview return None @@ -1272,10 +1267,8 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): span.set_data("debug_id", debug_id) result = self.fetcher.fetch_by_debug_id(debug_id, source_file_type) if result is not None: - # We temporarily store the result directly, in case we want to compute the sourceview on demand - # we can delete the other dictionary. - self.fetch_by_debug_id_result[debug_id, source_file_type] = result sourceview = SourceView.from_bytes(result.body) + self.fetch_by_debug_id_sourceviews[debug_id, source_file_type] = sourceview return sourceview try: @@ -1292,15 +1285,9 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): # a valid file to cache return None else: - # TODO: aliasing is not used in the codebase as of now, we might want to remove it because we always call - # this function with url == frame.abs_path. - if result.url != url: - self.url_aliases[result.url] = url - - # We temporarily store the result directly, in case we want to compute the sourceview on demand - # we can delete the other dictionary. 
- self.fetch_by_url_result[url] = result sourceview = SourceView.from_bytes(result.body) + self.fetch_by_url_sourceviews[url] = sourceview + self.result_map[url] = result.body sourcemap_url = discover_sourcemap(result) if sourcemap_url: @@ -1308,34 +1295,34 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): return sourceview - def get_or_fetch_sourcemap_view(self, url=None, debug_id=None): + def get_or_fetch_sourcemap_cache(self, url=None, debug_id=None): """ - Gets from the cache or fetches the sourcemap view of the file at 'url' or 'debug_id'. + Gets from the cache or fetches the SmCache of the file at 'url' or 'debug_id'. Returns None when a sourcemap corresponding to the file at 'url' is not found. """ - rv = self._get_cached_sourcemap_view(url, debug_id) + rv = self._get_cached_sourcemap_cache(url, debug_id) if rv is not None: return rv - return self._fetch_and_cache_sourcemap_view(url, debug_id) + return self._fetch_and_cache_sourcemap_cache(url, debug_id) - def _get_cached_sourcemap_view(self, url, debug_id): + def _get_cached_sourcemap_cache(self, url, debug_id): if debug_id is not None: rv = self.debug_id_sourcemap_cache.get(debug_id) if rv is not None: return rv if url is not None: - real_url = self.url_aliases.get(url, url) - sourcemap_url = self.minified_source_url_to_sourcemap_url.get(real_url) + # We get the url of the sourcemap from the url of the minified file. + sourcemap_url = self.minified_source_url_to_sourcemap_url.get(url) rv = self.sourcemap_url_sourcemap_cache.get(sourcemap_url) if rv is not None: return rv return None - def _fetch_and_cache_sourcemap_view(self, url, debug_id): + def _fetch_and_cache_sourcemap_cache(self, url, debug_id): if debug_id is not None: sourcemap_view = self._handle_debug_id_sourcemap_lookup(debug_id) if sourcemap_view is not None: @@ -1349,20 +1336,27 @@ def _fetch_and_cache_sourcemap_view(self, url, debug_id): return None def _handle_debug_id_sourcemap_lookup(self, debug_id): - minified_result = self.fetch_by_debug_id_result.get(debug_id) - if minified_result is None: + minified_sourceview = self.fetch_by_debug_id_sourceviews.get(debug_id) + if minified_sourceview is None: return None # TODO: use enum for source file type when available. result = self.fetcher.fetch_by_debug_id(debug_id, "sourcemap") if result is not None: - sourcemap_view = SmCache.from_bytes(minified_result.body, result.body) + sourcemap_view = SmCache.from_bytes( + minified_sourceview.get_source().encode(), result.body + ) self.debug_id_sourcemap_cache[debug_id] = sourcemap_view return sourcemap_view def _handle_url_sourcemap_lookup(self, url): - minified_result = self.fetch_by_url_result.get(url) - if minified_result is None: + # In case there are any fetching errors tied to the url of the minified file, we don't want to attempt the + # construction of the sourcemap cache. 
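+        # (These are the errors recorded when fetching the minified file itself
+        # failed; 'process_frame' reads them when the sourceview is missing.)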
+ if len(self.fetch_by_url_errors.get(url, [])) > 0: + return None + + minified_sourceview = self.fetch_by_url_sourceviews.get(url) + if minified_sourceview is None: return None sourcemap_url = self.minified_source_url_to_sourcemap_url.get(url) @@ -1375,8 +1369,8 @@ def _handle_url_sourcemap_lookup(self, url): ) as span: span.set_data("url", url) span.set_data("sourcemap_url", sourcemap_url) - sourcemap_view = self._fetch_sourcemap_view_by_url( - sourcemap_url, source=minified_result.body + sourcemap_cache = self._fetch_sourcemap_cache_by_url( + sourcemap_url, source=minified_sourceview.get_source().encode() ) except http.BadSource as exc: # we don't perform the same check here as above, because if someone has @@ -1388,10 +1382,10 @@ def _handle_url_sourcemap_lookup(self, url): self.fetch_by_url_errors.setdefault(url, []).append(exc.data) return None else: - self.sourcemap_url_sourcemap_cache[sourcemap_url] = sourcemap_view - return sourcemap_view + self.sourcemap_url_sourcemap_cache[sourcemap_url] = sourcemap_cache + return sourcemap_cache - def _fetch_sourcemap_view_by_url(self, url, source=b""): + def _fetch_sourcemap_cache_by_url(self, url, source=b""): if is_data_uri(url): try: body = base64.b64decode( @@ -1409,12 +1403,13 @@ def _fetch_sourcemap_view_by_url(self, url, source=b""): result = self.fetcher.fetch_by_url(url) body = result.body + try: with sentry_sdk.start_span( op="JavaScriptStacktraceProcessor.fetch_sourcemap_view_by_url.SmCache.from_bytes" ): + # This is an expensive operation that should be executed as few times as possible. return SmCache.from_bytes(source, body) - except Exception as exc: # This is in debug because the product shows an error already. logger.debug(str(exc), exc_info=True) diff --git a/tests/relay_integration/lang/javascript/test_plugin.py b/tests/relay_integration/lang/javascript/test_plugin.py index e2769402fbbb2a..3e8ad4cb6fbc65 100644 --- a/tests/relay_integration/lang/javascript/test_plugin.py +++ b/tests/relay_integration/lang/javascript/test_plugin.py @@ -238,7 +238,7 @@ def test_invalid_base64_sourcemap_returns_an_error( "filename": "test.js", "lineno": 1, "colno": 1, - } + }, ] }, } @@ -264,6 +264,60 @@ def test_invalid_base64_sourcemap_returns_an_error( "type": "js_invalid_source", } + @patch("sentry.lang.javascript.processor.SmCache.from_bytes") + @patch("sentry.lang.javascript.processor.Fetcher.fetch_by_url") + @patch("sentry.lang.javascript.processor.discover_sourcemap") + def test_sourcemap_cache_is_constructed_only_once_if_an_error_is_raised( + self, mock_discover_sourcemap, mock_fetch_by_url, mock_from_bytes + ): + data = { + "timestamp": self.min_ago, + "message": "hello", + "platform": "javascript", + "exception": { + "values": [ + { + "type": "Error", + "stacktrace": { + "frames": [ + { + "abs_path": "http://example.com/test.min.js", + "filename": "test.js", + "lineno": 1, + "colno": 1, + }, + { + "abs_path": "http://example.com/test.min.js", + "filename": "test.js", + "lineno": 1, + "colno": 1, + }, + { + "abs_path": "http://example.com/test.min.js", + "filename": "test.js", + "lineno": 1, + "colno": 1, + }, + ] + }, + } + ] + }, + } + + mock_discover_sourcemap.return_value = BASE64_SOURCEMAP + + mock_fetch_by_url.return_value.url = "http://example.com/test.min.js" + mock_fetch_by_url.return_value.body = force_bytes("\n".join("")) + mock_fetch_by_url.return_value.encoding = None + + mock_from_bytes.side_effect = Exception() + + self.post_and_retrieve_event(data) + + 
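+        # All three frames reference the same minified file, so the url fetch and
+        # the (expensive) SmCache construction must each run exactly once.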
mock_fetch_by_url.assert_called_once_with("http://example.com/test.min.js") + mock_from_bytes.assert_called_once() + @responses.activate def test_error_message_translations(self): data = { diff --git a/tests/sentry/lang/javascript/test_processor.py b/tests/sentry/lang/javascript/test_processor.py index fcda647b89e30a..bb24c87d528381 100644 --- a/tests/sentry/lang/javascript/test_processor.py +++ b/tests/sentry/lang/javascript/test_processor.py @@ -1160,7 +1160,7 @@ def test_simple_base64(self): processor = JavaScriptStacktraceProcessor( data={}, stacktrace_infos=None, project=self.create_project() ) - smap_view = processor._fetch_sourcemap_view_by_url(base64_sourcemap) + smap_view = processor._fetch_sourcemap_cache_by_url(base64_sourcemap) token = smap_view.lookup(1, 1, 0) assert token.src == "/test.js" @@ -1172,7 +1172,7 @@ def test_base64_without_padding(self): processor = JavaScriptStacktraceProcessor( data={}, stacktrace_infos=None, project=self.create_project() ) - smap_view = processor._fetch_sourcemap_view_by_url(base64_sourcemap.rstrip("=")) + smap_view = processor._fetch_sourcemap_cache_by_url(base64_sourcemap.rstrip("=")) token = smap_view.lookup(1, 1, 0) assert token.src == "/test.js" @@ -1185,7 +1185,7 @@ def test_broken_base64(self): processor = JavaScriptStacktraceProcessor( data={}, stacktrace_infos=None, project=self.create_project() ) - processor._fetch_sourcemap_view_by_url("data:application/json;base64,xxx") + processor._fetch_sourcemap_cache_by_url("data:application/json;base64,xxx") @responses.activate def test_garbage_json(self): @@ -1197,7 +1197,7 @@ def test_garbage_json(self): processor = JavaScriptStacktraceProcessor( data={}, stacktrace_infos=None, project=self.create_project() ) - processor._fetch_sourcemap_view_by_url("http://example.com") + processor._fetch_sourcemap_cache_by_url("http://example.com") class TrimLineTest(unittest.TestCase): @@ -1550,7 +1550,7 @@ def test_node_modules_file_with_source_but_no_map_records_error(self, mock_disco assert len(processor.fetch_by_url_errors.get(abs_path, [])) == 0 processor.get_or_fetch_sourceview(url=abs_path) - processor.get_or_fetch_sourcemap_view(url=abs_path) + processor.get_or_fetch_sourcemap_cache(url=abs_path) # now we have an error assert len(processor.fetch_by_url_errors.get(abs_path, [])) == 1 From f9fa5e05ab87daad47e586cefca06bf57164b705 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Mon, 27 Feb 2023 16:04:14 +0100 Subject: [PATCH 24/28] Improve --- src/sentry/lang/javascript/processor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index 26b9d78c07ac97..cb0ab8117df342 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -886,7 +886,6 @@ def __init__(self, *args, **kwargs): # Cache holding the results of the fetching by url. self.fetch_by_url_sourceviews = {} self.fetch_by_url_errors = {} - self.result_map = {} # Cache holding the results of the fetching by debug id. 
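+        # Keyed by '(debug_id, source_file_type)' tuples.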
self.fetch_by_debug_id_sourceviews = {} @@ -1287,7 +1286,6 @@ def _fetch_and_cache_sourceview(self, url, debug_id, source_file_type): else: sourceview = SourceView.from_bytes(result.body) self.fetch_by_url_sourceviews[url] = sourceview - self.result_map[url] = result.body sourcemap_url = discover_sourcemap(result) if sourcemap_url: From a2300490d377aff9dc55d70933484facd15f2845 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Mon, 27 Feb 2023 16:23:32 +0100 Subject: [PATCH 25/28] noop From 2499cad97021ff82a51fae2b659ff03b101656de Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Mon, 27 Feb 2023 16:25:14 +0100 Subject: [PATCH 26/28] Fix test --- tests/sentry/lang/javascript/test_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/sentry/lang/javascript/test_processor.py b/tests/sentry/lang/javascript/test_processor.py index bb24c87d528381..d0bf1700153ebd 100644 --- a/tests/sentry/lang/javascript/test_processor.py +++ b/tests/sentry/lang/javascript/test_processor.py @@ -1518,7 +1518,7 @@ def test_node_modules_file_with_source_is_used(self): # We found the source view. assert processor.get_or_fetch_sourceview(url=abs_path) # Source view exists in cache. - assert processor.fetch_by_url_result.get(abs_path) + assert processor.fetch_by_url_sourceviews.get(abs_path) assert len(processor.fetch_by_url_errors.get(abs_path, [])) == 0 @patch("sentry.lang.javascript.processor.discover_sourcemap") From ffbf97b41a3a1d0599daa21a540524007258907b Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Tue, 28 Feb 2023 13:50:28 +0100 Subject: [PATCH 27/28] Fix typing --- src/sentry/lang/javascript/processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index cb0ab8117df342..9d3d76b400a3dd 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -1342,7 +1342,7 @@ def _handle_debug_id_sourcemap_lookup(self, debug_id): result = self.fetcher.fetch_by_debug_id(debug_id, "sourcemap") if result is not None: sourcemap_view = SmCache.from_bytes( - minified_sourceview.get_source().encode(), result.body + minified_sourceview.get_source().encode("utf-8"), result.body ) self.debug_id_sourcemap_cache[debug_id] = sourcemap_view return sourcemap_view @@ -1350,7 +1350,7 @@ def _handle_debug_id_sourcemap_lookup(self, debug_id): def _handle_url_sourcemap_lookup(self, url): # In case there are any fetching errors tied to the url of the minified file, we don't want to attempt the # construction of the sourcemap cache. - if len(self.fetch_by_url_errors.get(url, [])) > 0: + if url in self.fetch_by_url_errors: return None minified_sourceview = self.fetch_by_url_sourceviews.get(url) From bff6baf154e57344c8a0a3fd2fe656a46c7765c4 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Tue, 28 Feb 2023 15:11:20 +0100 Subject: [PATCH 28/28] Fix wrong condition --- src/sentry/lang/javascript/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index 9d3d76b400a3dd..1ee577821f0023 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -1226,7 +1226,7 @@ def get_or_fetch_sourceview(self, url=None, debug_id=None, source_file_type=None """ # We require that artifact bundles with debug ids used embedded source in the source map. # TODO: use enum for source file type when available. 
- if debug_id is not None and source_file_type != "source": + if debug_id is not None and source_file_type == "source": return None rv = self._get_cached_sourceview(url, debug_id, source_file_type)
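
Taken together, patches 23-28 converge on one caching shape: decoded
sourceviews and SmCache objects are memoized per url (or per debug id), a
recorded fetch error short-circuits sourcemap construction, and a plain
"source" lookup by debug id returns None because those sources are embedded
in the sourcemap. A condensed sketch of that flow, with the real
Fetcher/SourceView/SmCache machinery stood in by plain callables and strings
(the 'SourceLookup' name and both constructor arguments are illustrative,
not Sentry APIs):

    class SourceLookup:
        def __init__(self, fetch_source, build_sourcemap_cache):
            # Both callables return a value on success and raise on failure.
            self.fetch_source = fetch_source
            self.build_sourcemap_cache = build_sourcemap_cache
            self.fetch_by_url_sourceviews = {}  # url -> decoded minified source
            self.fetch_by_url_errors = {}       # url -> [error payloads]
            self.sourcemap_caches = {}          # url -> constructed "SmCache"

        def get_or_fetch_sourceview(self, url=None, debug_id=None, source_file_type=None):
            # Patch 28's guard: bundles with debug ids embed their sources in
            # the sourcemap, so a "source" lookup by debug id yields None.
            if debug_id is not None and source_file_type == "source":
                return None
            if url in self.fetch_by_url_sourceviews:
                return self.fetch_by_url_sourceviews[url]
            try:
                source = self.fetch_source(url)
            except Exception as exc:
                self.fetch_by_url_errors.setdefault(url, []).append(str(exc))
                return None
            # Patch 23: cache the decoded view, not the raw fetch result, so
            # decoding happens at most once per url.
            self.fetch_by_url_sourceviews[url] = source
            return source

        def get_or_fetch_sourcemap_cache(self, url):
            # Patches 23/27: a recorded fetch error for the minified file means
            # the expensive SmCache construction is never attempted for it.
            if url in self.fetch_by_url_errors:
                return None
            if url in self.sourcemap_caches:
                return self.sourcemap_caches[url]
            if url not in self.fetch_by_url_sourceviews:
                return None
            try:
                cache = self.build_sourcemap_cache(url)
            except Exception as exc:
                self.fetch_by_url_errors.setdefault(url, []).append(str(exc))
                return None
            self.sourcemap_caches[url] = cache
            return cache

Wired to in-memory fixtures, the single-construction behavior that the new
regression test asserts with mocks looks like this:

    lookup = SourceLookup(
        fetch_source={"app.min.js": "console.log(1)"}.__getitem__,
        build_sourcemap_cache=lambda url: "smcache-for-" + url,
    )
    for _ in range(3):  # three frames, same file
        lookup.get_or_fetch_sourceview(url="app.min.js")
        lookup.get_or_fetch_sourcemap_cache("app.min.js")
    assert lookup.fetch_by_url_sourceviews == {"app.min.js": "console.log(1)"}
    assert lookup.sourcemap_caches == {"app.min.js": "smcache-for-app.min.js"}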