ref(profiling): Eagerly hash stack for profiles (#1755)

Zylphrex · web-flow · commit 01dc7ee45c93 · 2022-11-30T08:51:24.000-08:00
Hashing the stack is an expensive operation and the same stack is used for
parallel transactions happening on various threads. Instead of hashing it each
time it's used.
diff --git a/sentry_sdk/profiler.py b/sentry_sdk/profiler.py
@@ -53,7 +53,9 @@
     from typing_extensions import TypedDict
     import sentry_sdk.tracing
 
-    RawSampleData = Tuple[int, Sequence[Tuple[str, Sequence[RawFrameData]]]]
+    RawStack = Tuple[RawFrameData, ...]
+    RawSample = Sequence[Tuple[str, RawStack]]
+    RawSampleWithId = Sequence[Tuple[str, int, RawStack]]
 
     ProcessedStack = Tuple[int, ...]
 
@@ -153,7 +155,7 @@ def teardown_profiler():
 
 
 def extract_stack(frame, max_stack_depth=MAX_STACK_DEPTH):
-    # type: (Optional[FrameType], int) -> Sequence[RawFrameData]
+    # type: (Optional[FrameType], int) -> Tuple[RawFrameData, ...]
     """
     Extracts the stack starting the specified frame. The extracted stack
     assumes the specified frame is the top of the stack, and works back
@@ -328,12 +330,14 @@ class SampleBuffer(object):
     def __init__(self, capacity):
         # type: (int) -> None
 
-        self.buffer = [None] * capacity  # type: List[Optional[RawSampleData]]
+        self.buffer = [
+            None
+        ] * capacity  # type: List[Optional[Tuple[int, RawSampleWithId]]]
         self.capacity = capacity  # type: int
         self.idx = 0  # type: int
 
-    def write(self, sample):
-        # type: (RawSampleData) -> None
+    def write(self, ts, raw_sample):
+        # type: (int, RawSample) -> None
         """
         Writing to the buffer is not thread safe. There is the possibility
         that parallel writes will overwrite one another.
@@ -346,7 +350,24 @@ def write(self, sample):
         any synchronization mechanisms here like locks.
         """
         idx = self.idx
-        self.buffer[idx] = sample
+
+        sample = [
+            (
+                thread_id,
+                # Instead of mapping the stack into frame ids and hashing
+                # that as a tuple, we can directly hash the stack.
+                # This saves us from having to generate yet another list.
+                # Additionally, using the stack as the key directly is
+                # costly because the stack can be large, so we pre-hash
+                # the stack, and use the hash as the key as this will be
+                # needed a few times to improve performance.
+                hash(stack),
+                stack,
+            )
+            for thread_id, stack in raw_sample
+        ]
+
+        self.buffer[idx] = (ts, sample)
         self.idx = (idx + 1) % self.capacity
 
     def slice_profile(self, start_ns, stop_ns):
@@ -357,27 +378,13 @@ def slice_profile(self, start_ns, stop_ns):
         frames = dict()  # type: Dict[RawFrameData, int]
         frames_list = list()  # type: List[ProcessedFrame]
 
-        # TODO: This is doing an naive iteration over the
-        # buffer and extracting the appropriate samples.
-        #
-        # Is it safe to assume that the samples are always in
-        # chronological order and binary search the buffer?
         for ts, sample in filter(None, self.buffer):
             if start_ns > ts or ts > stop_ns:
                 continue
 
             elapsed_since_start_ns = str(ts - start_ns)
 
-            for tid, stack in sample:
-                # Instead of mapping the stack into frame ids and hashing
-                # that as a tuple, we can directly hash the stack.
-                # This saves us from having to generate yet another list.
-                # Additionally, using the stack as the key directly is
-                # costly because the stack can be large, so we pre-hash
-                # the stack, and use the hash as the key as this will be
-                # needed a few times to improve performance.
-                hashed_stack = hash(stack)
-
+            for tid, hashed_stack, stack in sample:
                 # Check if the stack is indexed first, this lets us skip
                 # indexing frames if it's not necessary
                 if hashed_stack not in stacks:
@@ -433,13 +440,11 @@ def _sample_stack(*args, **kwargs):
             """
 
             self.write(
-                (
-                    nanosecond_time(),
-                    [
-                        (str(tid), extract_stack(frame))
-                        for tid, frame in sys._current_frames().items()
-                    ],
-                )
+                nanosecond_time(),
+                [
+                    (str(tid), extract_stack(frame))
+                    for tid, frame in sys._current_frames().items()
+                ],
             )
 
         return _sample_stack
diff --git a/tests/test_profiler.py b/tests/test_profiler.py
@@ -249,8 +249,8 @@ def __init__(self, capacity, sample_data=None):
 
     def make_sampler(self):
         def _sample_stack(*args, **kwargs):
-            print("writing", self.sample_data[0])
-            self.write(self.sample_data.pop(0))
+            ts, sample = self.sample_data.pop(0)
+            self.write(ts, sample)
 
         return _sample_stack
 
@@ -760,7 +760,7 @@ def test_thread_scheduler_single_background_thread(scheduler_class):
 )
 def test_sample_buffer(capacity, start_ns, stop_ns, samples, profile):
     buffer = SampleBuffer(capacity)
-    for sample in samples:
-        buffer.write(sample)
+    for ts, sample in samples:
+        buffer.write(ts, sample)
     result = buffer.slice_profile(start_ns, stop_ns)
     assert result == profile