
Commit ca00876

feat(uptime): Implement detector handler (#91107)
Still a work in progress
1 parent 2c8cfac commit ca00876

7 files changed: +722 -109 lines changed


src/sentry/uptime/consumers/results_consumer.py

Lines changed: 35 additions & 1 deletion
@@ -26,6 +26,7 @@
 )
 from sentry.uptime.detectors.ranking import _get_cluster
 from sentry.uptime.detectors.result_handler import handle_onboarding_result
+from sentry.uptime.grouptype import UptimePacketValue
 from sentry.uptime.issue_platform import create_issue_platform_occurrence, resolve_uptime_issue
 from sentry.uptime.models import (
     UptimeStatus,
@@ -43,11 +44,17 @@
     send_uptime_config_deletion,
     update_remote_uptime_subscription,
 )
-from sentry.uptime.types import IncidentStatus, ProjectUptimeSubscriptionMode
+from sentry.uptime.types import (
+    DATA_SOURCE_UPTIME_SUBSCRIPTION,
+    IncidentStatus,
+    ProjectUptimeSubscriptionMode,
+)
 from sentry.utils import metrics
 from sentry.utils.arroyo_producer import SingletonProducer
 from sentry.utils.kafka_config import get_kafka_producer_cluster_options, get_topic_definition
+from sentry.workflow_engine.models.data_source import DataPacket
 from sentry.workflow_engine.models.detector import Detector
+from sentry.workflow_engine.processors.data_packet import process_data_packets

 logger = logging.getLogger(__name__)

@@ -292,6 +299,33 @@ def handle_active_result(
     result: CheckResult,
     metric_tags: dict[str, str],
 ):
+    organization = detector.project.organization
+
+    if features.has("organizations:uptime-detector-handler", organization):
+        # XXX(epurkhiser): Enabling the uptime-detector-handler will process
+        # check results via the uptime detector handler. However, the handler
+        # WILL NOT produce issue occurrences (it does nearly everything else,
+        # including logging that it would produce one).
+        packet = UptimePacketValue(
+            check_result=result,
+            subscription=uptime_subscription,
+            metric_tags=metric_tags,
+        )
+        process_data_packets(
+            [DataPacket(source_id=str(uptime_subscription.id), packet=packet)],
+            DATA_SOURCE_UPTIME_SUBSCRIPTION,
+        )
+
+    # Bail if we're doing issue creation via detectors; we don't want to
+    # create issues using the legacy system in that case. If this flag is
+    # not enabled the detector will still run, but will not produce an
+    # issue occurrence.
+    #
+    # Once we've determined that the detector handler produces issues the
+    # same way the legacy issue creation does, we can remove this.
+    if features.has("organizations:uptime-detector-create-issues", organization):
+        return
+
     uptime_status = uptime_subscription.uptime_status
     result_status = result["status"]

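The hunk above dark-launches the detector pipeline behind two feature flags. A minimal sketch of the resulting rollout modes (the helper and its string labels are illustrative, not part of this commit, and it assumes `uptime-detector-create-issues` is only enabled together with `uptime-detector-handler`):

# Sketch only: maps the two flags checked in handle_active_result to the
# effective processing mode for a check result.
def resolve_result_path(handler_enabled: bool, create_issues_enabled: bool) -> str:
    if handler_enabled and create_issues_enabled:
        # Detector runs and owns issue creation; the legacy path bails early.
        return "detector-only"
    if handler_enabled:
        # Detector runs in shadow mode (no occurrences); legacy still creates issues.
        return "detector-shadow"
    # Neither flag: the legacy consumer handles everything.
    return "legacy-only"
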
src/sentry/uptime/grouptype.py

Lines changed: 276 additions & 1 deletion
@@ -1,15 +1,289 @@
 from __future__ import annotations

+import logging
 from dataclasses import dataclass
+from datetime import datetime
+from typing import override

+from django.utils import timezone as django_timezone
+from sentry_kafka_schemas.schema_types.uptime_results_v1 import CheckResult, CheckStatus
+
+from sentry import features, options
 from sentry.issues.grouptype import GroupCategory, GroupType
+from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
+from sentry.issues.status_change_message import StatusChangeMessage
 from sentry.ratelimits.sliding_windows import Quota
 from sentry.types.group import PriorityLevel
+from sentry.uptime.models import UptimeStatus, UptimeSubscription, get_project_subscription
 from sentry.uptime.types import (
     GROUP_TYPE_UPTIME_DOMAIN_CHECK_FAILURE,
     ProjectUptimeSubscriptionMode,
 )
-from sentry.workflow_engine.types import DetectorSettings
+from sentry.utils import metrics
+from sentry.workflow_engine.handlers.detector.base import DetectorOccurrence, EventData
+from sentry.workflow_engine.handlers.detector.stateful import (
+    DetectorThresholds,
+    StatefulDetectorHandler,
+)
+from sentry.workflow_engine.models import DataPacket, Detector
+from sentry.workflow_engine.processors.data_condition_group import ProcessedDataConditionGroup
+from sentry.workflow_engine.types import (
+    DetectorEvaluationResult,
+    DetectorGroupKey,
+    DetectorPriorityLevel,
+    DetectorSettings,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class UptimePacketValue:
+    """
+    Represents the value passed into the uptime detector
+    """
+
+    check_result: CheckResult
+    subscription: UptimeSubscription
+    metric_tags: dict[str, str]
+
+
+def build_detector_fingerprint_component(detector: Detector) -> str:
+    return f"uptime-detector:{detector.id}"
+
+
+def build_fingerprint(detector: Detector) -> list[str]:
+    return [build_detector_fingerprint_component(detector)]
+
+
+def get_active_failure_threshold() -> int:
+    """
+    When in active monitoring mode, overrides how many failures in a row we
+    need to see to mark the monitor as down
+    """
+    return options.get("uptime.active-failure-threshold")
+
+
+def get_active_recovery_threshold() -> int:
+    """
+    When in active monitoring mode, how many successes in a row we need to
+    see to mark the monitor as up
+    """
+    return options.get("uptime.active-recovery-threshold")
+
+
+def build_evidence_display(result: CheckResult) -> list[IssueEvidence]:
+    evidence_display: list[IssueEvidence] = []
+
+    status_reason = result["status_reason"]
+    if status_reason:
+        reason_evidence = IssueEvidence(
+            name="Failure reason",
+            value=f'{status_reason["type"]} - {status_reason["description"]}',
+            important=True,
+        )
+        evidence_display.append(reason_evidence)
+
+    duration_evidence = IssueEvidence(
+        name="Duration",
+        value=f'{result["duration_ms"]}ms',
+        important=False,
+    )
+    evidence_display.append(duration_evidence)
+
+    request_info = result["request_info"]
+    if request_info:
+        method_evidence = IssueEvidence(
+            name="Method",
+            value=request_info["request_type"],
+            important=False,
+        )
+        status_code_evidence = IssueEvidence(
+            name="Status Code",
+            value=str(request_info["http_status_code"]),
+            important=False,
+        )
+        evidence_display.extend([method_evidence, status_code_evidence])
+
+    return evidence_display
+
+
+def build_event_data(result: CheckResult, detector: Detector) -> EventData:
+    # Default environment when it hasn't been configured
+    env = detector.config.get("environment", "prod")
+
+    # Received time is the actual time the check was performed.
+    received = datetime.fromtimestamp(result["actual_check_time_ms"] / 1000)
+
+    # XXX(epurkhiser): This can be changed over to using the detector ID in
+    # the future once we're no longer using the ProjectUptimeSubscription.id
+    # as a tag.
+    project_subscription = get_project_subscription(detector)
+
+    return {
+        "project_id": detector.project_id,
+        "environment": env,
+        "received": received,
+        "platform": "other",
+        "sdk": None,
+        "tags": {
+            "uptime_rule": str(project_subscription.id),
+        },
+        "contexts": {
+            "trace": {"trace_id": result["trace_id"], "span_id": result.get("span_id")},
+        },
+    }
+
+
+class UptimeDetectorHandler(StatefulDetectorHandler[UptimePacketValue, CheckStatus]):
+    @override
+    @property
+    def thresholds(self) -> DetectorThresholds:
+        return {
+            DetectorPriorityLevel.OK: get_active_recovery_threshold(),
+            DetectorPriorityLevel.HIGH: get_active_failure_threshold(),
+        }
+
+    @override
+    def extract_value(self, data_packet: DataPacket[UptimePacketValue]) -> CheckStatus:
+        return data_packet.packet.check_result["status"]
+
+    @override
+    def build_issue_fingerprint(self, group_key: DetectorGroupKey = None) -> list[str]:
+        # TODO(epurkhiser): We should migrate the fingerprints over to match
+        # what the default fingerprint is.
+        return build_fingerprint(self.detector)
+
+    @override
+    def extract_dedupe_value(self, data_packet: DataPacket[UptimePacketValue]) -> int:
+        return int(data_packet.packet.check_result["scheduled_check_time_ms"])
+
+    @override
+    def evaluate(
+        self, data_packet: DataPacket[UptimePacketValue]
+    ) -> dict[DetectorGroupKey, DetectorEvaluationResult]:
+        result = super().evaluate(data_packet)
+
+        if not result:
+            return result
+
+        # Uptime does not use stateful detector value grouping
+        evaluation = result[None]
+
+        uptime_subscription = data_packet.packet.subscription
+        metric_tags = data_packet.packet.metric_tags
+
+        detector_issue_creation_enabled = features.has(
+            "organizations:uptime-detector-create-issues",
+            self.detector.project.organization,
+        )
+        issue_creation_flag_enabled = features.has(
+            "organizations:uptime-create-issues",
+            self.detector.project.organization,
+        )
+        restricted_host_provider_ids = options.get(
+            "uptime.restrict-issue-creation-by-hosting-provider-id"
+        )
+        host_provider_id = uptime_subscription.host_provider_id
+        host_provider_enabled = host_provider_id not in restricted_host_provider_ids
+
+        issue_creation_allowed = (
+            detector_issue_creation_enabled
+            and issue_creation_flag_enabled
+            and host_provider_enabled
+        )
+
+        # XXX(epurkhiser): We are currently duplicating the detector state
+        # onto the uptime_subscription when the detector changes state. Once
+        # we stop using this field we can drop this update logic.
+        #
+        # We ONLY do this when detector issue creation is enabled, otherwise
+        # we let the legacy uptime consumer handle this.
+        if detector_issue_creation_enabled:
+            if evaluation.priority == DetectorPriorityLevel.OK:
+                uptime_status = UptimeStatus.OK
+            else:
+                uptime_status = UptimeStatus.FAILED
+
+            uptime_subscription.update(
+                uptime_status=uptime_status,
+                uptime_status_update_date=django_timezone.now(),
+            )
+
+        if not host_provider_enabled:
+            metrics.incr(
+                "uptime.result_processor.restricted_by_provider",
+                sample_rate=1.0,
+                tags={
+                    "host_provider_id": host_provider_id,
+                    **metric_tags,
+                },
+            )
+
+        result_creates_issue = isinstance(evaluation.result, IssueOccurrence)
+        result_resolves_issue = isinstance(evaluation.result, StatusChangeMessage)
+
+        if result_creates_issue:
+            metrics.incr(
+                "uptime.detector.will_create_issue",
+                tags=metric_tags,
+                sample_rate=1.0,
+            )
+            # XXX(epurkhiser): This logging includes the same extra arguments
+            # as the `uptime_active_sent_occurrence` log in the consumer for
+            # legacy creation
+            logger.info(
+                "uptime.detector.will_create_issue",
+                extra={
+                    "project_id": self.detector.project_id,
+                    "url": uptime_subscription.url,
+                    **data_packet.packet.check_result,
+                },
+            )
+        if result_resolves_issue:
+            metrics.incr(
+                "uptime.detector.will_resolve_issue",
+                sample_rate=1.0,
+                tags=metric_tags,
+            )
+            logger.info(
+                "uptime.detector.will_resolve_issue",
+                extra={
+                    "project_id": self.detector.project_id,
+                    "url": uptime_subscription.url,
+                    **data_packet.packet.check_result,
+                },
+            )
+
+        # Returning an empty dict effectively causes the detector processor
+        # to bail and not produce an issue occurrence.
+        if result_creates_issue and not issue_creation_allowed:
+            return {}
+
+        return result
+
+    @override
+    def create_occurrence(
+        self,
+        evaluation_result: ProcessedDataConditionGroup,
+        data_packet: DataPacket[UptimePacketValue],
+        priority: DetectorPriorityLevel,
+    ) -> tuple[DetectorOccurrence, EventData]:
+        result = data_packet.packet.check_result
+        uptime_subscription = data_packet.packet.subscription
+
+        occurrence = DetectorOccurrence(
+            issue_title=f"Downtime detected for {uptime_subscription.url}",
+            subtitle="Your monitored domain is down",
+            evidence_display=build_evidence_display(result),
+            type=UptimeDomainCheckFailure,
+            level="error",
+            culprit="",  # TODO: The url?
+            assignee=self.detector.owner,
+            priority=priority,
+        )
+        event_data = build_event_data(result, self.detector)
+
+        return (occurrence, event_data)


 @dataclass(frozen=True)
@@ -24,6 +298,7 @@ class UptimeDomainCheckFailure(GroupType):
     enable_auto_resolve = False
     enable_escalation_detection = False
     detector_settings = DetectorSettings(
+        handler=UptimeDetectorHandler,
         config_schema={
             "$schema": "https://json-schema.org/draft/2020-12/schema",
             "description": "A representation of an uptime alert",

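In the handler above, `thresholds` maps consecutive-result counts to state changes: `uptime.active-failure-threshold` failed checks in a row move the detector to HIGH (down), and `uptime.active-recovery-threshold` successful checks move it back to OK. A toy model of that counting, independent of `StatefulDetectorHandler` (all names here are illustrative):

from dataclasses import dataclass


@dataclass
class ToyThresholdState:
    """Toy stand-in for the consecutive-count behavior of the stateful
    detector; the real logic lives in StatefulDetectorHandler."""

    failure_threshold: int = 3
    recovery_threshold: int = 1
    down: bool = False
    streak: int = 0
    streak_ok: bool = True

    def observe(self, ok: bool) -> bool:
        """Feed one check result; returns True when the monitor flips state."""
        if ok == self.streak_ok:
            self.streak += 1
        else:
            self.streak_ok = ok
            self.streak = 1
        if self.down and ok and self.streak >= self.recovery_threshold:
            self.down = False
            return True
        if not self.down and not ok and self.streak >= self.failure_threshold:
            self.down = True
            return True
        return False


state = ToyThresholdState()
flips = [state.observe(ok) for ok in (False, False, False, True)]
assert flips == [False, False, True, True]  # down after 3 failures, up after 1 success

The `extract_dedupe_value` override, which returns the scheduled check time, serves a related purpose: a replayed packet for an already-processed scheduled check is treated as a duplicate rather than advancing these counts a second time.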