
Commit a01173f

feat(uptime): Implement detector handler
1 parent d599ed7 commit a01173f

7 files changed: +597 -109 lines changed

src/sentry/uptime/consumers/results_consumer.py

Lines changed: 35 additions & 1 deletion
@@ -26,6 +26,7 @@
 )
 from sentry.uptime.detectors.ranking import _get_cluster
 from sentry.uptime.detectors.result_handler import handle_onboarding_result
+from sentry.uptime.grouptype import UptimePacketValue
 from sentry.uptime.issue_platform import create_issue_platform_occurrence, resolve_uptime_issue
 from sentry.uptime.models import (
     UptimeStatus,
@@ -43,11 +44,17 @@
     send_uptime_config_deletion,
     update_remote_uptime_subscription,
 )
-from sentry.uptime.types import IncidentStatus, ProjectUptimeSubscriptionMode
+from sentry.uptime.types import (
+    DATA_SOURCE_UPTIME_SUBSCRIPTION,
+    IncidentStatus,
+    ProjectUptimeSubscriptionMode,
+)
 from sentry.utils import metrics
 from sentry.utils.arroyo_producer import SingletonProducer
 from sentry.utils.kafka_config import get_kafka_producer_cluster_options, get_topic_definition
+from sentry.workflow_engine.models.data_source import DataPacket
 from sentry.workflow_engine.models.detector import Detector
+from sentry.workflow_engine.processors.data_packet import process_data_packets
 
 logger = logging.getLogger(__name__)
@@ -292,6 +299,33 @@ def handle_active_result(
     result: CheckResult,
     metric_tags: dict[str, str],
 ):
+    organization = detector.project.organization
+
+    if features.has("organizations:uptime-detector-handler", organization):
+        # XXX(epurkhiser): Enabling the uptime-detector-handler will process
+        # check results via the uptime detector handler. However, the handler
+        # WILL NOT produce issue occurrences (it will do nearly everything
+        # else, including logging that it will produce them).
+        packet = UptimePacketValue(
+            check_result=result,
+            subscription=uptime_subscription,
+            metric_tags=metric_tags,
+        )
+        process_data_packets(
+            [DataPacket(source_id=str(uptime_subscription.id), packet=packet)],
+            DATA_SOURCE_UPTIME_SUBSCRIPTION,
+        )
+
+        # Bail if we're doing issue creation via detectors; we don't want to
+        # create issues using the legacy system in that case. If this flag is
+        # not enabled the detector will still run, but will not produce an
+        # issue occurrence.
+        #
+        # Once we've determined that the detector handler produces issues the
+        # same as the legacy issue creation, we can remove this.
+        if features.has("organizations:uptime-detector-create-issues", organization):
+            return
+
     uptime_status = uptime_subscription.uptime_status
     result_status = result["status"]

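Taken together, the two flags stage the rollout of the new handler. A minimal, self-contained sketch of the gating above (illustrative only; the `paths` helper is hypothetical and stands in for the real `features.has` checks):

def paths(flags: set[str]) -> list[str]:
    """Return which code paths handle_active_result runs for a flag combination."""
    ran = []
    if "organizations:uptime-detector-handler" in flags:
        ran.append("detector")  # process_data_packets(...) is invoked
        if "organizations:uptime-detector-create-issues" in flags:
            return ran  # the detector owns issue creation; legacy path is skipped
    ran.append("legacy")  # legacy uptime_status tracking and occurrence creation
    return ran

assert paths(set()) == ["legacy"]
assert paths({"organizations:uptime-detector-handler"}) == ["detector", "legacy"]
assert paths(
    {
        "organizations:uptime-detector-handler",
        "organizations:uptime-detector-create-issues",
    }
) == ["detector"]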
src/sentry/uptime/grouptype.py

Lines changed: 299 additions & 1 deletion
@@ -1,15 +1,312 @@
 from __future__ import annotations
 
+import logging
 from dataclasses import dataclass
+from datetime import datetime
+from typing import override
 
+from django.utils import timezone as django_timezone
+from sentry_kafka_schemas.schema_types.uptime_results_v1 import CheckResult, CheckStatus
+
+from sentry import features, options
 from sentry.issues.grouptype import GroupCategory, GroupType
+from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
+from sentry.issues.status_change_message import StatusChangeMessage
 from sentry.ratelimits.sliding_windows import Quota
 from sentry.types.group import PriorityLevel
+from sentry.uptime.models import UptimeStatus, UptimeSubscription, get_project_subscription
 from sentry.uptime.types import (
     GROUP_TYPE_UPTIME_DOMAIN_CHECK_FAILURE,
     ProjectUptimeSubscriptionMode,
 )
-from sentry.workflow_engine.types import DetectorSettings
+from sentry.utils import metrics
+from sentry.workflow_engine.handlers.detector.base import DetectorOccurrence, EventData
+from sentry.workflow_engine.handlers.detector.stateful import (
+    DetectorThresholds,
+    StatefulDetectorHandler,
+)
+from sentry.workflow_engine.models import DataPacket, Detector
+from sentry.workflow_engine.processors.data_condition_group import ProcessedDataConditionGroup
+from sentry.workflow_engine.types import (
+    DetectorEvaluationResult,
+    DetectorGroupKey,
+    DetectorPriorityLevel,
+    DetectorSettings,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class UptimePacketValue:
+    """
+    Represents the value passed into the uptime detector
+    """
+
+    check_result: CheckResult
+    subscription: UptimeSubscription
+    metric_tags: dict[str, str]
+
+
+def build_detector_fingerprint_component(detector: Detector) -> str:
+    return f"uptime-detector:{detector.id}"
+
+
+def build_fingerprint(detector: Detector) -> list[str]:
+    return [build_detector_fingerprint_component(detector)]
+
+
+def get_active_failure_threshold() -> int:
+    """
+    When in active monitoring mode, how many failures in a row we need to see
+    to mark the monitor as down
+    """
+    return options.get("uptime.active-failure-threshold")
+
+
+def get_active_recovery_threshold() -> int:
+    """
+    When in active monitoring mode, how many successes in a row we need to see
+    to mark it as up
+    """
+    return options.get("uptime.active-recovery-threshold")
+
+
+def build_evidence_display(result: CheckResult) -> list[IssueEvidence]:
+    evidence_display: list[IssueEvidence] = []
+
+    status_reason = result["status_reason"]
+    if status_reason:
+        reason_evidence = IssueEvidence(
+            name="Failure reason",
+            value=f'{status_reason["type"]} - {status_reason["description"]}',
+            important=True,
+        )
+        evidence_display.append(reason_evidence)
+
+    duration_evidence = IssueEvidence(
+        name="Duration",
+        value=f'{result["duration_ms"]}ms',
+        important=False,
+    )
+    evidence_display.append(duration_evidence)
+
+    request_info = result["request_info"]
+    if request_info:
+        method_evidence = IssueEvidence(
+            name="Method",
+            value=request_info["request_type"],
+            important=False,
+        )
+        status_code_evidence = IssueEvidence(
+            name="Status Code",
+            value=str(request_info["http_status_code"]),
+            important=False,
+        )
+        evidence_display.extend([method_evidence, status_code_evidence])
+
+    return evidence_display
+
+
+def build_event_data(result: CheckResult, detector: Detector) -> EventData:
+    # Default environment when it hasn't been configured
+    env = detector.config.get("environment", "prod")
+
+    # Received time is the actual time the check was performed.
+    received = datetime.fromtimestamp(result["actual_check_time_ms"] / 1000)
+
+    # XXX(epurkhiser): This can be changed over to using the detector ID in
+    # the future once we're no longer using the ProjectUptimeSubscription.id
+    # as a tag.
+    project_subscription = get_project_subscription(detector)
+
+    return {
+        "project_id": detector.project_id,
+        "environment": env,
+        "received": received,
+        "platform": "other",
+        "sdk": None,
+        "tags": {
+            "uptime_rule": str(project_subscription.id),
+        },
+        "contexts": {
+            "trace": {"trace_id": result["trace_id"], "span_id": result.get("span_id")},
+        },
+    }
+
+
+class UptimeDetectorHandler(StatefulDetectorHandler[UptimePacketValue, CheckStatus]):
+    @override
+    @property
+    def thresholds(self) -> DetectorThresholds:
+        return {
+            DetectorPriorityLevel.OK: get_active_recovery_threshold(),
+            DetectorPriorityLevel.HIGH: get_active_failure_threshold(),
+        }
+
+    @override
+    def extract_value(self, data_packet: DataPacket[UptimePacketValue]) -> CheckStatus:
+        return data_packet.packet.check_result["status"]
+
+    @override
+    def build_issue_fingerprint(self, group_key: DetectorGroupKey = None) -> list[str]:
+        return build_fingerprint(self.detector)
+
+    @override
+    def extract_dedupe_value(self, data_packet: DataPacket[UptimePacketValue]) -> int:
+        return int(data_packet.packet.check_result["scheduled_check_time_ms"])
+
+    @override
+    def evaluate(
+        self, data_packet: DataPacket[UptimePacketValue]
+    ) -> dict[DetectorGroupKey, DetectorEvaluationResult] | None:
+        result = super().evaluate(data_packet)
+
+        if not result:
+            return result
+
+        # Uptime does not use stateful detector value grouping
+        evaluation = result[None]
+
+        uptime_subscription = data_packet.packet.subscription
+        metric_tags = data_packet.packet.metric_tags
+
+        detector_issue_creation_enabled = features.has(
+            "organizations:uptime-detector-create-issues",
+            self.detector.project.organization,
+        )
+        issue_creation_flag_enabled = features.has(
+            "organizations:uptime-create-issues",
+            self.detector.project.organization,
+        )
+        restricted_host_provider_ids = options.get(
+            "uptime.restrict-issue-creation-by-hosting-provider-id"
+        )
+        host_provider_id = uptime_subscription.host_provider_id
+        host_provider_enabled = host_provider_id not in restricted_host_provider_ids
+
+        issue_creation_allowed = (
+            detector_issue_creation_enabled
+            and issue_creation_flag_enabled
+            and host_provider_enabled
+        )
+
+        # XXX(epurkhiser): We currently duplicate the detector state onto the
+        # uptime_subscription when the detector changes state. Once we stop
+        # using this field we can drop this update logic.
+        #
+        # We ONLY do this when detector issue creation is enabled, otherwise
+        # we let the legacy uptime consumer handle this.
+        if detector_issue_creation_enabled:
+            if evaluation.priority == DetectorPriorityLevel.OK:
+                uptime_status = UptimeStatus.OK
+            else:
+                uptime_status = UptimeStatus.FAILED
+
+            uptime_subscription.update(
+                uptime_status=uptime_status,
+                uptime_status_update_date=django_timezone.now(),
+            )
+
+        if not host_provider_enabled:
+            metrics.incr(
+                "uptime.result_processor.restricted_by_provider",
+                sample_rate=1.0,
+                tags={
+                    "host_provider_id": host_provider_id,
+                    **metric_tags,
+                },
+            )
+
+        result_creates_issue = isinstance(evaluation.result, IssueOccurrence)
+        result_resolves_issue = isinstance(evaluation.result, StatusChangeMessage)
+
+        if result_creates_issue:
+            metrics.incr(
+                "uptime.detector.will_create_issue",
+                tags=metric_tags,
+                sample_rate=1.0,
+            )
+            # XXX(epurkhiser): This logging includes the same extra arguments
+            # as the `uptime_active_sent_occurrence` log in the consumer for
+            # legacy creation
+            logger.info(
+                "uptime.detector.will_create_issue",
+                extra={
+                    "project_id": self.detector.project_id,
+                    "url": uptime_subscription.url,
+                    **data_packet.packet.check_result,
+                },
+            )
+        if result_resolves_issue:
+            metrics.incr(
+                "uptime.detector.will_resolve_issue",
+                sample_rate=1.0,
+                tags=metric_tags,
+            )
+            logger.info(
+                "uptime.detector.will_resolve_issue",
+                extra={
+                    "project_id": self.detector.project_id,
+                    "url": uptime_subscription.url,
+                    **data_packet.packet.check_result,
+                },
+            )
+
+        # Returning an empty dict effectively causes the detector processor
+        # to bail and not produce an issue occurrence.
+        if result_creates_issue and not issue_creation_allowed:
+            return {}
+
+        return result
+
+    @override
+    def create_occurrence(
+        self,
+        evaluation_result: ProcessedDataConditionGroup,
+        data_packet: DataPacket[UptimePacketValue],
+        priority: DetectorPriorityLevel,
+    ) -> tuple[DetectorOccurrence, EventData]:
+        result = data_packet.packet.check_result
+        uptime_subscription = data_packet.packet.subscription
+
+        occurrence = DetectorOccurrence(
+            issue_title=f"Downtime detected for {uptime_subscription.url}",
+            subtitle="Your monitored domain is down",
+            evidence_display=build_evidence_display(result),
+            type=UptimeDomainCheckFailure,
+            level="error",
+            culprit="",  # TODO: The url?
+            assignee=self.detector.owner,
+            priority=priority,
+        )
+        event_data = build_event_data(result, self.detector)
+
+        return (occurrence, event_data)
+
+    def issue_creation_allowed(self, data_packet: DataPacket[UptimePacketValue]) -> bool:
+        uptime_subscription = data_packet.packet.subscription
+
+        detector_issue_creation_enabled = features.has(
+            "organizations:uptime-detector-create-issues",
+            self.detector.project.organization,
+        )
+        issue_creation_flag_enabled = features.has(
+            "organizations:uptime-create-issues",
+            self.detector.project.organization,
+        )
+        restricted_host_provider_ids = options.get(
+            "uptime.restrict-issue-creation-by-hosting-provider-id"
+        )
+        host_provider_id = uptime_subscription.host_provider_id
+        host_provider_enabled = host_provider_id not in restricted_host_provider_ids
+
+        # TODO: metrics
+
+        return (
+            detector_issue_creation_enabled
+            and issue_creation_flag_enabled
+            and host_provider_enabled
+        )
 
 
 @dataclass(frozen=True)
@@ -24,6 +321,7 @@ class UptimeDomainCheckFailure(GroupType):
     enable_auto_resolve = False
     enable_escalation_detection = False
     detector_settings = DetectorSettings(
+        handler=UptimeDetectorHandler,
         config_schema={
             "$schema": "https://json-schema.org/draft/2020-12/schema",
             "description": "A representation of an uptime alert",

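For reference, the `thresholds` property above is what makes the handler stateful: state only changes after a run of consecutive results. A self-contained sketch of that behavior (the threshold values are assumptions standing in for the `uptime.active-failure-threshold` and `uptime.active-recovery-threshold` options; the real counting lives in `StatefulDetectorHandler`):

FAILURE_THRESHOLD = 3   # assumed value of "uptime.active-failure-threshold"
RECOVERY_THRESHOLD = 1  # assumed value of "uptime.active-recovery-threshold"

def transitions(statuses: list[str]) -> list[str]:
    """Replay check statuses and record detector state changes."""
    state = "ok"
    streak_status: str | None = None
    streak = 0
    changes: list[str] = []
    for status in statuses:
        streak = streak + 1 if status == streak_status else 1
        streak_status = status
        if state == "ok" and status == "failure" and streak >= FAILURE_THRESHOLD:
            state = "failed"
            changes.append("ok -> failed")  # handler would create an occurrence
        elif state == "failed" and status == "success" and streak >= RECOVERY_THRESHOLD:
            state = "ok"
            changes.append("failed -> ok")  # handler would resolve the issue
    return changes

# Two failures in a row are not enough; the third trips the HIGH threshold,
# and a single success afterwards recovers the monitor.
assert transitions(["failure", "failure", "success", "failure"]) == []
assert transitions(["failure", "failure", "failure", "success"]) == ["ok -> failed", "failed -> ok"]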
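End to end, the pieces added in this commit connect roughly as follows (a conceptual sketch of the flow, not the workflow_engine API):

# 1. The consumer wraps each check result and hands it to the workflow engine:
#        packet = UptimePacketValue(check_result, subscription, metric_tags)
#        process_data_packets(
#            [DataPacket(source_id=str(subscription.id), packet=packet)],
#            DATA_SOURCE_UPTIME_SUBSCRIPTION,
#        )
# 2. The engine resolves the Detector registered for that data source and,
#    since UptimeDomainCheckFailure's DetectorSettings now names
#    UptimeDetectorHandler as its handler, routes the packet to
#    UptimeDetectorHandler.evaluate.
# 3. When the failure threshold is crossed and issue creation is allowed,
#    create_occurrence builds the DetectorOccurrence and event data that
#    become the issue occurrence.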