 from __future__ import annotations
 
+import logging
 from dataclasses import dataclass
+from datetime import datetime
+from typing import override
 
+from django.utils import timezone as django_timezone
+from sentry_kafka_schemas.schema_types.uptime_results_v1 import CheckResult, CheckStatus
+
+from sentry import features, options
 from sentry.issues.grouptype import GroupCategory, GroupType
+from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
+from sentry.issues.status_change_message import StatusChangeMessage
 from sentry.ratelimits.sliding_windows import Quota
 from sentry.types.group import PriorityLevel
+from sentry.uptime.models import UptimeStatus, UptimeSubscription, get_project_subscription
 from sentry.uptime.types import (
     GROUP_TYPE_UPTIME_DOMAIN_CHECK_FAILURE,
     ProjectUptimeSubscriptionMode,
 )
-from sentry.workflow_engine.types import DetectorSettings
+from sentry.utils import metrics
+from sentry.workflow_engine.handlers.detector.base import DetectorOccurrence, EventData
+from sentry.workflow_engine.handlers.detector.stateful import (
+    DetectorThresholds,
+    StatefulDetectorHandler,
+)
+from sentry.workflow_engine.models import DataPacket, Detector
+from sentry.workflow_engine.processors.data_condition_group import ProcessedDataConditionGroup
+from sentry.workflow_engine.types import (
+    DetectorEvaluationResult,
+    DetectorGroupKey,
+    DetectorPriorityLevel,
+    DetectorSettings,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class UptimePacketValue:
+    """
+    Represents the value passed into the uptime detector
+    """
+
+    check_result: CheckResult
+    subscription: UptimeSubscription
+    metric_tags: dict[str, str]
+
+
+def build_detector_fingerprint_component(detector: Detector) -> str:
+    return f"uptime-detector:{detector.id}"
+
+
+def build_fingerprint(detector: Detector) -> list[str]:
+    return [build_detector_fingerprint_component(detector)]
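+
+
+# Illustrative only: for a detector with id 42 (a made-up id), the fingerprint
+# is ["uptime-detector:42"], so every occurrence produced by the same detector
+# groups into a single issue.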
+
+
+def get_active_failure_threshold() -> int:
+    """
+    When in active monitoring mode, overrides how many failures in a row we
+    need to see to mark the monitor as down
+    """
+    return options.get("uptime.active-failure-threshold")
+
+
+def get_active_recovery_threshold() -> int:
+    """
+    When in active monitoring mode, how many successes in a row we need to
+    see to mark the monitor as up
+    """
+    return options.get("uptime.active-recovery-threshold")
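+
+
+# Both thresholds above are runtime options, so detector sensitivity can be
+# tuned without a deploy. As an illustration (these values are assumptions,
+# not actual defaults): a failure threshold of 3 requires three consecutive
+# failed checks before the detector transitions to HIGH, while a recovery
+# threshold of 1 marks it OK again after a single successful check.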
+
+
+def build_evidence_display(result: CheckResult) -> list[IssueEvidence]:
+    evidence_display: list[IssueEvidence] = []
+
+    status_reason = result["status_reason"]
+    if status_reason:
+        reason_evidence = IssueEvidence(
+            name="Failure reason",
+            value=f'{status_reason["type"]} - {status_reason["description"]}',
+            important=True,
+        )
+        evidence_display.append(reason_evidence)
+
+    duration_evidence = IssueEvidence(
+        name="Duration",
+        value=f"{result['duration_ms']} ms",
+        important=False,
+    )
+    evidence_display.append(duration_evidence)
+
+    request_info = result["request_info"]
+    if request_info:
+        method_evidence = IssueEvidence(
+            name="Method",
+            value=request_info["request_type"],
+            important=False,
+        )
+        status_code_evidence = IssueEvidence(
+            name="Status Code",
+            value=str(request_info["http_status_code"]),
+            important=False,
+        )
+        evidence_display.extend([method_evidence, status_code_evidence])
+
+    return evidence_display
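+
+
+# Sketch of the output for a hypothetical failed HTTP check (all values below
+# are invented for illustration): a "Failure reason" entry such as
+# "timeout - request timed out" marked important, a "Duration" entry like
+# "30000 ms", plus "Method" and "Status Code" entries when request_info is set.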
+
+
+def build_event_data(result: CheckResult, detector: Detector) -> EventData:
+    # Default environment when it hasn't been configured
+    env = detector.config.get("environment", "prod")
+
+    # Received time is the actual time the check was performed.
+    received = datetime.fromtimestamp(result["actual_check_time_ms"] / 1000)
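+    # Note: fromtimestamp() without a tz argument yields a naive local-time
+    # datetime; this assumes downstream consumers of "received" accept that
+    # rather than an aware UTC timestamp.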
+
+    # XXX(epurkhiser): This can be changed over to using the detector ID in the
+    # future once we're no longer using the ProjectUptimeSubscription.id as a tag.
+    project_subscription = get_project_subscription(detector)
+
+    return {
+        "project_id": detector.project_id,
+        "environment": env,
+        "received": received,
+        "platform": "other",
+        "sdk": None,
+        "tags": {
+            "uptime_rule": str(project_subscription.id),
+        },
+        "contexts": {
+            "trace": {"trace_id": result["trace_id"], "span_id": result.get("span_id")},
+        },
+    }
+
+
+class UptimeDetectorHandler(StatefulDetectorHandler[UptimePacketValue, CheckStatus]):
+    @override
+    @property
+    def thresholds(self) -> DetectorThresholds:
+        return {
+            DetectorPriorityLevel.OK: get_active_recovery_threshold(),
+            DetectorPriorityLevel.HIGH: get_active_failure_threshold(),
+        }
+
+    @override
+    def extract_value(self, data_packet: DataPacket[UptimePacketValue]) -> CheckStatus:
+        return data_packet.packet.check_result["status"]
+
+    @override
+    def build_issue_fingerprint(self, group_key: DetectorGroupKey = None) -> list[str]:
+        return build_fingerprint(self.detector)
+
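+    # Deduping on the scheduled check time means a redelivered or duplicated
+    # result for the same scheduled check is only processed once.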
+    @override
+    def extract_dedupe_value(self, data_packet: DataPacket[UptimePacketValue]) -> int:
+        return int(data_packet.packet.check_result["scheduled_check_time_ms"])
+
+    @override
+    def evaluate(
+        self, data_packet: DataPacket[UptimePacketValue]
+    ) -> dict[DetectorGroupKey, DetectorEvaluationResult] | None:
+        result = super().evaluate(data_packet)
+
+        if not result:
+            return result
+
+        # Uptime does not use stateful detector value grouping
+        evaluation = result[None]
+
+        uptime_subscription = data_packet.packet.subscription
+        metric_tags = data_packet.packet.metric_tags
+
+        detector_issue_creation_enabled = features.has(
+            "organizations:uptime-detector-create-issues",
+            self.detector.project.organization,
+        )
+        issue_creation_flag_enabled = features.has(
+            "organizations:uptime-create-issues",
+            self.detector.project.organization,
+        )
+        restricted_host_provider_ids = options.get(
+            "uptime.restrict-issue-creation-by-hosting-provider-id"
+        )
+        host_provider_id = uptime_subscription.host_provider_id
+        host_provider_enabled = host_provider_id not in restricted_host_provider_ids
+
+        issue_creation_allowed = (
+            detector_issue_creation_enabled
+            and issue_creation_flag_enabled
+            and host_provider_enabled
+        )
+
+        # XXX(epurkhiser): We currently are duplicating the detector state onto
+        # the uptime_subscription when the detector changes state. Once we stop
+        # using this field we can drop this update logic.
+        #
+        # We ONLY do this when detector issue creation is enabled, otherwise we
+        # let the legacy uptime consumer handle this.
+        if detector_issue_creation_enabled:
+            if evaluation.priority == DetectorPriorityLevel.OK:
+                uptime_status = UptimeStatus.OK
+            else:
+                uptime_status = UptimeStatus.FAILED
+
+            uptime_subscription.update(
+                uptime_status=uptime_status,
+                uptime_status_update_date=django_timezone.now(),
+            )
+
+        if not host_provider_enabled:
+            metrics.incr(
+                "uptime.result_processor.restricted_by_provider",
+                sample_rate=1.0,
+                tags={
+                    "host_provider_id": host_provider_id,
+                    **metric_tags,
+                },
+            )
+
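+        # evaluation.result is an IssueOccurrence when the detector is newly
+        # entering a failed state and a StatusChangeMessage when it is
+        # resolving, so at most one of these two flags is set.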
+        result_creates_issue = isinstance(evaluation.result, IssueOccurrence)
+        result_resolves_issue = isinstance(evaluation.result, StatusChangeMessage)
+
+        if result_creates_issue:
+            metrics.incr(
+                "uptime.detector.will_create_issue",
+                tags=metric_tags,
+                sample_rate=1.0,
+            )
+            # XXX(epurkhiser): This logging includes the same extra arguments
+            # as the `uptime_active_sent_occurrence` log in the consumer for
+            # legacy creation
+            logger.info(
+                "uptime.detector.will_create_issue",
+                extra={
+                    "project_id": self.detector.project_id,
+                    "url": uptime_subscription.url,
+                    **data_packet.packet.check_result,
+                },
+            )
+        if result_resolves_issue:
+            metrics.incr(
+                "uptime.detector.will_resolve_issue",
+                sample_rate=1.0,
+                tags=metric_tags,
+            )
+            logger.info(
+                "uptime.detector.will_resolve_issue",
+                extra={
+                    "project_id": self.detector.project_id,
+                    "url": uptime_subscription.url,
+                    **data_packet.packet.check_result,
+                },
+            )
+
+        # Returning an empty dict effectively causes the detector processor to
+        # bail and not produce an issue occurrence.
+        if result_creates_issue and not issue_creation_allowed:
+            return {}
+
+        return result
+
+    @override
+    def create_occurrence(
+        self,
+        evaluation_result: ProcessedDataConditionGroup,
+        data_packet: DataPacket[UptimePacketValue],
+        priority: DetectorPriorityLevel,
+    ) -> tuple[DetectorOccurrence, EventData]:
+        result = data_packet.packet.check_result
+        uptime_subscription = data_packet.packet.subscription
+
+        occurrence = DetectorOccurrence(
+            issue_title=f"Downtime detected for {uptime_subscription.url}",
+            subtitle="Your monitored domain is down",
+            evidence_display=build_evidence_display(result),
+            type=UptimeDomainCheckFailure,
+            level="error",
+            culprit="",  # TODO: The url?
+            assignee=self.detector.owner,
+            priority=priority,
+        )
+        event_data = build_event_data(result, self.detector)
+
+        return (occurrence, event_data)
+
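+    # Note: the checks below mirror the feature-flag and host-provider gating
+    # done inline in evaluate() above; the two code paths should stay in sync.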
+    def issue_creation_allowed(self, data_packet: DataPacket[UptimePacketValue]) -> bool:
+        uptime_subscription = data_packet.packet.subscription
+
+        detector_issue_creation_enabled = features.has(
+            "organizations:uptime-detector-create-issues",
+            self.detector.project.organization,
+        )
+        issue_creation_flag_enabled = features.has(
+            "organizations:uptime-create-issues",
+            self.detector.project.organization,
+        )
+        restricted_host_provider_ids = options.get(
+            "uptime.restrict-issue-creation-by-hosting-provider-id"
+        )
+        host_provider_id = uptime_subscription.host_provider_id
+        host_provider_enabled = host_provider_id not in restricted_host_provider_ids
+
+        # TODO metrics
+
+        return (
+            detector_issue_creation_enabled
+            and issue_creation_flag_enabled
+            and host_provider_enabled
+        )
 
 
 @dataclass(frozen=True)
@@ -24,6 +321,7 @@ class UptimeDomainCheckFailure(GroupType):
     enable_auto_resolve = False
     enable_escalation_detection = False
     detector_settings = DetectorSettings(
+        handler=UptimeDetectorHandler,
         config_schema={
             "$schema": "https://json-schema.org/draft/2020-12/schema",
             "description": "A representation of an uptime alert",