11from __future__ import annotations
22
3+ import logging
34from dataclasses import dataclass
5+ from datetime import datetime
6+ from typing import override
47
8+ from django .utils import timezone as django_timezone
9+ from sentry_kafka_schemas .schema_types .uptime_results_v1 import CheckResult , CheckStatus
10+
11+ from sentry import features , options
512from sentry .issues .grouptype import GroupCategory , GroupType
13+ from sentry .issues .issue_occurrence import IssueEvidence , IssueOccurrence
14+ from sentry .issues .status_change_message import StatusChangeMessage
615from sentry .ratelimits .sliding_windows import Quota
716from sentry .types .group import PriorityLevel
17+ from sentry .uptime .models import UptimeStatus , UptimeSubscription , get_project_subscription
818from sentry .uptime .types import (
919 GROUP_TYPE_UPTIME_DOMAIN_CHECK_FAILURE ,
1020 ProjectUptimeSubscriptionMode ,
1121)
12- from sentry .workflow_engine .types import DetectorSettings
22+ from sentry .utils import metrics
23+ from sentry .workflow_engine .handlers .detector .base import DetectorOccurrence , EventData
24+ from sentry .workflow_engine .handlers .detector .stateful import (
25+ DetectorThresholds ,
26+ StatefulDetectorHandler ,
27+ )
28+ from sentry .workflow_engine .models import DataPacket , Detector
29+ from sentry .workflow_engine .processors .data_condition_group import ProcessedDataConditionGroup
30+ from sentry .workflow_engine .types import (
31+ DetectorEvaluationResult ,
32+ DetectorGroupKey ,
33+ DetectorPriorityLevel ,
34+ DetectorSettings ,
35+ )
36+
37+ logger = logging .getLogger (__name__ )
38+
39+
@dataclass(frozen=True)
class UptimePacketValue:
    """
    Immutable payload carried by the data packets handed to the uptime
    detector.
    """

    # The check result the detector evaluates.
    check_result: CheckResult
    # The uptime subscription travelling with the check result.
    subscription: UptimeSubscription
    # Tags attached to the metrics emitted while handling this packet.
    metric_tags: dict[str, str]
48+ metric_tags : dict [str , str ]
49+
50+
def build_detector_fingerprint_component(detector: Detector) -> str:
    """
    Return the fingerprint component for ``detector``: the literal prefix
    ``uptime-detector:`` followed by the detector's ``id``.
    """
    detector_id = detector.id
    return f"uptime-detector:{detector_id}"
53+
54+
def build_fingerprint(detector: Detector) -> list[str]:
    """
    Return the issue fingerprint for ``detector``: a one-element list holding
    the detector's fingerprint component.
    """
    component = build_detector_fingerprint_component(detector)
    return [component]
57+
58+
def get_active_failure_threshold() -> int:
    """
    Read the ``uptime.active-failure-threshold`` option.

    When in active monitoring mode this overrides how many failures in a row
    must be seen before the monitor is marked as down.
    """
    failure_threshold: int = options.get("uptime.active-failure-threshold")
    return failure_threshold
65+
66+
def get_active_recovery_threshold() -> int:
    """
    Read the ``uptime.active-recovery-threshold`` option.

    When in active monitoring mode this is how many successes in a row are
    needed before the monitor is marked as up.
    """
    recovery_threshold: int = options.get("uptime.active-recovery-threshold")
    return recovery_threshold
73+
74+
def build_evidence_display(result: CheckResult) -> list[IssueEvidence]:
    """
    Build the evidence rows displayed for the check ``result``.

    Rows are produced in a fixed order:

    1. "Failure reason" (marked important) -- only when ``status_reason`` is
       truthy.
    2. "Duration" -- always present.
    3. "Method" and "Status Code" -- only when ``request_info`` is truthy.
    """
    rows: list[IssueEvidence] = []

    reason = result["status_reason"]
    if reason:
        rows.append(
            IssueEvidence(
                name="Failure reason",
                value=f'{reason["type"]} - {reason["description"]}',
                important=True,
            )
        )

    rows.append(
        IssueEvidence(
            name="Duration",
            value=f"{result['duration_ms']}ms",
            important=False,
        )
    )

    request = result["request_info"]
    if request:
        rows.append(
            IssueEvidence(
                name="Method",
                value=request["request_type"],
                important=False,
            )
        )
        rows.append(
            IssueEvidence(
                name="Status Code",
                # The status code is stringified as-is for display.
                value=str(request["http_status_code"]),
                important=False,
            )
        )

    return rows
109+
110+
def build_event_data(result: CheckResult, detector: Detector) -> EventData:
    """
    Build the event payload that accompanies an uptime occurrence.

    The payload carries the detector's project id, the environment from the
    detector config (``"prod"`` when none is configured), the time the check
    was actually performed, an ``uptime_rule`` tag, and the trace context
    taken from ``result``.
    """
    # Default environment when it hasn't been configured
    environment = detector.config.get("environment", "prod")

    # Received time is the actual time the check was performed.
    # NOTE(review): fromtimestamp() without a tz argument returns a naive
    # datetime in the process's local timezone -- confirm consumers expect that.
    check_time_seconds = result["actual_check_time_ms"] / 1000
    received_at = datetime.fromtimestamp(check_time_seconds)

    # XXX(epurkhiser): This can be changed over to using the detector ID in the
    # future once we're no longer using the ProjectUptimeSubscription.id as a tag.
    project_subscription = get_project_subscription(detector)

    tags = {"uptime_rule": str(project_subscription.id)}
    trace_context = {
        "trace_id": result["trace_id"],
        "span_id": result.get("span_id"),
    }

    return {
        "project_id": detector.project_id,
        "environment": environment,
        "received": received_at,
        "platform": "other",
        "sdk": None,
        "tags": tags,
        "contexts": {"trace": trace_context},
    }
135+
136+
class UptimeDetectorHandler(StatefulDetectorHandler[UptimePacketValue, CheckStatus]):
    """
    Stateful detector handler for uptime checks.

    The evaluated value is the check result's ``status``; packets are
    de-duplicated on the check's ``scheduled_check_time_ms``.
    """

    @override
    @property
    def thresholds(self) -> DetectorThresholds:
        """
        Thresholds per priority level, read from the active recovery (OK) and
        active failure (HIGH) options each time the property is accessed.
        """
        return {
            DetectorPriorityLevel.OK: get_active_recovery_threshold(),
            DetectorPriorityLevel.HIGH: get_active_failure_threshold(),
        }

    @override
    def extract_value(self, data_packet: DataPacket[UptimePacketValue]) -> CheckStatus:
        """Return the ``status`` of the check result carried by the packet."""
        return data_packet.packet.check_result["status"]

    @override
    def build_issue_fingerprint(self, group_key: DetectorGroupKey = None) -> list[str]:
        """
        Return the issue fingerprint for this handler's detector.

        ``group_key`` is accepted for interface compatibility and is not used.
        """
        # TODO(epurkhiser): We should migrate the fingerprints over to match
        # what the default fingerprint is.
        return build_fingerprint(self.detector)

    @override
    def extract_dedupe_value(self, data_packet: DataPacket[UptimePacketValue]) -> int:
        """Return the check's ``scheduled_check_time_ms`` as an ``int``."""
        return int(data_packet.packet.check_result["scheduled_check_time_ms"])

    @override
    def evaluate(
        self, data_packet: DataPacket[UptimePacketValue]
    ) -> dict[DetectorGroupKey, DetectorEvaluationResult]:
        """
        Run the base stateful evaluation, then apply uptime-specific handling.

        When the base evaluation yields a non-empty result this method:

        * mirrors the evaluated priority onto the subscription's
          ``uptime_status`` (only when the
          ``organizations:uptime-detector-create-issues`` feature is enabled),
        * counts results whose host provider is listed in the
          ``uptime.restrict-issue-creation-by-hosting-provider-id`` option,
        * emits a metric and a log line when the result would create or
          resolve an issue.

        Returns the base result unchanged, except that an empty dict is
        returned when the result would create an issue but issue creation is
        not allowed (either feature flag disabled, or a restricted host
        provider).
        """
        result = super().evaluate(data_packet)

        if not result:
            return result

        # Uptime does not use stateful detector value grouping
        evaluation = result[None]

        uptime_subscription = data_packet.packet.subscription
        metric_tags = data_packet.packet.metric_tags

        organization = self.detector.project.organization
        detector_issue_creation_enabled = features.has(
            "organizations:uptime-detector-create-issues",
            organization,
        )
        issue_creation_flag_enabled = features.has(
            "organizations:uptime-create-issues",
            organization,
        )
        restricted_host_provider_ids = options.get(
            "uptime.restrict-issue-creation-by-hosting-provider-id"
        )
        host_provider_id = uptime_subscription.host_provider_id
        host_provider_enabled = host_provider_id not in restricted_host_provider_ids

        issue_creation_allowed = (
            detector_issue_creation_enabled
            and issue_creation_flag_enabled
            and host_provider_enabled
        )

        # XXX(epurkhiser): We currently are duplicating the detector state onto
        # the uptime_subscription when the detector changes state. Once we stop
        # using this field we can drop this update logic.
        #
        # We ONLY do this when detector issue creation is enabled, otherwise we
        # let the legacy uptime consumer handle this.
        if detector_issue_creation_enabled:
            # Any priority other than OK is recorded as a failure.
            uptime_status = (
                UptimeStatus.OK
                if evaluation.priority == DetectorPriorityLevel.OK
                else UptimeStatus.FAILED
            )
            uptime_subscription.update(
                uptime_status=uptime_status,
                uptime_status_update_date=django_timezone.now(),
            )

        if not host_provider_enabled:
            metrics.incr(
                "uptime.result_processor.restricted_by_provider",
                sample_rate=1.0,
                tags={
                    "host_provider_id": host_provider_id,
                    **metric_tags,
                },
            )

        result_creates_issue = isinstance(evaluation.result, IssueOccurrence)
        result_resolves_issue = isinstance(evaluation.result, StatusChangeMessage)

        if result_creates_issue:
            # XXX(epurkhiser): This logging includes the same extra arguments
            # as the `uptime_active_sent_occurrence` log in the consumer for
            # legacy creation
            self._report_uptime_transition("uptime.detector.will_create_issue", data_packet)
        if result_resolves_issue:
            self._report_uptime_transition("uptime.detector.will_resolve_issue", data_packet)

        # Returning an empty dict effectively causes the detector processor to
        # bail and not produce an issue occurrence.
        if result_creates_issue and not issue_creation_allowed:
            return {}

        return result

    def _report_uptime_transition(
        self, event_name: str, data_packet: DataPacket[UptimePacketValue]
    ) -> None:
        """
        Emit the metric and the info log named ``event_name`` for this packet.

        The log's ``extra`` holds the detector's project id, the subscription
        url and every field of the check result.
        """
        packet = data_packet.packet
        metrics.incr(
            event_name,
            tags=packet.metric_tags,
            sample_rate=1.0,
        )
        logger.info(
            event_name,
            extra={
                "project_id": self.detector.project_id,
                "url": packet.subscription.url,
                **packet.check_result,
            },
        )

    @override
    def create_occurrence(
        self,
        evaluation_result: ProcessedDataConditionGroup,
        data_packet: DataPacket[UptimePacketValue],
        priority: DetectorPriorityLevel,
    ) -> tuple[DetectorOccurrence, EventData]:
        """
        Build the occurrence and event data for a downtime issue.

        The occurrence title names the subscription url, its evidence comes
        from the check result, and it is assigned to the detector's owner with
        the given ``priority``. ``evaluation_result`` is not used here.
        """
        result = data_packet.packet.check_result
        uptime_subscription = data_packet.packet.subscription

        occurrence = DetectorOccurrence(
            issue_title=f"Downtime detected for {uptime_subscription.url}",
            subtitle="Your monitored domain is down",
            evidence_display=build_evidence_display(result),
            type=UptimeDomainCheckFailure,
            level="error",
            culprit="",  # TODO: The url?
            assignee=self.detector.owner,
            priority=priority,
        )
        event_data = build_event_data(result, self.detector)

        return (occurrence, event_data)
13287
14288
15289@dataclass (frozen = True )
@@ -24,6 +298,7 @@ class UptimeDomainCheckFailure(GroupType):
24298 enable_auto_resolve = False
25299 enable_escalation_detection = False
26300 detector_settings = DetectorSettings (
301+ handler = UptimeDetectorHandler ,
27302 config_schema = {
28303 "$schema" : "https://json-schema.org/draft/2020-12/schema" ,
29304 "description" : "A representation of an uptime alert" ,
0 commit comments