feat(uptime): Implement detector handler #91107
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ | |
| ) | ||
| from sentry.uptime.detectors.ranking import _get_cluster | ||
| from sentry.uptime.detectors.result_handler import handle_onboarding_result | ||
| from sentry.uptime.grouptype import UptimePacketValue | ||
| from sentry.uptime.issue_platform import create_issue_platform_occurrence, resolve_uptime_issue | ||
| from sentry.uptime.models import ( | ||
| UptimeStatus, | ||
|
|
@@ -43,11 +44,17 @@ | |
| send_uptime_config_deletion, | ||
| update_remote_uptime_subscription, | ||
| ) | ||
| from sentry.uptime.types import IncidentStatus, ProjectUptimeSubscriptionMode | ||
| from sentry.uptime.types import ( | ||
| DATA_SOURCE_UPTIME_SUBSCRIPTION, | ||
| IncidentStatus, | ||
| ProjectUptimeSubscriptionMode, | ||
| ) | ||
| from sentry.utils import metrics | ||
| from sentry.utils.arroyo_producer import SingletonProducer | ||
| from sentry.utils.kafka_config import get_kafka_producer_cluster_options, get_topic_definition | ||
| from sentry.workflow_engine.models.data_source import DataPacket | ||
| from sentry.workflow_engine.models.detector import Detector | ||
| from sentry.workflow_engine.processors.data_packet import process_data_packets | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
@@ -292,6 +299,33 @@ def handle_active_result( | |
| result: CheckResult, | ||
| metric_tags: dict[str, str], | ||
| ): | ||
| organization = detector.project.organization | ||
|
|
||
| if features.has("organizations:uptime-detector-handler", organization): | ||
| # XXX(epurkhiser): Enabling the uptime-detector-handler will process | ||
| # check results via the uptime detector handler. However, the handler | ||
| # WILL NOT produce issue occurrences (though it will do nearly | ||
| # everything else, including logging that it will produce) | ||
| packet = UptimePacketValue( | ||
| check_result=result, | ||
| subscription=uptime_subscription, | ||
| metric_tags=metric_tags, | ||
| ) | ||
| process_data_packets( | ||
| [DataPacket(source_id=str(uptime_subscription.id), packet=packet)], | ||
| DATA_SOURCE_UPTIME_SUBSCRIPTION, | ||
| ) | ||
|
|
||
| # Bail if we're doing issue creation via detectors; we don't want to | ||
| # create issues using the legacy system in this case. If this flag is | ||
| # not enabled the detector will still run, but will not produce an | ||
| # issue occurrence. | ||
| # | ||
| # Once we've determined that the detector handler is producing issues | ||
| # the same as the legacy issue creation, we can remove this. | ||
| if features.has("organizations:uptime-detector-create-issues", organization): | ||
|
Member (Author)
This bit is important as it makes sure we don't create multiple occurrences at the same time.
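For reference, a rough summary of how the two flags interact, as read from this diff (an interpretation, not an authoritative rollout spec):

```python
# Interpretation of the two flags used in this block (not authoritative):
# (uptime-detector-handler, uptime-detector-create-issues) -> behavior
ROLLOUT_STATES = {
    (False, False): "legacy consumer path creates/resolves occurrences",
    (True, False): "detector handler runs but produces no occurrences; legacy path still does",
    (True, True): "detector handler produces occurrences; legacy path bails out here",
}
```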
||
| return | ||
|
|
||
| uptime_status = uptime_subscription.uptime_status | ||
| result_status = result["status"] | ||
|
|
||
|
|
||
|
Member (Author)
For metrics we also have the handler inside the
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
| @@ -1,15 +1,289 @@ | ||||
| from __future__ import annotations | ||||
|
|
||||
| import logging | ||||
| from dataclasses import dataclass | ||||
| from datetime import datetime | ||||
| from typing import override | ||||
|
|
||||
| from django.utils import timezone as django_timezone | ||||
| from sentry_kafka_schemas.schema_types.uptime_results_v1 import CheckResult, CheckStatus | ||||
|
|
||||
| from sentry import features, options | ||||
| from sentry.issues.grouptype import GroupCategory, GroupType | ||||
| from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence | ||||
| from sentry.issues.status_change_message import StatusChangeMessage | ||||
| from sentry.ratelimits.sliding_windows import Quota | ||||
| from sentry.types.group import PriorityLevel | ||||
| from sentry.uptime.models import UptimeStatus, UptimeSubscription, get_project_subscription | ||||
| from sentry.uptime.types import ( | ||||
| GROUP_TYPE_UPTIME_DOMAIN_CHECK_FAILURE, | ||||
| ProjectUptimeSubscriptionMode, | ||||
| ) | ||||
| from sentry.workflow_engine.types import DetectorSettings | ||||
| from sentry.utils import metrics | ||||
| from sentry.workflow_engine.handlers.detector.base import DetectorOccurrence, EventData | ||||
| from sentry.workflow_engine.handlers.detector.stateful import ( | ||||
| DetectorThresholds, | ||||
| StatefulDetectorHandler, | ||||
| ) | ||||
| from sentry.workflow_engine.models import DataPacket, Detector | ||||
| from sentry.workflow_engine.processors.data_condition_group import ProcessedDataConditionGroup | ||||
| from sentry.workflow_engine.types import ( | ||||
| DetectorEvaluationResult, | ||||
| DetectorGroupKey, | ||||
| DetectorPriorityLevel, | ||||
| DetectorSettings, | ||||
| ) | ||||
|
|
||||
| logger = logging.getLogger(__name__) | ||||
|
|
||||
|
|
||||
| @dataclass(frozen=True) | ||||
| class UptimePacketValue: | ||||
| """ | ||||
| Represents the value passed into the uptime detector | ||||
| """ | ||||
|
|
||||
| check_result: CheckResult | ||||
| subscription: UptimeSubscription | ||||
| metric_tags: dict[str, str] | ||||
|
|
||||
|
|
||||
| def build_detector_fingerprint_component(detector: Detector) -> str: | ||||
| return f"uptime-detector:{detector.id}" | ||||
|
Comment on lines +51 to +52
Member
I think we need to start sending this new fingerprint type along with issue occurrences now, so then we can backfill
Member (Author)
We have been actually, we use the same fingerprint here
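A minimal sketch of what sending the new fingerprint component alongside legacy occurrences could look like, using the helper added in this diff; the wrapper function name is hypothetical and the existing legacy fingerprint components are left opaque:

```python
from sentry.uptime.grouptype import build_detector_fingerprint_component
from sentry.workflow_engine.models import Detector


def legacy_fingerprint_with_detector(legacy_components: list[str], detector: Detector) -> list[str]:
    # Append the detector-based component so groups created by the legacy path
    # already carry the identifier the new handler fingerprints on, making a
    # later backfill a matter of matching on this component.
    return [*legacy_components, build_detector_fingerprint_component(detector)]
```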
|
||||
|
|
||||
|
|
||||
| def build_fingerprint(detector: Detector) -> list[str]: | ||||
| return [build_detector_fingerprint_component(detector)] | ||||
|
|
||||
|
|
||||
| def get_active_failure_threshold() -> int: | ||||
| """ | ||||
| When in active monitoring mode, overrides how many failures in a row we | ||||
| need to see to mark the monitor as down | ||||
| """ | ||||
| return options.get("uptime.active-failure-threshold") | ||||
|
|
||||
|
|
||||
| def get_active_recovery_threshold() -> int: | ||||
| """ | ||||
| When in active monitoring mode, how many successes in a row do we need to | ||||
| mark it as up | ||||
| """ | ||||
| return options.get("uptime.active-recovery-threshold") | ||||
|
|
||||
|
|
||||
| def build_evidence_display(result: CheckResult) -> list[IssueEvidence]: | ||||
|
Member (Author)
Shared now with the
||||
| evidence_display: list[IssueEvidence] = [] | ||||
|
|
||||
| status_reason = result["status_reason"] | ||||
| if status_reason: | ||||
| reason_evidence = IssueEvidence( | ||||
| name="Failure reason", | ||||
| value=f'{status_reason["type"]} - {status_reason["description"]}', | ||||
| important=True, | ||||
| ) | ||||
| evidence_display.extend([reason_evidence]) | ||||
|
|
||||
| duration_evidence = IssueEvidence( | ||||
| name="Duration", | ||||
| value=f"{result["duration_ms"]}ms", | ||||
| important=False, | ||||
| ) | ||||
| evidence_display.append(duration_evidence) | ||||
|
|
||||
| request_info = result["request_info"] | ||||
| if request_info: | ||||
| method_evidence = IssueEvidence( | ||||
| name="Method", | ||||
| value=request_info["request_type"], | ||||
| important=False, | ||||
| ) | ||||
| status_code_evidence = IssueEvidence( | ||||
| name="Status Code", | ||||
| value=str(request_info["http_status_code"]), | ||||
| important=False, | ||||
| ) | ||||
| evidence_display.extend([method_evidence, status_code_evidence]) | ||||
|
|
||||
| return evidence_display | ||||
|
|
||||
|
|
||||
| def build_event_data(result: CheckResult, detector: Detector) -> EventData: | ||||
|
Member (Author)
Also shared now with the issue platform
||||
| # Default environment when it hasn't been configured | ||||
| env = detector.config.get("environment", "prod") | ||||
|
Contributor
Is this normal behavior? Does every detector need this? Should we at least make the default a constant?
Member (Author)
This is specific to the uptime detector. Unfortunately we hardcode default environment names in a number of places in the sentry backend, so this is par for the course right now.
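A minimal sketch of the constant suggested above; the constant and helper names are hypothetical and not part of this PR:

```python
from sentry.workflow_engine.models import Detector

# Hypothetical module-level constant for the hardcoded default environment name.
DEFAULT_UPTIME_ENVIRONMENT = "prod"


def get_detector_environment(detector: Detector) -> str:
    # Fall back to the shared default when no environment is configured on the detector.
    return detector.config.get("environment", DEFAULT_UPTIME_ENVIRONMENT)
```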
||||
|
|
||||
| # Received time is the actual time the check was performed. | ||||
| received = datetime.fromtimestamp(result["actual_check_time_ms"] / 1000) | ||||
|
|
||||
| # XXX(epurkhiser): This can be changed over to using the detector ID in the | ||||
| # future once we're no longer using the ProjectUptimeSubscription.id as a tag. | ||||
| project_subscription = get_project_subscription(detector) | ||||
|
|
||||
| return { | ||||
| "project_id": detector.project_id, | ||||
| "environment": env, | ||||
| "received": received, | ||||
| "platform": "other", | ||||
| "sdk": None, | ||||
| "tags": { | ||||
| "uptime_rule": str(project_subscription.id), | ||||
| }, | ||||
| "contexts": { | ||||
| "trace": {"trace_id": result["trace_id"], "span_id": result.get("span_id")}, | ||||
| }, | ||||
| } | ||||
|
|
||||
|
|
||||
| class UptimeDetectorHandler(StatefulDetectorHandler[UptimePacketValue, CheckStatus]): | ||||
| @override | ||||
| @property | ||||
| def thresholds(self) -> DetectorThresholds: | ||||
| return { | ||||
| DetectorPriorityLevel.OK: get_active_recovery_threshold(), | ||||
| DetectorPriorityLevel.HIGH: get_active_failure_threshold(), | ||||
| } | ||||
|
|
||||
| @override | ||||
| def extract_value(self, data_packet: DataPacket[UptimePacketValue]) -> CheckStatus: | ||||
| return data_packet.packet.check_result["status"] | ||||
|
|
||||
| @override | ||||
| def build_issue_fingerprint(self, group_key: DetectorGroupKey = None) -> list[str]: | ||||
| # TODO(epurkhiser): We should migrate the fingerprints over to match | ||||
| # what the default fingerprint is. | ||||
| return build_fingerprint(self.detector) | ||||
|
Member (Author)
TODO: This isn't the only fingerprint actually; the stateful detector also adds its own "default" fingerprint, which ends up being just the detector ID. This means these issues could potentially collide with our previous issues that were using the project subscription ID as the fingerprint.
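A worked illustration of the collision concern, with made-up IDs; the fingerprint shapes below are assumed from the comment above rather than taken from the code:

```python
detector_id = 123
unrelated_subscription_id = 123  # a legacy project subscription that happens to share the number

# Assumed shape: the explicit component from build_fingerprint() plus the
# stateful handler's default component ("just the detector ID").
new_fingerprint = [f"uptime-detector:{detector_id}", str(detector_id)]

# Assumed shape of a legacy fingerprint keyed off the project subscription ID.
legacy_fingerprint = [str(unrelated_subscription_id)]

# The bare detector ID component could match an unrelated legacy group.
assert new_fingerprint[1] == legacy_fingerprint[0]
```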
||||
|
|
||||
| @override | ||||
| def extract_dedupe_value(self, data_packet: DataPacket[UptimePacketValue]) -> int: | ||||
| return int(data_packet.packet.check_result["scheduled_check_time_ms"]) | ||||
|
|
||||
| @override | ||||
| def evaluate( | ||||
|
Contributor
High level, I feel we should aim for this evaluate function to be pure. All the side effects and extra conditionals in here are a design smell. I'd encourage us to generalize these:
Member (Author)
Yes, I agree. Parts of this have to do with the fact that we're migrating away from our existing issue creation system for uptime. Once we've cleaned up more of that, this should simplify and we can clean things up further.
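One possible shape for the direction discussed above, as a sketch only; the class and method names here are hypothetical, and the real evaluate in this diff keeps these concerns inline for now:

```python
from typing import override

from sentry_kafka_schemas.schema_types.uptime_results_v1 import CheckStatus

from sentry.uptime.grouptype import UptimePacketValue
from sentry.workflow_engine.handlers.detector.stateful import StatefulDetectorHandler
from sentry.workflow_engine.models import DataPacket


class PureEvaluateSketch(StatefulDetectorHandler[UptimePacketValue, CheckStatus]):
    """Sketch: keep evaluate() a thin, mostly pure wrapper and push the side
    effects (uptime_status sync, metrics, logging) into one named method."""

    @override
    def evaluate(self, data_packet: DataPacket[UptimePacketValue]):
        result = super().evaluate(data_packet)
        if result:
            # None is the only group key uptime uses (see the diff above).
            self._apply_side_effects(data_packet, result[None])
        return result

    def _apply_side_effects(self, data_packet, evaluation) -> None:
        # Hypothetical home for the status update, provider-restriction metric,
        # and will_create_issue / will_resolve_issue logging from this diff.
        ...
```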
||||
| self, data_packet: DataPacket[UptimePacketValue] | ||||
| ) -> dict[DetectorGroupKey, DetectorEvaluationResult]: | ||||
| result = super().evaluate(data_packet) | ||||
|
|
||||
| if not result: | ||||
| return result | ||||
|
|
||||
| # Uptime does not use stateful detector value grouping | ||||
| evaluation = result[None] | ||||
|
|
||||
| uptime_subscription = data_packet.packet.subscription | ||||
| metric_tags = data_packet.packet.metric_tags | ||||
|
|
||||
| detector_issue_creation_enabled = features.has( | ||||
| "organizations:uptime-detector-create-issues", | ||||
| self.detector.project.organization, | ||||
| ) | ||||
| issue_creation_flag_enabled = features.has( | ||||
| "organizations:uptime-create-issues", | ||||
| self.detector.project.organization, | ||||
| ) | ||||
| restricted_host_provider_ids = options.get( | ||||
| "uptime.restrict-issue-creation-by-hosting-provider-id" | ||||
| ) | ||||
| host_provider_id = uptime_subscription.host_provider_id | ||||
| host_provider_enabled = host_provider_id not in restricted_host_provider_ids | ||||
|
|
||||
| issue_creation_allowed = ( | ||||
| detector_issue_creation_enabled | ||||
| and issue_creation_flag_enabled | ||||
| and host_provider_enabled | ||||
| ) | ||||
|
Comment on lines +172 to +193
Member (Author)
This is where we check a few different feature flags and options to bail out of issue creation, the same way we do in the consumer.
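For readability, the checks described above could be consolidated into one predicate, roughly like the sketch below; the helper name is made up, while the flags and option are the ones used in this diff:

```python
from sentry import features, options
from sentry.uptime.models import UptimeSubscription
from sentry.workflow_engine.models import Detector


def issue_creation_allowed(detector: Detector, subscription: UptimeSubscription) -> bool:
    # Mirrors the gating in evaluate(): both feature flags must be on and the
    # subscription's host provider must not be in the restricted list.
    organization = detector.project.organization
    restricted_providers = options.get("uptime.restrict-issue-creation-by-hosting-provider-id")
    return (
        features.has("organizations:uptime-detector-create-issues", organization)
        and features.has("organizations:uptime-create-issues", organization)
        and subscription.host_provider_id not in restricted_providers
    )
```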
||||
|
|
||||
| # XXX(epurkhiser): We currently are duplicating the detector state onto | ||||
| # the uptime_subscription when the detector changes state. Once we stop | ||||
| # using this field we can drop this update logic. | ||||
| # | ||||
| # We ONLY do this when detector issue creation is enabled, otherwise we | ||||
| # let the legacy uptime consumer handle this. | ||||
| if detector_issue_creation_enabled: | ||||
| if evaluation.priority == DetectorPriorityLevel.OK: | ||||
| uptime_status = UptimeStatus.OK | ||||
| elif evaluation.priority != DetectorPriorityLevel.OK: | ||||
| uptime_status = UptimeStatus.FAILED | ||||
|
|
||||
| uptime_subscription.update( | ||||
| uptime_status=uptime_status, | ||||
| uptime_status_update_date=django_timezone.now(), | ||||
| ) | ||||
|
|
||||
| if not host_provider_enabled: | ||||
| metrics.incr( | ||||
| "uptime.result_processor.restricted_by_provider", | ||||
| sample_rate=1.0, | ||||
| tags={ | ||||
| "host_provider_id": host_provider_id, | ||||
| **metric_tags, | ||||
| }, | ||||
| ) | ||||
|
|
||||
| result_creates_issue = isinstance(evaluation.result, IssueOccurrence) | ||||
| result_resolves_issue = isinstance(evaluation.result, StatusChangeMessage) | ||||
|
|
||||
| if result_creates_issue: | ||||
|
Contributor
All these metrics seem like they should be provided by the platform.
Member (Author)
Yeah, I agree. But I don't want to lose the metrics we have right now.
Contributor
For the most part these metrics are provided by the platform (datadog, code): we will make a metric to denote that we're processing a detector, and whether the detector has a state change. Do we need to split out the difference between creating an issue and resolving an issue? The platform was just merging them into "a threshold was breached", but if we want separate tracking for those cases it should be fairly trivial to add.
Member (Author)
I can clean this all up later; I just wanted to make sure we were not losing any metrics as we cut over from our legacy issue creation system.
Member (Author)
It's also valuable for us to have various metric tags. Probably something worth talking about for the handler APIs.
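A sketch of what a platform-level metric could look like if the per-product counters above were generalized; the metric name and tag set here are hypothetical, not an existing workflow_engine metric:

```python
from sentry.utils import metrics


def record_detector_evaluation(detector_type: str, state_changed: bool, priority: str) -> None:
    # Hypothetical generic counter emitted once per detector evaluation, with
    # tags rich enough to derive "will create" / "will resolve" style views.
    metrics.incr(
        "workflow_engine.detector.evaluated",  # assumed name, not an existing metric
        sample_rate=1.0,
        tags={
            "detector_type": detector_type,
            "state_changed": str(state_changed).lower(),
            "priority": priority,
        },
    )
```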
||||
| metrics.incr( | ||||
| "uptime.detector.will_create_issue", | ||||
| tags=metric_tags, | ||||
| sample_rate=1.0, | ||||
| ) | ||||
| # XXX(epurkhiser): This logging includes the same extra arguments | ||||
| # as the `uptime_active_sent_occurrence` log in the consumer for | ||||
| # legacy creation | ||||
| logger.info( | ||||
| "uptime.detector.will_create_issue", | ||||
| extra={ | ||||
| "project_id": self.detector.project_id, | ||||
| "url": uptime_subscription.url, | ||||
| **data_packet.packet.check_result, | ||||
| }, | ||||
| ) | ||||
| if result_resolves_issue: | ||||
| metrics.incr( | ||||
| "uptime.detector.will_resolve_issue", | ||||
| sample_rate=1.0, | ||||
| tags=metric_tags, | ||||
| ) | ||||
| logger.info( | ||||
| "uptime.detector.will_resolve_issue", | ||||
| extra={ | ||||
| "project_id": self.detector.project_id, | ||||
| "url": uptime_subscription.url, | ||||
| **data_packet.packet.check_result, | ||||
| }, | ||||
| ) | ||||
|
|
||||
| # Returning an empty dict effectively causes the detector processor to | ||
| # bail and not produce an issue occurrence. | ||||
| if result_creates_issue and not issue_creation_allowed: | ||||
| return {} | ||||
|
|
||||
| return result | ||||
|
Comment on lines +172 to +262
Contributor
Curious if y'all have any thoughts about this evaluation bit -- is there anything for you that would make this easier to implement? Do you think it's okay to just invoke the super and handle it like this?
Member (Author)
For now I think this is fine. Once we remove a lot of the legacy issue creation code for uptime, some of this is going to go away and it will be clearer what can be factored and abstracted.
||||
|
|
||||
| @override | ||||
| def create_occurrence( | ||||
| self, | ||||
| evaluation_result: ProcessedDataConditionGroup, | ||||
| data_packet: DataPacket[UptimePacketValue], | ||||
| priority: DetectorPriorityLevel, | ||||
| ) -> tuple[DetectorOccurrence, EventData]: | ||||
| result = data_packet.packet.check_result | ||||
| uptime_subscription = data_packet.packet.subscription | ||||
|
|
||||
| occurrence = DetectorOccurrence( | ||||
| issue_title=f"Downtime detected for {uptime_subscription.url}", | ||||
| subtitle="Your monitored domain is down", | ||||
| evidence_display=build_evidence_display(result), | ||||
| type=UptimeDomainCheckFailure, | ||||
| level="error", | ||||
| culprit="", # TODO: The url? | ||||
| assignee=self.detector.owner, | ||||
| priority=priority, | ||||
| ) | ||||
| event_data = build_event_data(result, self.detector) | ||||
|
|
||||
| return (occurrence, event_data) | ||||
|
|
||||
|
|
||||
| @dataclass(frozen=True) | ||||
|
|
@@ -24,6 +298,7 @@ class UptimeDomainCheckFailure(GroupType): | |||
| enable_auto_resolve = False | ||||
| enable_escalation_detection = False | ||||
| detector_settings = DetectorSettings( | ||||
| handler=UptimeDetectorHandler, | ||||
| config_schema={ | ||||
| "$schema": "https://json-schema.org/draft/2020-12/schema", | ||||
| "description": "A representation of an uptime alert", | ||||
|
|
||||
Nit: Enabled
Will get this in a follow up