diff --git a/src/sentry/uptime/consumers/results_consumer.py b/src/sentry/uptime/consumers/results_consumer.py
index c78a94366f5906..dd973695215c59 100644
--- a/src/sentry/uptime/consumers/results_consumer.py
+++ b/src/sentry/uptime/consumers/results_consumer.py
@@ -26,6 +26,7 @@
 )
 from sentry.uptime.detectors.ranking import _get_cluster
 from sentry.uptime.detectors.result_handler import handle_onboarding_result
+from sentry.uptime.grouptype import UptimePacketValue
 from sentry.uptime.issue_platform import create_issue_platform_occurrence, resolve_uptime_issue
 from sentry.uptime.models import (
     UptimeStatus,
@@ -43,11 +44,17 @@
     send_uptime_config_deletion,
     update_remote_uptime_subscription,
 )
-from sentry.uptime.types import IncidentStatus, ProjectUptimeSubscriptionMode
+from sentry.uptime.types import (
+    DATA_SOURCE_UPTIME_SUBSCRIPTION,
+    IncidentStatus,
+    ProjectUptimeSubscriptionMode,
+)
 from sentry.utils import metrics
 from sentry.utils.arroyo_producer import SingletonProducer
 from sentry.utils.kafka_config import get_kafka_producer_cluster_options, get_topic_definition
+from sentry.workflow_engine.models.data_source import DataPacket
 from sentry.workflow_engine.models.detector import Detector
+from sentry.workflow_engine.processors.data_packet import process_data_packets

 logger = logging.getLogger(__name__)

@@ -292,6 +299,33 @@ def handle_active_result(
     result: CheckResult,
     metric_tags: dict[str, str],
 ):
+    organization = detector.project.organization
+
+    if features.has("organizations:uptime-detector-handler", organization):
+        # XXX(epurkhiser): Enabling the uptime-detector-handler will process
+        # check results via the uptime detector handler. However, the handler
+        # WILL NOT produce issue occurrences (though it does nearly everything
+        # else, including logging that it would produce one).
+        packet = UptimePacketValue(
+            check_result=result,
+            subscription=uptime_subscription,
+            metric_tags=metric_tags,
+        )
+        process_data_packets(
+            [DataPacket(source_id=str(uptime_subscription.id), packet=packet)],
+            DATA_SOURCE_UPTIME_SUBSCRIPTION,
+        )
+
+        # Bail if we're doing issue creation via detectors; we don't want to
+        # create issues using the legacy system in this case. If this flag is
+        # not enabled the detector will still run, but will not produce an
+        # issue occurrence.
+        #
+        # Once we've determined that the detector handler is producing issues
+        # the same as the legacy issue creation, we can remove this.
+ if features.has("organizations:uptime-detector-create-issues", organization): + return + uptime_status = uptime_subscription.uptime_status result_status = result["status"] diff --git a/src/sentry/uptime/grouptype.py b/src/sentry/uptime/grouptype.py index a69fbf9f926242..eda8f52454c639 100644 --- a/src/sentry/uptime/grouptype.py +++ b/src/sentry/uptime/grouptype.py @@ -1,15 +1,289 @@ from __future__ import annotations +import logging from dataclasses import dataclass +from datetime import datetime +from typing import override +from django.utils import timezone as django_timezone +from sentry_kafka_schemas.schema_types.uptime_results_v1 import CheckResult, CheckStatus + +from sentry import features, options from sentry.issues.grouptype import GroupCategory, GroupType +from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence +from sentry.issues.status_change_message import StatusChangeMessage from sentry.ratelimits.sliding_windows import Quota from sentry.types.group import PriorityLevel +from sentry.uptime.models import UptimeStatus, UptimeSubscription, get_project_subscription from sentry.uptime.types import ( GROUP_TYPE_UPTIME_DOMAIN_CHECK_FAILURE, ProjectUptimeSubscriptionMode, ) -from sentry.workflow_engine.types import DetectorSettings +from sentry.utils import metrics +from sentry.workflow_engine.handlers.detector.base import DetectorOccurrence, EventData +from sentry.workflow_engine.handlers.detector.stateful import ( + DetectorThresholds, + StatefulDetectorHandler, +) +from sentry.workflow_engine.models import DataPacket, Detector +from sentry.workflow_engine.processors.data_condition_group import ProcessedDataConditionGroup +from sentry.workflow_engine.types import ( + DetectorEvaluationResult, + DetectorGroupKey, + DetectorPriorityLevel, + DetectorSettings, +) + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class UptimePacketValue: + """ + Represents the value passed into the uptime detector + """ + + check_result: CheckResult + subscription: UptimeSubscription + metric_tags: dict[str, str] + + +def build_detector_fingerprint_component(detector: Detector) -> str: + return f"uptime-detector:{detector.id}" + + +def build_fingerprint(detector: Detector) -> list[str]: + return [build_detector_fingerprint_component(detector)] + + +def get_active_failure_threshold() -> int: + """ + When in active monitoring mode, overrides how many failures in a row we + need to see to mark the monitor as down + """ + return options.get("uptime.active-failure-threshold") + + +def get_active_recovery_threshold() -> int: + """ + When in active monitoring mode, how many successes in a row do we need to + mark it as up + """ + return options.get("uptime.active-recovery-threshold") + + +def build_evidence_display(result: CheckResult) -> list[IssueEvidence]: + evidence_display: list[IssueEvidence] = [] + + status_reason = result["status_reason"] + if status_reason: + reason_evidence = IssueEvidence( + name="Failure reason", + value=f'{status_reason["type"]} - {status_reason["description"]}', + important=True, + ) + evidence_display.extend([reason_evidence]) + + duration_evidence = IssueEvidence( + name="Duration", + value=f"{result["duration_ms"]}ms", + important=False, + ) + evidence_display.append(duration_evidence) + + request_info = result["request_info"] + if request_info: + method_evidence = IssueEvidence( + name="Method", + value=request_info["request_type"], + important=False, + ) + status_code_evidence = IssueEvidence( + name="Status Code", + 
value=str(request_info["http_status_code"]), + important=False, + ) + evidence_display.extend([method_evidence, status_code_evidence]) + + return evidence_display + + +def build_event_data(result: CheckResult, detector: Detector) -> EventData: + # Default environment when it hasn't been configured + env = detector.config.get("environment", "prod") + + # Received time is the actual time the check was performed. + received = datetime.fromtimestamp(result["actual_check_time_ms"] / 1000) + + # XXX(epurkhiser): This can be changed over to using the detector ID in the + # future once we're no longer using the ProjectUptimeSubscription.id as a tag. + project_subscription = get_project_subscription(detector) + + return { + "project_id": detector.project_id, + "environment": env, + "received": received, + "platform": "other", + "sdk": None, + "tags": { + "uptime_rule": str(project_subscription.id), + }, + "contexts": { + "trace": {"trace_id": result["trace_id"], "span_id": result.get("span_id")}, + }, + } + + +class UptimeDetectorHandler(StatefulDetectorHandler[UptimePacketValue, CheckStatus]): + @override + @property + def thresholds(self) -> DetectorThresholds: + return { + DetectorPriorityLevel.OK: get_active_recovery_threshold(), + DetectorPriorityLevel.HIGH: get_active_failure_threshold(), + } + + @override + def extract_value(self, data_packet: DataPacket[UptimePacketValue]) -> CheckStatus: + return data_packet.packet.check_result["status"] + + @override + def build_issue_fingerprint(self, group_key: DetectorGroupKey = None) -> list[str]: + # TODO(epurkhiser): We should migrate the fingerprints over to match + # what the default fingerprint is. + return build_fingerprint(self.detector) + + @override + def extract_dedupe_value(self, data_packet: DataPacket[UptimePacketValue]) -> int: + return int(data_packet.packet.check_result["scheduled_check_time_ms"]) + + @override + def evaluate( + self, data_packet: DataPacket[UptimePacketValue] + ) -> dict[DetectorGroupKey, DetectorEvaluationResult]: + result = super().evaluate(data_packet) + + if not result: + return result + + # Uptime does not use stateful detector value grouping + evaluation = result[None] + + uptime_subscription = data_packet.packet.subscription + metric_tags = data_packet.packet.metric_tags + + detector_issue_creation_enabled = features.has( + "organizations:uptime-detector-create-issues", + self.detector.project.organization, + ) + issue_creation_flag_enabled = features.has( + "organizations:uptime-create-issues", + self.detector.project.organization, + ) + restricted_host_provider_ids = options.get( + "uptime.restrict-issue-creation-by-hosting-provider-id" + ) + host_provider_id = uptime_subscription.host_provider_id + host_provider_enabled = host_provider_id not in restricted_host_provider_ids + + issue_creation_allowed = ( + detector_issue_creation_enabled + and issue_creation_flag_enabled + and host_provider_enabled + ) + + # XXX(epurkhiser): We currently are duplicating the detector state onto + # the uptime_subscription when the detector changes state. Once we stop + # using this field we can drop this update logic. + # + # We ONLY do this when detector issue creation is enabled, otherwise we + # let the legacy uptime consumer handle this. 
+        if detector_issue_creation_enabled:
+            if evaluation.priority == DetectorPriorityLevel.OK:
+                uptime_status = UptimeStatus.OK
+            else:
+                uptime_status = UptimeStatus.FAILED
+
+            uptime_subscription.update(
+                uptime_status=uptime_status,
+                uptime_status_update_date=django_timezone.now(),
+            )
+
+        if not host_provider_enabled:
+            metrics.incr(
+                "uptime.result_processor.restricted_by_provider",
+                sample_rate=1.0,
+                tags={
+                    "host_provider_id": host_provider_id,
+                    **metric_tags,
+                },
+            )
+
+        result_creates_issue = isinstance(evaluation.result, IssueOccurrence)
+        result_resolves_issue = isinstance(evaluation.result, StatusChangeMessage)
+
+        if result_creates_issue:
+            metrics.incr(
+                "uptime.detector.will_create_issue",
+                tags=metric_tags,
+                sample_rate=1.0,
+            )
+            # XXX(epurkhiser): This logging includes the same extra arguments
+            # as the `uptime_active_sent_occurrence` log in the consumer for
+            # legacy creation
+            logger.info(
+                "uptime.detector.will_create_issue",
+                extra={
+                    "project_id": self.detector.project_id,
+                    "url": uptime_subscription.url,
+                    **data_packet.packet.check_result,
+                },
+            )
+        if result_resolves_issue:
+            metrics.incr(
+                "uptime.detector.will_resolve_issue",
+                sample_rate=1.0,
+                tags=metric_tags,
+            )
+            logger.info(
+                "uptime.detector.will_resolve_issue",
+                extra={
+                    "project_id": self.detector.project_id,
+                    "url": uptime_subscription.url,
+                    **data_packet.packet.check_result,
+                },
+            )
+
+        # Returning an empty dict effectively causes the detector processor to
+        # bail and not produce an issue occurrence.
+        if result_creates_issue and not issue_creation_allowed:
+            return {}
+
+        return result
+
+    @override
+    def create_occurrence(
+        self,
+        evaluation_result: ProcessedDataConditionGroup,
+        data_packet: DataPacket[UptimePacketValue],
+        priority: DetectorPriorityLevel,
+    ) -> tuple[DetectorOccurrence, EventData]:
+        result = data_packet.packet.check_result
+        uptime_subscription = data_packet.packet.subscription
+
+        occurrence = DetectorOccurrence(
+            issue_title=f"Downtime detected for {uptime_subscription.url}",
+            subtitle="Your monitored domain is down",
+            evidence_display=build_evidence_display(result),
+            type=UptimeDomainCheckFailure,
+            level="error",
+            culprit="",  # TODO: The url?
+ assignee=self.detector.owner, + priority=priority, + ) + event_data = build_event_data(result, self.detector) + + return (occurrence, event_data) @dataclass(frozen=True) @@ -24,6 +298,7 @@ class UptimeDomainCheckFailure(GroupType): enable_auto_resolve = False enable_escalation_detection = False detector_settings = DetectorSettings( + handler=UptimeDetectorHandler, config_schema={ "$schema": "https://json-schema.org/draft/2020-12/schema", "description": "A representation of an uptime alert", diff --git a/src/sentry/uptime/issue_platform.py b/src/sentry/uptime/issue_platform.py index 38aee56d6dd332..bb002c9cd4f3c6 100644 --- a/src/sentry/uptime/issue_platform.py +++ b/src/sentry/uptime/issue_platform.py @@ -5,14 +5,23 @@ from sentry_kafka_schemas.schema_types.uptime_results_v1 import CheckResult -from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence +from sentry.issues.issue_occurrence import IssueOccurrence from sentry.issues.producer import PayloadType, produce_occurrence_to_kafka from sentry.issues.status_change_message import StatusChangeMessage from sentry.models.group import GroupStatus -from sentry.uptime.grouptype import UptimeDomainCheckFailure -from sentry.uptime.models import get_project_subscription, get_uptime_subscription +from sentry.uptime.grouptype import ( + UptimeDomainCheckFailure, + build_event_data, + build_evidence_display, + build_fingerprint, +) +from sentry.uptime.models import get_uptime_subscription from sentry.workflow_engine.models.detector import Detector +# XXX(epurkhiser): This module supports the legacy issue creation of uptime +# failures NOT using the uptime detector handler. In the future this module +# will be removed. + def create_issue_platform_occurrence(result: CheckResult, detector: Detector): occurrence = build_occurrence_from_result(result, detector) @@ -24,47 +33,8 @@ def create_issue_platform_occurrence(result: CheckResult, detector: Detector): ) -def build_detector_fingerprint_component(detector: Detector) -> str: - return f"uptime-detector:{detector.id}" - - -def build_fingerprint(detector: Detector) -> list[str]: - return [build_detector_fingerprint_component(detector)] - - def build_occurrence_from_result(result: CheckResult, detector: Detector) -> IssueOccurrence: uptime_subscription = get_uptime_subscription(detector) - status_reason = result["status_reason"] - assert status_reason - failure_reason = f'{status_reason["type"]} - {status_reason["description"]}' - evidence_display = [ - IssueEvidence( - name="Failure reason", - value=failure_reason, - important=True, - ), - IssueEvidence( - name="Duration", - value=f"{result["duration_ms"]}ms", - important=False, - ), - ] - request_info = result["request_info"] - if request_info: - evidence_display.append( - IssueEvidence( - name="Method", - value=request_info["request_type"], - important=False, - ) - ) - evidence_display.append( - IssueEvidence( - name="Status Code", - value=str(request_info["http_status_code"]), - important=False, - ), - ) return IssueOccurrence( id=uuid.uuid4().hex, @@ -75,7 +45,7 @@ def build_occurrence_from_result(result: CheckResult, detector: Detector) -> Iss type=UptimeDomainCheckFailure, issue_title=f"Downtime detected for {uptime_subscription.url}", subtitle="Your monitored domain is down", - evidence_display=evidence_display, + evidence_display=build_evidence_display(result), evidence_data={}, culprit="", # TODO: The url? 
detection_time=datetime.now(timezone.utc), @@ -89,27 +59,13 @@ def build_event_data_for_occurrence( detector: Detector, occurrence: IssueOccurrence, ): - # Default environment when it hasn't been configured - env = detector.config.get("environment", "prod") - - # XXX(epurkhiser): This can be changed over to using the detector ID in the - # future once we're no longer using the ProjectUptimeSubscription.id as a tag. - project_subscription = get_project_subscription(detector) + common_event_data = build_event_data(result, detector) return { - "environment": env, + **common_event_data, "event_id": occurrence.event_id, "fingerprint": occurrence.fingerprint, - "platform": "other", - "project_id": occurrence.project_id, - # We set this to the time that the check was performed - "received": datetime.fromtimestamp(result["actual_check_time_ms"] / 1000), - "sdk": None, - "tags": { - "uptime_rule": str(project_subscription.id), - }, "timestamp": occurrence.detection_time.isoformat(), - "contexts": {"trace": {"trace_id": result["trace_id"], "span_id": result.get("span_id")}}, } diff --git a/tests/sentry/uptime/consumers/test_results_consumer.py b/tests/sentry/uptime/consumers/test_results_consumer.py index 243bf182e372b4..3a0535a88de351 100644 --- a/tests/sentry/uptime/consumers/test_results_consumer.py +++ b/tests/sentry/uptime/consumers/test_results_consumer.py @@ -38,8 +38,7 @@ build_onboarding_failure_key, ) from sentry.uptime.detectors.tasks import is_failed_url -from sentry.uptime.grouptype import UptimeDomainCheckFailure -from sentry.uptime.issue_platform import build_detector_fingerprint_component +from sentry.uptime.grouptype import UptimeDomainCheckFailure, build_detector_fingerprint_component from sentry.uptime.models import ( ProjectUptimeSubscription, UptimeStatus, @@ -95,13 +94,18 @@ def send_result( consumer.submit(message) def test(self): + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + ] result = self.create_uptime_result( self.subscription.subscription_id, scheduled_check_time=datetime.now() - timedelta(minutes=5), ) with ( mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics, - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), mock.patch( "sentry.uptime.consumers.results_consumer.get_active_failure_threshold", return_value=2, @@ -164,7 +168,110 @@ def test(self): self.project_subscription.refresh_from_db() assert self.project_subscription.uptime_subscription.uptime_status == UptimeStatus.FAILED + def test_detector_handler(self): + """ + Simple test that the detector handler works as expected end-to-end. + """ + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + "organizations:uptime-detector-create-issues", + ] + + fingerprint = build_detector_fingerprint_component(self.detector).encode("utf-8") + hashed_fingerprint = md5(fingerprint).hexdigest() + + with ( + self.feature(features), + mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics, + mock.patch("sentry.uptime.grouptype.get_active_failure_threshold", return_value=2), + # Only needed to make sure we don't inadvertently also create an + # issue using the legacy issue creation. 
+            mock.patch(
+                "sentry.uptime.consumers.results_consumer.get_active_failure_threshold",
+                return_value=2,
+            ),
+        ):
+            self.send_result(
+                self.create_uptime_result(
+                    self.subscription.subscription_id,
+                    scheduled_check_time=datetime.now() - timedelta(minutes=5),
+                )
+            )
+            assert not Group.objects.filter(grouphash__hash=hashed_fingerprint).exists()
+            self.send_result(
+                self.create_uptime_result(
+                    self.subscription.subscription_id,
+                    scheduled_check_time=datetime.now() - timedelta(minutes=4),
+                )
+            )
+            # Issue is created
+            assert Group.objects.filter(grouphash__hash=hashed_fingerprint).exists()
+
+            # Be sure we did NOT create this issue using the legacy system
+            legacy_sent_occurrence_calls = [
+                c
+                for c in metrics.incr.mock_calls
+                if c[1][0] == "uptime.result_processor.active.sent_occurrence"
+            ]
+            assert len(legacy_sent_occurrence_calls) == 0
+
+            group = Group.objects.get(grouphash__hash=hashed_fingerprint)
+            assert group.issue_type == UptimeDomainCheckFailure
+            assignee = group.get_assignee()
+            assert assignee and (assignee.id == self.user.id)
+            self.project_subscription.refresh_from_db()
+            assert self.project_subscription.uptime_subscription.uptime_status == UptimeStatus.FAILED
+
+        # Issue is resolved
+        with (
+            self.feature(features),
+            mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics,
+            mock.patch("sentry.uptime.grouptype.get_active_recovery_threshold", return_value=2),
+            # Only needed to make sure we don't inadvertently also attempt to
+            # resolve the issue using the legacy system.
+            mock.patch(
+                "sentry.uptime.consumers.results_consumer.get_active_recovery_threshold",
+                return_value=2,
+            ),
+        ):
+            self.send_result(
+                self.create_uptime_result(
+                    self.subscription.subscription_id,
+                    status=CHECKSTATUS_SUCCESS,
+                    scheduled_check_time=datetime.now() - timedelta(minutes=3),
+                )
+            )
+            assert not Group.objects.filter(
+                grouphash__hash=hashed_fingerprint, status=GroupStatus.RESOLVED
+            ).exists()
+            self.send_result(
+                self.create_uptime_result(
+                    self.subscription.subscription_id,
+                    status=CHECKSTATUS_SUCCESS,
+                    scheduled_check_time=datetime.now() - timedelta(minutes=2),
+                )
+            )
+            # Issue is resolved
+            assert Group.objects.filter(
+                grouphash__hash=hashed_fingerprint, status=GroupStatus.RESOLVED
+            ).exists()
+
+            # Be sure we did NOT resolve this issue using the legacy system
+            legacy_resolve_calls = [
+                c
+                for c in metrics.incr.mock_calls
+                if c[1][0] == "uptime.result_processor.active.resolved"
+            ]
+            assert len(legacy_resolve_calls) == 0
+
     def test_does_nothing_when_missing_project_subscription(self):
+        features = [
+            "organizations:uptime",
+            "organizations:uptime-create-issues",
+            "organizations:uptime-detector-handler",
+        ]
         self.detector.delete()

         result = self.create_uptime_result(
@@ -172,7 +279,7 @@
             scheduled_check_time=datetime.now() - timedelta(minutes=5),
         )
         with (
-            self.feature(["organizations:uptime", "organizations:uptime-create-issues"]),
+            self.feature(features),
             mock.patch("sentry.remote_subscriptions.consumers.result_consumer.logger") as logger,
             mock.patch(
                 "sentry.uptime.consumers.results_consumer.remove_uptime_subscription_if_unused"
@@ -189,13 +296,18 @@ def test_restricted_host_provider_id(self):
         has been restricted using the
         `restrict-issue-creation-by-hosting-provider-id` option.
""" + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + ] result = self.create_uptime_result( self.subscription.subscription_id, scheduled_check_time=datetime.now() - timedelta(minutes=5), ) with ( mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics, - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), mock.patch( "sentry.uptime.consumers.results_consumer.get_active_failure_threshold", return_value=1, @@ -230,9 +342,14 @@ def test_restricted_host_provider_id(self): assert self.project_subscription.uptime_subscription.uptime_status == UptimeStatus.FAILED def test_reset_fail_count(self): + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + ] with ( mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics, - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), ): self.send_result( self.create_uptime_result( @@ -360,9 +477,14 @@ def test_no_create_issues_feature(self): assert self.project_subscription.uptime_subscription.uptime_status == UptimeStatus.FAILED def test_resolve(self): + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + ] with ( mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics, - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), mock.patch( "sentry.uptime.consumers.results_consumer.get_active_failure_threshold", return_value=2, @@ -451,11 +573,16 @@ def test_resolve(self): assert self.project_subscription.uptime_subscription.uptime_status == UptimeStatus.OK def test_no_subscription(self): + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + ] subscription_id = uuid.uuid4().hex result = self.create_uptime_result(subscription_id, uptime_region="default") with ( mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics, - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), ): self.send_result(result) metrics.incr.assert_has_calls( @@ -496,6 +623,11 @@ def test_organization_feature_disabled(self): ) def test_skip_already_processed(self): + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + ] result = self.create_uptime_result(self.subscription.subscription_id) _get_cluster().set( build_last_update_key(self.detector), @@ -503,7 +635,7 @@ def test_skip_already_processed(self): ) with ( mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics, - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), ): self.send_result(result) metrics.incr.assert_has_calls( @@ -538,6 +670,11 @@ def test_skip_already_processed(self): Group.objects.get(grouphash__hash=hashed_fingerprint) def test_skip_shadow_region(self): + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + ] region_name = "shadow" self.create_uptime_subscription_region( self.subscription, region_name, UptimeSubscriptionRegion.RegionMode.SHADOW @@ -549,7 +686,7 @@ def test_skip_shadow_region(self): ) with ( 
mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics, - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), ): self.send_result(result) metrics.incr.assert_has_calls( @@ -571,13 +708,18 @@ def test_skip_shadow_region(self): Group.objects.get(grouphash__hash=hashed_fingerprint) def test_missed(self): + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + ] result = self.create_uptime_result( self.subscription.subscription_id, status=CHECKSTATUS_MISSED_WINDOW ) with ( mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics, mock.patch("sentry.uptime.consumers.results_consumer.logger") as logger, - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), ): self.send_result(result) metrics.incr.assert_called_once_with( @@ -601,6 +743,11 @@ def test_missed(self): Group.objects.get(grouphash__hash=hashed_fingerprint) def test_onboarding_failure(self): + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + ] self.project_subscription.update( mode=ProjectUptimeSubscriptionMode.AUTO_DETECTED_ONBOARDING ) @@ -621,7 +768,7 @@ def test_onboarding_failure(self): assert redis.get(key) is None with ( mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics, - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), ): self.send_result(result) metrics.incr.assert_has_calls( @@ -659,7 +806,7 @@ def test_onboarding_failure(self): "sentry.uptime.detectors.result_handler.ONBOARDING_FAILURE_THRESHOLD", new=2 ), self.tasks(), - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), ): remove_call_vals = [] @@ -715,6 +862,11 @@ def capture_remove_seat(data_category, seat_object): self.project_subscription.refresh_from_db() def test_onboarding_success_ongoing(self): + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + ] self.project_subscription.update( mode=ProjectUptimeSubscriptionMode.AUTO_DETECTED_ONBOARDING, date_added=datetime.now(timezone.utc) - timedelta(minutes=5), @@ -737,7 +889,7 @@ def test_onboarding_success_ongoing(self): assert redis.get(key) is None with ( mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics, - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), ): self.send_result(result) metrics.incr.assert_has_calls( @@ -763,6 +915,11 @@ def test_onboarding_success_ongoing(self): Group.objects.get(grouphash__hash=hashed_fingerprint) def test_onboarding_success_graduate(self): + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + ] self.project_subscription.update( mode=ProjectUptimeSubscriptionMode.AUTO_DETECTED_ONBOARDING, date_added=datetime.now(timezone.utc) @@ -790,7 +947,7 @@ def test_onboarding_success_graduate(self): mock.patch("sentry.uptime.consumers.results_consumer.metrics") as consumer_metrics, mock.patch("sentry.uptime.detectors.result_handler.metrics") as onboarding_metrics, self.tasks(), - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), ): self.send_result(result) consumer_metrics.incr.assert_has_calls( @@ -933,6 +1090,11 @@ 
def test_parallel_grouping(self, mock_process_group) -> None: assert group_2 == [result_3] def test_provider_stats(self): + features = [ + "organizations:uptime", + "organizations:uptime-create-issues", + "organizations:uptime-detector-handler", + ] subscription = self.create_uptime_subscription( subscription_id=uuid.uuid4().hex, host_provider_name="test_provider", @@ -953,7 +1115,7 @@ def test_provider_stats(self): with ( mock.patch("sentry.uptime.consumers.results_consumer.metrics") as metrics, - self.feature(["organizations:uptime", "organizations:uptime-create-issues"]), + self.feature(features), mock.patch( "sentry.uptime.consumers.results_consumer.get_active_failure_threshold", return_value=2, diff --git a/tests/sentry/uptime/subscriptions/test_subscriptions.py b/tests/sentry/uptime/subscriptions/test_subscriptions.py index 7323a571d351a4..3e8ae2dace55be 100644 --- a/tests/sentry/uptime/subscriptions/test_subscriptions.py +++ b/tests/sentry/uptime/subscriptions/test_subscriptions.py @@ -15,11 +15,8 @@ from sentry.testutils.helpers import override_options from sentry.testutils.skips import requires_kafka from sentry.types.actor import Actor -from sentry.uptime.grouptype import UptimeDomainCheckFailure -from sentry.uptime.issue_platform import ( - build_detector_fingerprint_component, - create_issue_platform_occurrence, -) +from sentry.uptime.grouptype import UptimeDomainCheckFailure, build_detector_fingerprint_component +from sentry.uptime.issue_platform import create_issue_platform_occurrence from sentry.uptime.models import ( ProjectUptimeSubscription, UptimeStatus, diff --git a/tests/sentry/uptime/test_grouptype.py b/tests/sentry/uptime/test_grouptype.py index 47e247d8983354..aec806ba48ae99 100644 --- a/tests/sentry/uptime/test_grouptype.py +++ b/tests/sentry/uptime/test_grouptype.py @@ -1,9 +1,221 @@ +from datetime import datetime, timedelta +from unittest import mock + import pytest from jsonschema import ValidationError +from sentry_kafka_schemas.schema_types.uptime_results_v1 import CheckResult -from sentry.testutils.cases import TestCase -from sentry.uptime.grouptype import UptimeDomainCheckFailure +from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence +from sentry.testutils.cases import TestCase, UptimeTestCase +from sentry.testutils.helpers.datetime import freeze_time +from sentry.uptime.grouptype import ( + UptimeDetectorHandler, + UptimeDomainCheckFailure, + UptimePacketValue, + build_detector_fingerprint_component, + build_event_data, + build_evidence_display, + build_fingerprint, +) +from sentry.uptime.models import UptimeStatus, UptimeSubscription, get_detector from sentry.uptime.types import ProjectUptimeSubscriptionMode +from sentry.workflow_engine.models.data_source import DataPacket +from sentry.workflow_engine.models.detector import Detector +from sentry.workflow_engine.types import DetectorPriorityLevel + + +class BuildDetectorFingerprintComponentTest(UptimeTestCase): + def test_build_detector_fingerprint_component(self): + project_subscription = self.create_project_uptime_subscription() + detector = get_detector(project_subscription.uptime_subscription) + assert detector + + fingerprint_component = build_detector_fingerprint_component(detector) + assert fingerprint_component == f"uptime-detector:{detector.id}" + + +class BuildFingerprintForProjectSubscriptionTest(UptimeTestCase): + def test_build_fingerprint_for_project_subscription(self): + project_subscription = self.create_project_uptime_subscription() + detector = 
get_detector(project_subscription.uptime_subscription) + assert detector + + fingerprint = build_fingerprint(detector) + expected_fingerprint = [build_detector_fingerprint_component(detector)] + assert fingerprint == expected_fingerprint + + +class BuildEvidenceDisplayTest(UptimeTestCase): + def test_build_evidence_display(self): + result = self.create_uptime_result() + assert build_evidence_display(result) == [ + IssueEvidence(name="Failure reason", value="timeout - it timed out", important=True), + IssueEvidence(name="Duration", value="100ms", important=False), + IssueEvidence(name="Method", value="HEAD", important=False), + IssueEvidence(name="Status Code", value="500", important=False), + ] + + +@freeze_time() +class BuildEventDataTest(UptimeTestCase): + def test_build_event_data(self): + result = self.create_uptime_result() + project_subscription = self.create_project_uptime_subscription() + detector = get_detector(project_subscription.uptime_subscription) + assert detector + + assert build_event_data(result, detector) == { + "environment": "development", + "platform": "other", + "project_id": detector.project_id, + "received": datetime.now().replace(microsecond=0), + "sdk": None, + "tags": {"uptime_rule": str(project_subscription.id)}, + "contexts": { + "trace": {"trace_id": result["trace_id"], "span_id": result.get("span_id")} + }, + } + + +class TestUptimeHandler(UptimeTestCase): + def handle_result(self, detector: Detector, sub: UptimeSubscription, check_result: CheckResult): + handler = UptimeDetectorHandler(detector) + + value = UptimePacketValue( + check_result=check_result, + subscription=sub, + metric_tags={}, + ) + data_packet = DataPacket[UptimePacketValue]( + source_id=str(sub.id), + packet=value, + ) + evaluation = handler.evaluate(data_packet) + + if None not in evaluation: + return None + + return evaluation[None] + + def test_simple_evaluate(self): + project_subscription = self.create_project_uptime_subscription() + uptime_subscription = project_subscription.uptime_subscription + detector = get_detector(project_subscription.uptime_subscription) + assert detector + + assert uptime_subscription.uptime_status == UptimeStatus.OK + + now = datetime.now() + + features = [ + "organizations:uptime-create-issues", + "organizations:uptime-detector-create-issues", + ] + + with ( + self.feature(features), + mock.patch("sentry.uptime.grouptype.get_active_failure_threshold", return_value=2), + ): + evaluation = self.handle_result( + detector, + uptime_subscription, + self.create_uptime_result(scheduled_check_time=now - timedelta(minutes=5)), + ) + assert evaluation is None + + # Second evaluation produces a DetectorEvaluationResult + evaluation = self.handle_result( + detector, + uptime_subscription, + self.create_uptime_result(scheduled_check_time=now - timedelta(minutes=4)), + ) + assert evaluation is not None + assert evaluation.priority == DetectorPriorityLevel.HIGH + assert isinstance(evaluation.result, IssueOccurrence) + assert ( + evaluation.result.issue_title == f"Downtime detected for {uptime_subscription.url}" + ) + + # Fingerprint includes the existing uptime fingerprints, without + # this we would create new issues instead of reusing the existing + # issues. + fingerprint = set(build_fingerprint(detector)) + assert fingerprint & set(evaluation.result.fingerprint) == fingerprint + + # Update the uptime_status. 
In the future this will be removed and + # we'll just use the DetectorState models to represent this + assert uptime_subscription.uptime_status == UptimeStatus.FAILED + + def test_issue_creation_disabled(self): + project_subscription = self.create_project_uptime_subscription() + uptime_subscription = project_subscription.uptime_subscription + detector = get_detector(project_subscription.uptime_subscription) + assert detector + + assert uptime_subscription.uptime_status == UptimeStatus.OK + + with ( + # Only uptime-create-issues enabled, will not create issues because + # uptime-detector-create-issues is not enabled + self.feature(["organizations:uptime-create-issues"]), + mock.patch("sentry.uptime.grouptype.get_active_failure_threshold", return_value=1), + mock.patch("sentry.uptime.grouptype.logger") as logger, + ): + check_result = self.create_uptime_result() + evaluation = self.handle_result(detector, uptime_subscription, check_result) + assert evaluation is None + + # Produces a log that we can use to validate that it _would_ create + # an issue. + logger.info.assert_called_with( + "uptime.detector.will_create_issue", + extra={ + "project_id": detector.project_id, + "url": uptime_subscription.url, + **check_result, + }, + ) + + # the uptime_status does NOT change even though we did a full + # evaluation. This should only be updated when detectors are also + # creating issues + assert uptime_subscription.uptime_status == UptimeStatus.OK + + with ( + # Only uptime-detector-create-issues enabled, will not create + # issues because uptime-create-issues is not enabled + self.feature(["organizations:uptime-detector-create-issues"]), + mock.patch("sentry.uptime.grouptype.get_active_failure_threshold", return_value=1), + ): + evaluation = self.handle_result( + detector, + uptime_subscription, + self.create_uptime_result(), + ) + assert evaluation is None + + features = [ + "organizations:uptime-create-issues", + "organizations:uptime-detector-create-issues", + ] + options = { + "uptime.restrict-issue-creation-by-hosting-provider-id": [ + uptime_subscription.host_provider_id + ] + } + + with ( + # All features enabled, but the host provider is disabled + self.feature(features), + self.options(options), + mock.patch("sentry.uptime.grouptype.get_active_failure_threshold", return_value=1), + ): + evaluation = self.handle_result( + detector, + uptime_subscription, + self.create_uptime_result(), + ) + assert evaluation is None class TestUptimeDomainCheckFailureDetectorConfig(TestCase): diff --git a/tests/sentry/uptime/test_issue_platform.py b/tests/sentry/uptime/test_issue_platform.py index 1cafa1fd0b58d2..1d52add7513d2a 100644 --- a/tests/sentry/uptime/test_issue_platform.py +++ b/tests/sentry/uptime/test_issue_platform.py @@ -9,11 +9,9 @@ from sentry.models.group import Group, GroupStatus from sentry.testutils.cases import UptimeTestCase from sentry.testutils.helpers.datetime import freeze_time -from sentry.uptime.grouptype import UptimeDomainCheckFailure +from sentry.uptime.grouptype import UptimeDomainCheckFailure, build_detector_fingerprint_component from sentry.uptime.issue_platform import ( - build_detector_fingerprint_component, build_event_data_for_occurrence, - build_fingerprint, build_occurrence_from_result, create_issue_platform_occurrence, resolve_uptime_issue, @@ -21,27 +19,6 @@ from sentry.uptime.models import get_detector -class BuildDetectorFingerprintComponentTest(UptimeTestCase): - def test_build_detector_fingerprint_component(self): - project_subscription = 
self.create_project_uptime_subscription() - detector = get_detector(project_subscription.uptime_subscription) - assert detector - - fingerprint_component = build_detector_fingerprint_component(detector) - assert fingerprint_component == f"uptime-detector:{detector.id}" - - -class BuildFingerprintForProjectSubscriptionTest(UptimeTestCase): - def test_build_fingerprint_for_project_subscription(self): - project_subscription = self.create_project_uptime_subscription() - detector = get_detector(project_subscription.uptime_subscription) - assert detector - - fingerprint = build_fingerprint(detector) - expected_fingerprint = [build_detector_fingerprint_component(detector)] - assert fingerprint == expected_fingerprint - - @freeze_time() class CreateIssuePlatformOccurrenceTest(UptimeTestCase): @patch("sentry.uptime.issue_platform.produce_occurrence_to_kafka") @@ -135,7 +112,7 @@ def test(self): build_detector_fingerprint_component(detector), ], "platform": "other", - "project_id": 1, + "project_id": detector.project_id, "received": datetime.datetime.now().replace(microsecond=0), "sdk": None, "tags": {"uptime_rule": str(project_subscription.id)},
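
Aside (not part of the patch): a minimal sketch of how a check result reaches the new detector handler, mirroring the handle_active_result wiring added above. The helper name submit_to_detector and the assumption that the caller already holds the uptime_subscription, CheckResult, and metric_tags are illustrative only.

from sentry.uptime.grouptype import UptimePacketValue
from sentry.uptime.types import DATA_SOURCE_UPTIME_SUBSCRIPTION
from sentry.workflow_engine.models.data_source import DataPacket
from sentry.workflow_engine.processors.data_packet import process_data_packets


def submit_to_detector(uptime_subscription, result, metric_tags):
    # Hypothetical helper: wrap the raw CheckResult in the value type the
    # uptime detector handler consumes, exactly as handle_active_result does.
    packet = UptimePacketValue(
        check_result=result,
        subscription=uptime_subscription,
        metric_tags=metric_tags,
    )
    # The workflow engine routes the packet to the Detector associated with
    # this uptime subscription's data source, which runs
    # UptimeDetectorHandler.evaluate and, when thresholds and feature flags
    # allow, create_occurrence.
    process_data_packets(
        [DataPacket(source_id=str(uptime_subscription.id), packet=packet)],
        DATA_SOURCE_UPTIME_SUBSCRIPTION,
    )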