36 changes: 35 additions & 1 deletion src/sentry/uptime/consumers/results_consumer.py
@@ -26,6 +26,7 @@
)
from sentry.uptime.detectors.ranking import _get_cluster
from sentry.uptime.detectors.result_handler import handle_onboarding_result
from sentry.uptime.grouptype import UptimePacketValue
from sentry.uptime.issue_platform import create_issue_platform_occurrence, resolve_uptime_issue
from sentry.uptime.models import (
UptimeStatus,
@@ -43,11 +44,17 @@
send_uptime_config_deletion,
update_remote_uptime_subscription,
)
from sentry.uptime.types import IncidentStatus, ProjectUptimeSubscriptionMode
from sentry.uptime.types import (
DATA_SOURCE_UPTIME_SUBSCRIPTION,
IncidentStatus,
ProjectUptimeSubscriptionMode,
)
from sentry.utils import metrics
from sentry.utils.arroyo_producer import SingletonProducer
from sentry.utils.kafka_config import get_kafka_producer_cluster_options, get_topic_definition
from sentry.workflow_engine.models.data_source import DataPacket
from sentry.workflow_engine.models.detector import Detector
from sentry.workflow_engine.processors.data_packet import process_data_packets

logger = logging.getLogger(__name__)

@@ -292,6 +299,33 @@ def handle_active_result(
result: CheckResult,
metric_tags: dict[str, str],
):
organization = detector.project.organization

if features.has("organizations:uptime-detector-handler", organization):
# XXX(epurkhiser): Enabling the uptime-detector-handler will process
# check results via the uptime detector handler. However the handler
# WILL NOT produce issue occurrences (however it will do nearly
# everything else, including logging that it will produce)
packet = UptimePacketValue(
check_result=result,
subscription=uptime_subscription,
metric_tags=metric_tags,
)
process_data_packets(
[DataPacket(source_id=str(uptime_subscription.id), packet=packet)],
DATA_SOURCE_UPTIME_SUBSCRIPTION,
)

# Bail if we're doing issue creation via detectors, we don't want to
# create issues using the legacy system in this case. If this flag is
# not enabkled the detector will still run, but will not produce an
Member: Nit: Enabled
Member Author: Will get this in a follow up

# issue occurrence.
#
# Once we've determined that the detector handler is producing issues
# the same as the legacy issue creation, we can remove this.
if features.has("organizations:uptime-detector-create-issues", organization):
Member Author: This bit is important as it makes sure we don't create multiple occurrences at the same time.

return

uptime_status = uptime_subscription.uptime_status
result_status = result["status"]

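Taken together, the two flags above form a small matrix. A hedged summary of how the combinations appear to behave (the suppression in the second row comes from UptimeDetectorHandler.evaluate in grouptype.py below):

# handler=off, create-issues=off -> legacy issue creation only (current default)
# handler=on,  create-issues=off -> both paths run; the detector handler does
#                                   nearly everything but suppresses its
#                                   occurrence, so the legacy path still
#                                   creates the issue (dual-write test mode)
# handler=on,  create-issues=on  -> the detector handler owns issue creation;
#                                   the legacy path bails at the early return
# handler=off, create-issues=on  -> presumably a misconfiguration: the legacy
#                                   path bails and the handler never runs
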
277 changes: 276 additions & 1 deletion src/sentry/uptime/grouptype.py
Member Author: For metrics we also have the handler inside the grouptype module. It's a bit difficult to keep these separated because there's an interdependency between the two.

@@ -1,15 +1,289 @@
from __future__ import annotations

import logging
from dataclasses import dataclass
from datetime import datetime
from typing import override

from django.utils import timezone as django_timezone
from sentry_kafka_schemas.schema_types.uptime_results_v1 import CheckResult, CheckStatus

from sentry import features, options
from sentry.issues.grouptype import GroupCategory, GroupType
from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
from sentry.issues.status_change_message import StatusChangeMessage
from sentry.ratelimits.sliding_windows import Quota
from sentry.types.group import PriorityLevel
from sentry.uptime.models import UptimeStatus, UptimeSubscription, get_project_subscription
from sentry.uptime.types import (
GROUP_TYPE_UPTIME_DOMAIN_CHECK_FAILURE,
ProjectUptimeSubscriptionMode,
)
from sentry.workflow_engine.types import DetectorSettings
from sentry.utils import metrics
from sentry.workflow_engine.handlers.detector.base import DetectorOccurrence, EventData
from sentry.workflow_engine.handlers.detector.stateful import (
DetectorThresholds,
StatefulDetectorHandler,
)
from sentry.workflow_engine.models import DataPacket, Detector
from sentry.workflow_engine.processors.data_condition_group import ProcessedDataConditionGroup
from sentry.workflow_engine.types import (
DetectorEvaluationResult,
DetectorGroupKey,
DetectorPriorityLevel,
DetectorSettings,
)

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class UptimePacketValue:
"""
Represents the value passed into the uptime detector
"""

check_result: CheckResult
subscription: UptimeSubscription
metric_tags: dict[str, str]


def build_detector_fingerprint_component(detector: Detector) -> str:
return f"uptime-detector:{detector.id}"
Comment on lines +51 to +52
Member: I think we need to start sending this new fingerprint type along with issue occurrences now, so that we can backfill
Member Author: We have been, actually; we use the same fingerprint here:
fingerprint=build_fingerprint(detector),



def build_fingerprint(detector: Detector) -> list[str]:
return [build_detector_fingerprint_component(detector)]
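
A minimal sketch of the fingerprint shape these helpers produce, assuming a hypothetical detector with id=123 (unsaved model instance, purely for illustration):

detector = Detector(id=123)  # hypothetical, illustration only
build_detector_fingerprint_component(detector)  # -> "uptime-detector:123"
build_fingerprint(detector)                     # -> ["uptime-detector:123"]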


def get_active_failure_threshold() -> int:
"""
When in active monitoring mode, overrides how many failures in a row we
need to see to mark the monitor as down
"""
return options.get("uptime.active-failure-threshold")


def get_active_recovery_threshold() -> int:
"""
When in active monitoring mode, how many successes in a row do we need to
mark it as up
"""
return options.get("uptime.active-recovery-threshold")
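
These two option-backed getters feed the handler's DetectorThresholds below. A sketch of the mapping, assuming option values of 3 and 1:

# thresholds -> {DetectorPriorityLevel.OK: 1, DetectorPriorityLevel.HIGH: 3}
# i.e. three failed checks in a row flip the detector to HIGH (down), and a
# single successful check flips it back to OK (assumed option values).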


def build_evidence_display(result: CheckResult) -> list[IssueEvidence]:
Member Author: Shared now with the issue_platform module (which will be removed later)

evidence_display: list[IssueEvidence] = []

status_reason = result["status_reason"]
if status_reason:
reason_evidence = IssueEvidence(
name="Failure reason",
value=f'{status_reason["type"]} - {status_reason["description"]}',
important=True,
)
evidence_display.append(reason_evidence)

duration_evidence = IssueEvidence(
name="Duration",
value=f'{result["duration_ms"]}ms',
important=False,
)
evidence_display.append(duration_evidence)

request_info = result["request_info"]
if request_info:
method_evidence = IssueEvidence(
name="Method",
value=request_info["request_type"],
important=False,
)
status_code_evidence = IssueEvidence(
name="Status Code",
value=str(request_info["http_status_code"]),
important=False,
)
evidence_display.extend([method_evidence, status_code_evidence])

return evidence_display
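
For a failed GET that returned a 500 after 1500ms (hypothetical values), the list built above would look roughly like:

[
    IssueEvidence(name="Failure reason", value="failure - connection timed out", important=True),
    IssueEvidence(name="Duration", value="1500ms", important=False),
    IssueEvidence(name="Method", value="GET", important=False),
    IssueEvidence(name="Status Code", value="500", important=False),
]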


def build_event_data(result: CheckResult, detector: Detector) -> EventData:
Member Author: Also shared now with the issue platform

# Default environment when it hasn't been configured
env = detector.config.get("environment", "prod")
Contributor: Is this normal behavior? Does every detector need this? Should we at least make the default a constant?
Member Author: This is specific to the uptime detector. Unfortunately we hardcode default environment names in a number of places in the sentry backend, so this is par for the course right now.


# Received time is the actual time the check was performed.
received = datetime.fromtimestamp(result["actual_check_time_ms"] / 1000)

# XXX(epurkhiser): This can be changed over to using the detector ID in the
# future once we're no longer using the ProjectUptimeSubscription.id as a tag.
project_subscription = get_project_subscription(detector)

return {
"project_id": detector.project_id,
"environment": env,
"received": received,
"platform": "other",
"sdk": None,
"tags": {
"uptime_rule": str(project_subscription.id),
},
"contexts": {
"trace": {"trace_id": result["trace_id"], "span_id": result.get("span_id")},
},
}
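
For illustration, with hypothetical values (project 42, a project subscription with id 7), the returned EventData would look roughly like:

{
    "project_id": 42,
    "environment": "prod",  # default when the detector has none configured
    "received": datetime(2025, 5, 30, 12, 0),  # local-time conversion of actual_check_time_ms
    "platform": "other",
    "sdk": None,
    "tags": {"uptime_rule": "7"},  # still the ProjectUptimeSubscription id, per the XXX above
    "contexts": {"trace": {"trace_id": "...", "span_id": "..."}},
}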


class UptimeDetectorHandler(StatefulDetectorHandler[UptimePacketValue, CheckStatus]):
@override
@property
def thresholds(self) -> DetectorThresholds:
return {
DetectorPriorityLevel.OK: get_active_recovery_threshold(),
DetectorPriorityLevel.HIGH: get_active_failure_threshold(),
}

@override
def extract_value(self, data_packet: DataPacket[UptimePacketValue]) -> CheckStatus:
return data_packet.packet.check_result["status"]

@override
def build_issue_fingerprint(self, group_key: DetectorGroupKey = None) -> list[str]:
# TODO(epurkhiser): We should migrate the fingerprints over to match
# what the default fingerprint is.
return build_fingerprint(self.detector)
Member Author: TODO: This isn't the only fingerprint, actually; the stateful detector also adds its own "default" fingerprint, which ends up being just the detector ID. This means these issues could potentially collide with our previous issues that were using the project subscription ID as the fingerprint.

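A hedged illustration of the collision risk described in the TODO above, assuming detector id 123:

# What the handler could end up sending (the bare "123" is the assumed
# default component added by the stateful detector):
fingerprints = ["uptime-detector:123", "123"]
# A legacy occurrence fingerprinted by a ProjectUptimeSubscription with
# id 123 would also hash to "123", hence the potential collision.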

@override
def extract_dedupe_value(self, data_packet: DataPacket[UptimePacketValue]) -> int:
return int(data_packet.packet.check_result["scheduled_check_time_ms"])

@override
def evaluate(
Contributor: High level, I feel we should aim for this evaluate function to be pure. All the side effects and extra conditionals in here are a design smell. I'd encourage us to generalize these:

• metrics should be emitted by the framework layer
• feature flags to test things and turn issue creation on/off seem like something we'll want for most detectors, and the framework should provide this too

Member Author: Yes, I agree. Parts of this have to do with the fact that we're migrating away from our existing issue creation system for uptime. Once we've cleaned up more of that, this should simplify and we can tidy things up further.

self, data_packet: DataPacket[UptimePacketValue]
) -> dict[DetectorGroupKey, DetectorEvaluationResult]:
result = super().evaluate(data_packet)

if not result:
return result

# Uptime does not use stateful detector value grouping
evaluation = result[None]

uptime_subscription = data_packet.packet.subscription
metric_tags = data_packet.packet.metric_tags

detector_issue_creation_enabled = features.has(
"organizations:uptime-detector-create-issues",
self.detector.project.organization,
)
issue_creation_flag_enabled = features.has(
"organizations:uptime-create-issues",
self.detector.project.organization,
)
restricted_host_provider_ids = options.get(
"uptime.restrict-issue-creation-by-hosting-provider-id"
)
host_provider_id = uptime_subscription.host_provider_id
host_provider_enabled = host_provider_id not in restricted_host_provider_ids

issue_creation_allowed = (
detector_issue_creation_enabled
and issue_creation_flag_enabled
and host_provider_enabled
)
Comment on lines +172 to +193
Member Author: This is where we check a few different feature flags and options to bail out of issue creation, the same way we do in the consumer.


# XXX(epurkhiser): We currently are duplicating the detector state onto
# the uptime_subscription when the detector changes state. Once we stop
# using this field we can drop this update logic.
#
# We ONLY do this when detector issue creation is enabled, otherwise we
# let the legacy uptime consumer handle this.
if detector_issue_creation_enabled:
if evaluation.priority == DetectorPriorityLevel.OK:
uptime_status = UptimeStatus.OK
else:
uptime_status = UptimeStatus.FAILED

uptime_subscription.update(
uptime_status=uptime_status,
uptime_status_update_date=django_timezone.now(),
)

if not host_provider_enabled:
metrics.incr(
"uptime.result_processor.restricted_by_provider",
sample_rate=1.0,
tags={
"host_provider_id": host_provider_id,
**metric_tags,
},
)

result_creates_issue = isinstance(evaluation.result, IssueOccurrence)
result_resolves_issue = isinstance(evaluation.result, StatusChangeMessage)

if result_creates_issue:
Contributor: All these metrics seem like they should be provided by the platform
Member Author: Yeah, I agree. But I don't want to lose the metrics we have right now
Contributor (@saponifi3d, May 30, 2025): For the most part these metrics are provided by the platform: datadog, code. We will make a metric to denote that we're processing a detector, and whether the detector has a state change. Do we need to split out the difference between creating an issue and resolving an issue? The platform was just merging them into "a threshold was breached", but if we want separate tracking for those cases it should be fairly trivial to add.
Member Author: I can clean this all up later; I just wanted to make sure we were not losing any metrics as we cut over from our legacy issue creation system
Member Author: It's also valuable for us to have various metric tags. Probably something worth talking about for the handler APIs

metrics.incr(
"uptime.detector.will_create_issue",
tags=metric_tags,
sample_rate=1.0,
)
# XXX(epurkhiser): This logging includes the same extra arguments
# as the `uptime_active_sent_occurrence` log in the consumer for
# legacy creation
logger.info(
"uptime.detector.will_create_issue",
extra={
"project_id": self.detector.project_id,
"url": uptime_subscription.url,
**data_packet.packet.check_result,
},
)
if result_resolves_issue:
metrics.incr(
"uptime.detector.will_resolve_issue",
sample_rate=1.0,
tags=metric_tags,
)
logger.info(
"uptime.detector.will_resolve_issue",
extra={
"project_id": self.detector.project_id,
"url": uptime_subscription.url,
**data_packet.packet.check_result,
},
)

# Returning an empty dict effectively causes the detector processor to
# bail and not produce an issue occurrence.
if result_creates_issue and not issue_creation_allowed:
return {}

return result
Comment on lines +172 to +262
Contributor: Curious if y'all have any thoughts about this evaluation bit -- is there anything for you that would make this easier to implement? Think it's okay to just invoke the super and handle it like this?
Member Author: For now I think this is fine. Once we remove a lot of the legacy issue creation code for uptime, some of this is going to go away and it will be clearer what can be factored and abstracted.


@override
def create_occurrence(
self,
evaluation_result: ProcessedDataConditionGroup,
data_packet: DataPacket[UptimePacketValue],
priority: DetectorPriorityLevel,
) -> tuple[DetectorOccurrence, EventData]:
result = data_packet.packet.check_result
uptime_subscription = data_packet.packet.subscription

occurrence = DetectorOccurrence(
issue_title=f"Downtime detected for {uptime_subscription.url}",
subtitle="Your monitored domain is down",
evidence_display=build_evidence_display(result),
type=UptimeDomainCheckFailure,
level="error",
culprit="", # TODO: The url?
assignee=self.detector.owner,
priority=priority,
)
event_data = build_event_data(result, self.detector)

return (occurrence, event_data)


@dataclass(frozen=True)
@@ -24,6 +298,7 @@ class UptimeDomainCheckFailure(GroupType):
enable_auto_resolve = False
enable_escalation_detection = False
detector_settings = DetectorSettings(
handler=UptimeDetectorHandler,
config_schema={
"$schema": "https://json-schema.org/draft/2020-12/schema",
"description": "A representation of an uptime alert",