Skip to content

Commit a8bf618

Browse files
feat: MTTR calculation support (#1443)
OSS now has an implementation of the MTTR calculation: (1) total incidents for each alert; (2) mean and median recovery time (in seconds); (3) minimum and maximum of all recovery times; (4) aggregated MTTR results across all alerts.
1 parent 4c3b85d commit a8bf618

File tree

2 files changed

+196
-24
lines changed

2 files changed

+196
-24
lines changed

src/alerts/alert_structs.rs

Lines changed: 176 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -515,21 +515,140 @@ impl AlertQueryResult {
515515
}
516516
}
517517

518-
#[derive(serde::Deserialize)]
518+
#[derive(Deserialize)]
519519
pub struct NotificationStateRequest {
520520
pub state: String,
521521
}
522522

523+
/// MTTR (Mean Time To Recovery) statistics for a set of recovery times.
///
/// Serialized and deserialized with camelCase field names (for example
/// `total_incidents` becomes `totalIncidents`).
524+
#[derive(Debug, Clone, Serialize, Deserialize)]
525+
#[serde(rename_all = "camelCase")]
526+
pub struct MTTRStats {
527+
/// Total number of incidents (triggered -> not-triggered cycles).
/// Zero means no incident was recorded.
528+
pub total_incidents: usize,
529+
/// Mean recovery time in seconds (`0.0` in the default, empty value).
530+
pub mean_seconds: f64,
531+
/// Median recovery time in seconds (`0.0` in the default, empty value).
532+
pub median_seconds: f64,
533+
/// Minimum recovery time in seconds (`0.0` in the default, empty value).
534+
pub min_seconds: f64,
535+
/// Maximum recovery time in seconds (`0.0` in the default, empty value).
536+
pub max_seconds: f64,
537+
/// All individual recovery times in whole seconds, one entry per incident.
538+
pub recovery_times_seconds: Vec<i64>,
539+
}
540+
541+
impl MTTRStats {
542+
/// Check if there are no incidents recorded
543+
pub fn is_empty(&self) -> bool {
544+
self.total_incidents == 0
545+
}
546+
547+
/// Create MTTRStats from a list of recovery times
548+
pub fn from_recovery_times(recovery_times: Vec<i64>) -> MTTRStats {
549+
if recovery_times.is_empty() {
550+
return MTTRStats::default();
551+
}
552+
553+
let total_incidents = recovery_times.len();
554+
let total_recovery_time: i64 = recovery_times.iter().sum();
555+
let mean_seconds = total_recovery_time as f64 / total_incidents as f64;
556+
557+
let min_seconds = *recovery_times.iter().min().unwrap() as f64;
558+
let max_seconds = *recovery_times.iter().max().unwrap() as f64;
559+
560+
// Calculate median
561+
let median_seconds = if total_incidents == 1 {
562+
recovery_times[0] as f64
563+
} else {
564+
let mut sorted_times = recovery_times.clone();
565+
sorted_times.sort_unstable();
566+
567+
if total_incidents.is_multiple_of(2) {
568+
let mid = total_incidents / 2;
569+
(sorted_times[mid - 1] + sorted_times[mid]) as f64 / 2.0
570+
} else {
571+
sorted_times[total_incidents / 2] as f64
572+
}
573+
};
574+
575+
MTTRStats {
576+
total_incidents,
577+
mean_seconds,
578+
median_seconds,
579+
min_seconds,
580+
max_seconds,
581+
recovery_times_seconds: recovery_times,
582+
}
583+
}
584+
}
585+
586+
impl Default for MTTRStats {
587+
fn default() -> Self {
588+
Self {
589+
total_incidents: 0,
590+
mean_seconds: 0.0,
591+
median_seconds: 0.0,
592+
min_seconds: 0.0,
593+
max_seconds: 0.0,
594+
recovery_times_seconds: Vec::new(),
595+
}
596+
}
597+
}
598+
599+
/// Aggregated MTTR statistics across multiple alerts.
///
/// Serialized and deserialized with camelCase field names (for example
/// `total_alerts` becomes `totalAlerts`).
600+
#[derive(Debug, Clone, Serialize, Deserialize)]
601+
#[serde(rename_all = "camelCase")]
602+
pub struct AggregatedMTTRStats {
603+
/// Overall MTTR statistics computed over the recovery times of all alerts
/// pooled together.
604+
pub overall: MTTRStats,
605+
/// Number of alerts included in the calculation, with or without incidents.
606+
pub total_alerts: usize,
607+
/// Number of alerts that had at least one incident.
608+
pub alerts_with_incidents: usize,
609+
/// Per-alert breakdown for detailed analysis, keyed by the alert id
/// rendered as a string. `from_alert_states` only inserts alerts that had
/// incidents, so the map may hold fewer entries than `total_alerts`.
610+
pub per_alert_stats: HashMap<String, MTTRStats>,
611+
}
612+
613+
impl AggregatedMTTRStats {
614+
/// Calculate aggregated MTTR stats from multiple alert state entries
615+
pub fn from_alert_states(alert_states: Vec<AlertStateEntry>) -> Self {
616+
let mut all_recovery_times = Vec::new();
617+
let mut per_alert_stats = HashMap::new();
618+
let mut alerts_with_incidents = 0;
619+
620+
for alert_state in &alert_states {
621+
let alert_stats = alert_state.get_mttr_stats();
622+
623+
if !alert_stats.is_empty() {
624+
alerts_with_incidents += 1;
625+
all_recovery_times.extend(alert_stats.recovery_times_seconds.iter());
626+
627+
per_alert_stats.insert(alert_state.alert_id.to_string(), alert_stats);
628+
}
629+
}
630+
631+
let overall = MTTRStats::from_recovery_times(all_recovery_times);
632+
633+
Self {
634+
overall,
635+
total_alerts: alert_states.len(),
636+
alerts_with_incidents,
637+
per_alert_stats,
638+
}
639+
}
640+
}
641+
523642
/// Represents a single state transition
524-
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
643+
#[derive(Debug, Clone, Serialize, Deserialize)]
525644
pub struct StateTransition {
526645
/// The alert state
527646
pub state: AlertState,
528647
/// Timestamp when this state was set/updated
529648
pub last_updated_at: DateTime<Utc>,
530649
}
531650

532-
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
651+
#[derive(Debug, Clone, Serialize, Deserialize)]
533652
pub struct AlertStateEntry {
534653
/// The unique identifier for the alert
535654
pub alert_id: Ulid,
@@ -581,6 +700,60 @@ impl AlertStateEntry {
581700
/// Returns the last entry of `states`, or `None` when `states` is empty.
///
/// NOTE(review): this treats the final element as the current state, which
/// presumably relies on transitions being appended in chronological order —
/// confirm against `update_state`.
pub fn current_state(&self) -> Option<&StateTransition> {
582701
self.states.last()
583702
}
703+
704+
/// Get all recovery times (in seconds) from triggered to not-triggered
705+
/// Returns recovery times in chronological order
706+
pub fn get_recovery_times(&self) -> Vec<i64> {
707+
let mut recovery_times = Vec::new();
708+
let mut trigger_time: Option<DateTime<Utc>> = None;
709+
710+
// Create a sorted view without mutating the original
711+
let mut sorted_states = self.states.clone();
712+
sorted_states.sort_by(|a, b| a.last_updated_at.cmp(&b.last_updated_at));
713+
714+
for transition in &sorted_states {
715+
match transition.state {
716+
AlertState::Triggered => {
717+
// Record when alert was triggered
718+
trigger_time = Some(transition.last_updated_at);
719+
}
720+
AlertState::NotTriggered => {
721+
// If we have a trigger time, calculate recovery time
722+
if let Some(triggered_at) = trigger_time {
723+
let recovery_duration = transition
724+
.last_updated_at
725+
.signed_duration_since(triggered_at);
726+
let recovery_seconds = recovery_duration.num_seconds();
727+
728+
// Only include positive durations (validation against clock issues)
729+
if recovery_seconds > 0 {
730+
recovery_times.push(recovery_seconds);
731+
} else {
732+
tracing::warn!(
733+
"Negative or zero recovery time detected: {} seconds. Triggered at: {}, Recovered at: {}",
734+
recovery_seconds,
735+
triggered_at,
736+
transition.last_updated_at
737+
);
738+
}
739+
trigger_time = None; // Reset for next cycle
740+
}
741+
}
742+
AlertState::Disabled => {
743+
// Ignore disabled state - it doesn't affect MTTR calculation
744+
// until it's explicitly resolved (moves to not-triggered)
745+
}
746+
}
747+
}
748+
749+
recovery_times
750+
}
751+
752+
/// This is the method that is used for MTTR statistics
753+
pub fn get_mttr_stats(&self) -> MTTRStats {
754+
let recovery_times = self.get_recovery_times();
755+
MTTRStats::from_recovery_times(recovery_times)
756+
}
584757
}
585758

586759
impl MetastoreObject for AlertStateEntry {

src/metastore/metastores/object_store_metastore.rs

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -173,31 +173,30 @@ impl Metastore for ObjectStoreMetastore {
173173
})?
174174
.state;
175175

176-
// Try to read existing file
177-
let mut alert_entry = match self.storage.get_object(&path).await {
178-
Ok(existing_bytes) => {
179-
if let Ok(entry) = serde_json::from_slice::<AlertStateEntry>(&existing_bytes) {
180-
entry
181-
} else {
182-
// Create new entry if parsing fails or file doesn't exist
183-
AlertStateEntry::new(id, new_state)
176+
// Try to read and parse existing file
177+
if let Ok(existing_bytes) = self.storage.get_object(&path).await {
178+
// File exists - try to parse and update
179+
if let Ok(mut existing_entry) =
180+
serde_json::from_slice::<AlertStateEntry>(&existing_bytes)
181+
{
182+
// Update the state and only save if it actually changed
183+
let state_changed = existing_entry.update_state(new_state);
184+
185+
if state_changed {
186+
let updated_bytes = serde_json::to_vec(&existing_entry)
187+
.map_err(MetastoreError::JsonParseError)?;
188+
189+
self.storage.put_object(&path, updated_bytes.into()).await?;
184190
}
191+
return Ok(());
185192
}
186-
Err(_) => {
187-
// File doesn't exist, create new entry
188-
AlertStateEntry::new(id, new_state)
189-
}
190-
};
191-
192-
// Update the state and only save if it actually changed
193-
let state_changed = alert_entry.update_state(new_state);
193+
}
194194

195-
if state_changed {
196-
let updated_bytes =
197-
serde_json::to_vec(&alert_entry).map_err(MetastoreError::JsonParseError)?;
195+
// Create and save new entry (either file didn't exist or parsing failed)
196+
let new_entry = AlertStateEntry::new(id, new_state);
197+
let new_bytes = serde_json::to_vec(&new_entry).map_err(MetastoreError::JsonParseError)?;
198198

199-
self.storage.put_object(&path, updated_bytes.into()).await?;
200-
}
199+
self.storage.put_object(&path, new_bytes.into()).await?;
201200

202201
Ok(())
203202
}

0 commit comments

Comments
 (0)