Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions app/config/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,7 @@ class IncidentTimeouts(BaseModel):
firing: Optional[str] = Field("6h", description="Firing timeout")
unknown: Optional[str] = Field("6h", description="Unknown timeout")
resolved: Optional[str] = Field("12h", description="Resolved timeout")
closed: Optional[str] = Field("90d", description="Closed timeout")

def get(self, key: str) -> str:
return getattr(self, key) or None
Expand Down
4 changes: 2 additions & 2 deletions app/im/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ async def notify(self, incident, notify_type, identifier):
logger.info(f'Incident {incident.uuid} -> chain step {notify_type} \'{identifier}\'')
return response_code

async def update(self, uuid_, incident, incident_status, alert_state, updated_status, chain_enabled,
async def update(self, incident, incident_status, alert_state, updated_status, chain_enabled,
status_enabled):
body = self.body_template.form_message(alert_state, incident)
header = self.header_template.form_message(alert_state, incident)
Expand All @@ -208,7 +208,7 @@ async def update(self, uuid_, incident, incident_status, alert_state, updated_st
incident.channel_id, incident.ts, incident_status, body, header, status_icons, chain_enabled, status_enabled
)
if updated_status:
logger.info(f'Incident {uuid_} updated with new status \'{incident_status}\'')
logger.info(f'Incident {incident.uuid} updated with new status \'{incident_status}\'')
# post to thread
if status_enabled and incident_status != 'closed':
header = self.header_template.form_message(incident.payload, incident)
Expand Down
2 changes: 1 addition & 1 deletion app/im/null/null_application.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ async def update_thread(self, channel_id, id_, status, body, header, status_icon
"""No thread updating for null application"""
pass

async def update(self, uuid_, incident, incident_status, alert_state, updated_status, chain_enabled, status_enabled):
async def update(self, incident, incident_status, alert_state, updated_status, chain_enabled, status_enabled):
"""No message updates for null application"""
pass

Expand Down
2 changes: 1 addition & 1 deletion app/im/telegram/telegram_application.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from app.im.telegram.user import User
from app.logging import logger
from app.config.config import get_config
from app.config.validation import ApplicationConfig, TelegramUser
from app.config.validation import ApplicationConfig


class TelegramApplication(Application):
Expand Down
6 changes: 0 additions & 6 deletions app/incident/helpers.py

This file was deleted.

52 changes: 42 additions & 10 deletions app/incident/incident.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import List, Dict, Optional
import json
import uuid

import yaml

from app.config.config import get_config
from app.config.validation import MessengerType
from app.im.channel_manager import ChannelManager
from app.incident.helpers import gen_uuid
from app.logging import logger
from app.time import unix_sleep_to_timedelta
from app.tools import NoAliasDumper
Expand Down Expand Up @@ -39,18 +40,33 @@ class Incident:
updated: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
created: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
version: str = get_config().INCIDENT_ACTUAL_VERSION
uniq_id: str = field(default=None)
uuid: str = field(init=False)
ts: str = field(default='')
link: str = field(default='')
closed: Optional[datetime] = field(default=None)

next_status = {
'firing': 'unknown',
'unknown': 'closed',
'resolved': 'closed'
'resolved': 'closed',
'closed': 'deleted'
}

@staticmethod
def gen_uuid(group_labels: Dict) -> str:
return str(uuid.uuid5(uuid.NAMESPACE_OID, json.dumps(group_labels)))

@staticmethod
def gen_uniq_id(group_labels: Dict, datetime_: datetime) -> str:
return str(uuid.uuid5(
uuid.NAMESPACE_OID,
json.dumps({'group_labels': group_labels, 'datetime': datetime_.isoformat()})
))

def __post_init__(self):
self.uuid = gen_uuid(self.payload.get('groupLabels'))
self.uuid = self.gen_uuid(self.payload.get('groupLabels'))
self.uniq_id = self.gen_uniq_id(self.payload.get('groupLabels'), self.created)
if not self.created:
self.created = datetime.now(timezone.utc)

Expand Down Expand Up @@ -163,10 +179,6 @@ def chain_update(self, index: int, done: bool, result: Optional[str]):
self.chain[index]['result'] = result
self.dump()

def set_next_status(self):
new_status = Incident.next_status[self.status]
return self.update_status(new_status)

@classmethod
def load(cls, dump_file: str, incident_config: IncidentConfig):
config = get_config()
Expand All @@ -179,6 +191,7 @@ def load(cls, dump_file: str, incident_config: IncidentConfig):
config=incident_config,
chain=content.get('chain', []),
chain_enabled=content.get('chain_enabled', False),
closed=content.get('closed', None),
status_enabled=content.get('status_enabled', False),
status_update_datetime=content.get('status_update_datetime'),
updated=content.get('updated'),
Expand All @@ -187,6 +200,7 @@ def load(cls, dump_file: str, incident_config: IncidentConfig):
assigned_user=content.get('assigned_user', ''),
assigned_fullname=content.get('assigned_fullname', ''),
messenger_type=content.get('messenger_type', ''),
uniq_id=content.get('uniq_id', ''),
version=content.get('version', config.INCIDENT_ACTUAL_VERSION)
)
incident_.set_thread(content.get('ts'), incident_config.application_url)
Expand All @@ -198,6 +212,7 @@ def dump(self):
"chain_enabled": self.chain_enabled,
"chain": self.chain,
"channel_id": self.channel_id,
"closed": self.closed,
"payload": self.payload,
"status_enabled": self.status_enabled,
"status_update_datetime": self.status_update_datetime,
Expand All @@ -209,10 +224,16 @@ def dump(self):
"assigned_user": self.assigned_user,
"assigned_fullname": self.assigned_fullname,
"messenger_type": self.messenger_type,
"uniq_id": self.uniq_id,
"version": self.version
}
try:
with open(f'{config.incidents_path}/{self.uuid}.yml', 'w') as f:
if self.status == 'closed' or self.status == 'deleted':
closed_str = self.datetime_serialize(self.closed)
incident_filename = f'{config.incidents_path}/{self.uuid}__{closed_str}.yml'
else:
incident_filename = f'{config.incidents_path}/{self.uuid}.yml'
with open(incident_filename, 'w') as f:
yaml.dump(data, f, NoAliasDumper, default_flow_style=False)
except (OSError, PermissionError, FileNotFoundError) as e:
logger.error(f'Failed to write incident file for {self.uuid}: {str(e)}')
Expand All @@ -232,6 +253,7 @@ def serialize(self) -> Dict:
"chain": self.chain,
"channel_id": self.channel_id,
"channel_name": ChannelManager().get_channel_name_by_id(self.channel_id),
"closed": self.closed,
"payload": self.payload,
"status_enabled": self.status_enabled,
"status_update_datetime": self.status_update_datetime,
Expand All @@ -244,6 +266,8 @@ def serialize(self) -> Dict:
"messenger_type": self.messenger_type,
"link": self.link,
"ts": self.ts,
"uuid": str(self.uuid),
"uniq_id": self.uniq_id,
}

def get_table_data(self, params) -> Dict:
Expand Down Expand Up @@ -296,13 +320,12 @@ def get_table_data(self, params) -> Dict:
def update_status(self, status: str) -> bool:
now = datetime.now(timezone.utc)
self.updated = now
if status != 'closed':
if status != 'deleted':
config = get_config()
timeout_value = config.incident.timeouts.get(status)
self.status_update_datetime = now + unix_sleep_to_timedelta(timeout_value)
if self.status != status:
self.set_status(status)
logger.debug(f'Incident {self.uuid} status updated to {status}')
self.dump()
return True
self.dump()
Expand All @@ -318,6 +341,9 @@ def update_state(self, alert_state: Dict) -> tuple[bool, bool]:

def set_status(self, status: str):
self.status = status
logger.debug(f'Incident {self.uuid} status set to {status}')
if status == 'closed' and not self.closed:
self.closed = datetime.now(timezone.utc)

def assign_user_id(self, user_id: str):
self.assigned_user_id = user_id
Expand All @@ -341,3 +367,9 @@ def is_some_firing_alerts_removed(self, alert_state: Dict) -> bool:
@staticmethod
def _get_firing_alerts_labels(alert_state):
return [a.get('labels') for a in alert_state['alerts'] if a['status'] == 'firing']

@staticmethod
def datetime_serialize(datetime_: Optional[datetime]) -> str:
if datetime_ is None:
return ''
return datetime_.strftime('%Y_%m_%d__%H_%M_%S')
76 changes: 47 additions & 29 deletions app/incident/incidents.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import yaml
from typing import Dict, Union

from app.incident.helpers import gen_uuid
from app.incident.incident import Incident, IncidentConfig
from app.incident.migrator import IncidentMigrator
from app.logging import logger
Expand All @@ -12,44 +11,59 @@

class Incidents:
def __init__(self, incidents_list):
self.by_uuid: Dict[str, Incident] = {i.uuid: i for i in incidents_list}
self.active_map: Dict[str, str] = {} # {uuid: uniq_id}
self.uniq_ids: Dict[str, Incident] = {}
for i in incidents_list:
self.uniq_ids[i.uniq_id] = i
if i.status != 'closed':
self.active_map[i.uuid] = i.uniq_id

def get(self, alert: Dict) -> Union[Incident, None]:
uuid_ = gen_uuid(alert.get('groupLabels'))
return self.by_uuid.get(uuid_)
uuid = Incident.gen_uuid(alert.get('groupLabels'))
return self.get_by_uuid(uuid)

def get_by_uuid(self, uuid: str) -> Union[Incident, None]:
uniq_id = self.active_map.get(uuid)
return self.uniq_ids.get(uniq_id)

def get_by_ts(self, ts: str) -> Union[Incident, None]:
return next((incident for incident in self.by_uuid.values() if incident.ts == ts), None)
for uuid_ in self.active_map.values():
incident = self.uniq_ids.get(uuid_)
if incident and incident.ts == ts:
return incident
return None

def get_assigned_user_by_id(self, user_id: str) -> Union[str, None]:
"""
Get the assigned_user (full name) from any existing incident with the same user_id.
This serves as a cache to avoid redundant API calls for user name lookup.

Args:
user_id: The user ID to search for

Returns:
The full name if found in any existing incident, None otherwise
"""
for incident in self.by_uuid.values():
for incident in self.uniq_ids.values():
if incident.assigned_user_id == user_id and incident.assigned_fullname and incident.assigned_fullname != "-":
return incident.assigned_fullname

return None

def remove_from_active_map(self, uuid: str):
if uuid in self.active_map:
del self.active_map[uuid]

def add(self, incident: Incident):
self.by_uuid[incident.uuid] = incident
self.uniq_ids[incident.uniq_id] = incident
if incident.status != 'closed':
self.active_map[incident.uuid] = incident.uniq_id

def del_by_uuid(self, uuid_: str):
def remove_file(self, incident: Incident):
config = get_config()
incident = self.by_uuid.pop(uuid_, None)
self.remove_from_active_map(incident.uuid)
try:
if incident.status == 'closed' or incident.status == 'deleted':
closed_str = Incident.datetime_serialize(incident.closed)
os.remove(f'{config.incidents_path}/{incident.uuid}__{closed_str}.yml')
else:
os.remove(f'{config.incidents_path}/{incident.uuid}.yml')
except (OSError, PermissionError, FileNotFoundError) as e:
logger.error(f'Failed to delete incident file for uuid: {incident.uuid}: {str(e)}')

def del_by_uniq_id(self, uniq_id: str):
incident = self.uniq_ids.pop(uniq_id, None)
if incident:
try:
os.remove(f'{config.incidents_path}/{uuid_}.yml')
logger.info(f'Incident {uuid_} closed. Link: {incident.link}')
except (OSError, PermissionError, FileNotFoundError) as e:
logger.error(f'Failed to delete incident file for uuid: {uuid_}: {str(e)}')
self.remove_file(incident)
# Schedule async websocket update
import asyncio
try:
Expand All @@ -59,14 +73,15 @@ def del_by_uuid(self, uuid_: str):
except RuntimeError:
# No event loop running, skip websocket update
pass
logger.info(f'Incident {incident.uuid} deleted')
else:
logger.warning(f'Incident with uuid: {uuid_} not found in the collection.')
logger.warning(f'Incident with uuid {incident.uuid} not found in the collection.')

def serialize(self) -> Dict[str, Dict]:
return {str(uuid_): incident.serialize() for uuid_, incident in self.by_uuid.items()}
return {str(uuid_): incident.serialize() for uuid_, incident in self.uniq_ids.items()}

def get_table(self, params):
return [incident.get_table_data(params) for incident in self.by_uuid.values()]
return [incident.get_table_data(params) for incident in self.uniq_ids.values()]

@classmethod
def create_or_load(cls, application_type, application_url, application_team):
Expand Down Expand Up @@ -97,7 +112,10 @@ def create_or_load(cls, application_type, application_url, application_team):
incident_config=incident_config
)
if incident_.messenger_type == config.messenger.type.value:
incidents.add(incident_)
if incident_.status != 'deleted':
incidents.add(incident_)
else:
os.remove(file_path)
else:
logger.warning(f'Skipping incident {filename}: messenger_type mismatch')

Expand Down
8 changes: 7 additions & 1 deletion app/incident/migrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from app.logging import logger
from app.tools import NoAliasDumper
from app.config.config import get_config
from app.incident.incident import Incident


class IncidentMigrator:
Expand Down Expand Up @@ -47,7 +48,7 @@ def migrate_file(self, file_path: str, incident_data: Dict, current_version: str
try:
with open(file_path, 'w') as f:
yaml.dump(migrated_data, f, NoAliasDumper, default_flow_style=False)
except (OSError, PermissionError, FileNotFoundError) as e:
except (OSError, PermissionError, FileNotFoundError) as e:
logger.error(f'Failed to write migrated incident file {os.path.basename(file_path)}: {str(e)}')

logger.info(f'Successfully migrated {os.path.basename(file_path)}')
Expand Down Expand Up @@ -152,4 +153,9 @@ def _migrate_v3_0_0_to_v3_2_0(self, data: Dict) -> Dict:
new_chain.append(step_copy)
migrated['chain'] = new_chain

migrated['uniq_id'] = Incident.gen_uniq_id(
migrated.get('payload', {}).get('groupLabels', {}),
migrated.get('created')
)

return migrated
10 changes: 5 additions & 5 deletions app/queue/handlers/alert_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@ async def _handle_create(self, alert_state):

self.incidents.add(incident_)

await self.queue.put(status_update_datetime, 'update_status', incident_.uuid)
await self.queue.put(status_update_datetime, 'update_status', incident_.uniq_id)

incident_.generate_chain(self.app.chains, chain_name)
await self.queue.recreate(status, incident_.uuid, incident_.chain)
await self.queue.recreate(status, incident_.uniq_id, incident_.chain)

async def _handle_update(self, uuid_, incident_, alert_state):
config = get_config()
Expand All @@ -89,7 +89,7 @@ async def _handle_update(self, uuid_, incident_, alert_state):
_, chain_name = self.route.get_route(alert_state)
incident_.generate_chain(self.app.chains, chain_name)

await self.queue.recreate(alert_state.get('status'), uuid_, incident_.get_chain())
await self.queue.recreate(alert_state.get('status'), incident_.uniq_id, incident_.get_chain())

# Check new alerts firing or old alerts resolved
if config.incident.notifications.new_firing:
Expand All @@ -100,13 +100,13 @@ async def _handle_update(self, uuid_, incident_, alert_state):

if is_state_updated or is_status_updated:
await self.app.update(
uuid_, incident_, alert_state['status'], alert_state, is_status_updated,
incident_, alert_state['status'], alert_state, is_status_updated,
incident_.chain_enabled, incident_.status_enabled
)

if prev_status == 'firing' and incident_.status == 'firing' and (is_new_firing_alerts_added or is_some_firing_alerts_removed) and incident_.status_enabled:
await self._notify_new_fire_alert(incident_, is_new_firing_alerts_added, is_some_firing_alerts_removed, uuid_)
await self.queue.update(uuid_, incident_.status_update_datetime, incident_.status)
await self.queue.update(incident_.uniq_id, incident_.status_update_datetime, incident_.status)

async def _notify_new_fire_alert(self, incident_, new_alerts_f, new_alerts_r, uuid_):
"""
Expand Down
Loading