@@ -74,61 +74,30 @@ async def run(self, states: List[State]):
7474 network = self .config ["network" ]["name" ], publishers = self .publishers
7575 )
7676
77+ current_time = datetime .now ()
7778 for check in failed_checks :
7879 for event_type in self .config ["events" ]:
7980 event : Event = globals ()[event_type ](check , context )
8081
8182 if event_type == "ZendutyEvent" :
82- # Add failed check to open alerts
83- alert_identifier = (
84- f"{ check .__class__ .__name__ } -{ check .state ().symbol } "
85- )
86- state = check .state ()
87- if isinstance (state , PublisherState ):
88- alert_identifier += f"-{ state .publisher_name } "
89- try :
90- failures = self .open_alerts [alert_identifier ]["failures" ] + 1
91- except KeyError :
92- failures = 1
93- self .open_alerts [alert_identifier ] = {
94- "last_failure" : datetime .now ().isoformat (),
95- "failures" : failures ,
96- }
97- # store the event to send it later if it fails multiple times
83+ alert_identifier = self .generate_alert_identifier (check )
84+ alert = self .open_alerts .get (alert_identifier )
85+ if alert is None :
86+ self .open_alerts [alert_identifier ] = {
87+ "window_start" : current_time .isoformat (),
88+ "failures" : 1 ,
89+ "last_window_failures" : None ,
90+ "sent" : False ,
91+ }
92+ else :
93+ alert ["failures" ] += 1
9894 self .zenduty_events [alert_identifier ] = event
99- continue # do not immediately send a zenduty alert
95+ continue # Skip sending immediately for ZendutyEvent
10096
10197 sent_events .append (event .send ())
10298
10399 await asyncio .gather (* sent_events )
104-
105- # Check open alerts for zenduty
106- if "ZendutyEvent" in self .config ["events" ]:
107-
108- to_remove = []
109- current_time = datetime .now ()
110- for identifier , info in self .open_alerts .items ():
111- # Resolve the alert if it last failed > 2 minutes ago
112- if current_time - datetime .fromisoformat (
113- info ["last_failure" ]
114- ) >= timedelta (minutes = 2 ):
115- logger .debug (f"Resolving Zenduty alert { identifier } " )
116- response = await send_zenduty_alert (
117- alert_identifier = identifier , message = identifier , resolved = True
118- )
119- if response and 200 <= response .status < 300 :
120- to_remove .append (identifier )
121- elif info ["failures" ] > 2 :
122- # Raise alert if the check has failed more than twice before self-resolving
123- await self .zenduty_events [identifier ].send ()
124-
125- for identifier in to_remove :
126- del self .open_alerts [identifier ]
127- del self .zenduty_events [identifier ]
128-
129- # Write open alerts to file to ensure persistence
130- with open (self .open_alerts_file , "w" ) as file :
131- json .dump (self .open_alerts , file )
100+ await self .process_zenduty_events (current_time )
132101
133102 def check_price_feed (self , state : PriceFeedState ) -> List [Check ]:
134103 failed_checks : List [Check ] = []
@@ -179,3 +148,62 @@ def load_config(self, check_name: str, symbol: str) -> Dict[str, Any]:
179148 config |= self .config ["checks" ][symbol ][check_name ]
180149
181150 return config
151+
152+ # Zenduty Functions
def generate_alert_identifier(self, check) -> str:
    """Build the key under which a failed check is tracked as a Zenduty alert.

    The identifier is ``<CheckClassName>-<symbol>``; when the check's state is
    a ``PublisherState`` the publisher name is appended as a third component.

    Args:
        check: A failed check exposing ``state()``; the returned state must
            have a ``symbol`` attribute (and ``publisher_name`` when it is a
            ``PublisherState``).

    Returns:
        The alert identifier string.
    """
    # Fetch the state once and reuse it for both the symbol and the
    # publisher lookup instead of calling check.state() twice.
    state = check.state()
    alert_identifier = f"{check.__class__.__name__}-{state.symbol}"
    if isinstance(state, PublisherState):
        alert_identifier += f"-{state.publisher_name}"
    return alert_identifier
159+
def check_zd_alert_status(self, alert_identifier, current_time) -> None:
    """Roll the failure-counting window of one open alert if it has expired.

    When at least five minutes have passed since the alert's
    ``window_start``, the current ``failures`` count is moved into
    ``last_window_failures``, ``failures`` is reset to 0 and a new window
    starts at ``current_time``. Unknown identifiers are ignored.

    Args:
        alert_identifier: Key of the alert in ``self.open_alerts``.
        current_time: ``datetime`` to compare the window start against; its
            ISO string becomes the new ``window_start`` on rollover.
    """
    alert = self.open_alerts.get(alert_identifier)
    if alert is None:
        return

    window_start = alert.get("window_start")
    if window_start is None:
        # Entries written with the earlier {"last_failure", "failures"} layout
        # carry no window start; indexing it would raise KeyError. Begin a
        # window now and keep the recorded failure count.
        # NOTE(review): assumes such entries can reach here via a restored
        # alerts file -- confirm against the loading code.
        alert["window_start"] = current_time.isoformat()
        return

    window_age = current_time - datetime.fromisoformat(window_start)
    if window_age >= timedelta(minutes=5):
        # Window expired: archive its count and start counting afresh.
        alert["window_start"] = current_time.isoformat()
        alert["last_window_failures"] = alert.get("failures", 0)
        alert["failures"] = 0
170+
async def process_zenduty_events(self, current_time) -> None:
    """Resolve or raise Zenduty alerts for all open alerts, then persist them.

    For every entry in ``self.open_alerts`` the counting window is first
    rolled via ``check_zd_alert_status``. Then:

    * an alert that was already sent and whose previous window recorded
      fewer than 5 failures is resolved through ``send_zenduty_alert``; it
      is dropped from the bookkeeping only if that call returns a 2xx
      response;
    * otherwise, an alert with at least 5 failures in the current window is
      raised, and re-raised once more than 5 minutes have passed since
      ``last_alert``.

    Queued sends are awaited together, resolved alerts are removed, and
    ``self.open_alerts`` is written as JSON to ``self.open_alerts_file``.

    Args:
        current_time: ``datetime`` used for window and re-alert comparisons.
    """
    to_remove = []
    to_alert = []

    for identifier, info in self.open_alerts.items():
        self.check_zd_alert_status(identifier, current_time)

        # Read state defensively: entries that lack these keys (e.g. ones
        # written with an older layout) count as "not sent, no failures"
        # instead of raising KeyError.
        sent = info.get("sent", False)
        last_window_failures = info.get("last_window_failures")
        failures = info.get("failures", 0)
        last_alert = info.get("last_alert")

        # Resolve the alert if raised and failed < 5 times in the last 5m window.
        # NOTE(review): last_window_failures describes the *previous* window,
        # so an alert raised in the current window can be resolved on the very
        # next pass if the previous window was quiet -- confirm this is intended.
        if sent and last_window_failures is not None and last_window_failures < 5:
            logger.debug(f"Resolving Zenduty alert {identifier}")
            response = await send_zenduty_alert(identifier, identifier, resolved=True)
            if response and 200 <= response.status < 300:
                to_remove.append(identifier)
        # Raise the alert once it has failed >= 5 times within the current
        # window; re-alert when more than 5 minutes passed since the last one.
        elif failures >= 5 and (
            not last_alert
            or current_time - datetime.fromisoformat(last_alert) > timedelta(minutes=5)
        ):
            event = self.zenduty_events.get(identifier)
            if event is None:
                # Only open_alerts is persisted below; an alert can therefore
                # exist without an in-memory event. Skip it rather than crash
                # the whole pass with a KeyError, and leave it unmarked so it
                # is retried once the check fails again and stores an event.
                logger.warning(
                    f"No stored Zenduty event for alert {identifier}; skipping"
                )
                continue
            logger.debug(f"Raising Zenduty alert {identifier}")
            info["sent"] = True
            info["last_alert"] = current_time.isoformat()
            to_alert.append(event.send())

    await asyncio.gather(*to_alert)

    # pop() removes the entry whether or not its value is truthy and is a
    # no-op when the key is already gone.
    for identifier in to_remove:
        self.open_alerts.pop(identifier, None)
        self.zenduty_events.pop(identifier, None)

    # Persist the alert state after every pass.
    with open(self.open_alerts_file, "w") as file:
        json.dump(self.open_alerts, file)
0 commit comments