Skip to content

Commit 0afc991

Browse files
chore: resource check on 2-min interval
Average CPU and memory utilisation over a 2-minute rolling window before deciding whether to reject requests
1 parent db6ba4b commit 0afc991

File tree

1 file changed

+187
-23
lines changed

1 file changed

+187
-23
lines changed

src/handlers/http/resource_check.rs

Lines changed: 187 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616
*
1717
*/
1818

19-
use std::sync::{Arc, LazyLock, atomic::AtomicBool};
19+
use std::collections::VecDeque;
20+
use std::sync::{Arc, LazyLock, Mutex, atomic::AtomicBool};
2021

2122
use actix_web::{
2223
body::MessageBody,
@@ -34,8 +35,75 @@ use tracing::{info, trace, warn};
3435
use crate::analytics::{SYS_INFO, refresh_sys_info};
3536
use crate::parseable::PARSEABLE;
3637

38+
/// One utilisation reading recorded by the resource monitor.
///
/// The monitor passes percentages for both values, but this type does not
/// enforce any range.
#[derive(Debug, Clone, Copy)]
struct ResourceSample {
    /// CPU utilisation at the time of the reading.
    cpu_usage: f32,
    /// Memory utilisation at the time of the reading.
    memory_usage: f32,
    /// Monotonic time at which the reading was recorded.
    timestamp: std::time::Instant,
}

/// Structure to maintain rolling average of resource utilization
///
/// Samples are kept oldest-first; every insertion evicts the samples that
/// have become older than `window_duration`.
struct ResourceHistory {
    /// Retained samples, oldest at the front.
    samples: VecDeque<ResourceSample>,
    /// Maximum age a sample may reach before it is evicted.
    window_duration: Duration,
}

impl ResourceHistory {
    /// Creates an empty history that retains samples for `window_duration`.
    fn new(window_duration: Duration) -> Self {
        Self {
            samples: VecDeque::new(),
            window_duration,
        }
    }

    /// Records a reading stamped with the current instant, then drops every
    /// sample that is older than the window.
    fn add_sample(&mut self, cpu_usage: f32, memory_usage: f32) {
        let now = std::time::Instant::now();
        self.samples.push_back(ResourceSample {
            cpu_usage,
            memory_usage,
            timestamp: now,
        });

        // Compare each sample's age against the window instead of computing
        // `now - window`: subtracting a `Duration` from an `Instant` panics
        // when the result is not representable (very large window, or a
        // monotonic clock that started recently). The eviction rule itself
        // is unchanged: a sample goes once it is strictly older than the
        // window.
        let window = self.window_duration;
        while self
            .samples
            .front()
            .is_some_and(|oldest| now.saturating_duration_since(oldest.timestamp) > window)
        {
            self.samples.pop_front();
        }
    }

    /// Returns the arithmetic mean `(cpu, memory)` of the retained samples,
    /// or `None` when no sample has been recorded.
    fn get_average(&self) -> Option<(f32, f32)> {
        if self.samples.is_empty() {
            return None;
        }

        let count = self.samples.len() as f32;
        let (total_cpu, total_memory) = self
            .samples
            .iter()
            .fold((0.0_f32, 0.0_f32), |(cpu_acc, mem_acc), sample| {
                (cpu_acc + sample.cpu_usage, mem_acc + sample.memory_usage)
            });

        Some((total_cpu / count, total_memory / count))
    }

    /// Number of samples currently retained in the window.
    fn sample_count(&self) -> usize {
        self.samples.len()
    }
}
101+
37102
static RESOURCE_CHECK_ENABLED: LazyLock<Arc<AtomicBool>> =
38-
LazyLock::new(|| Arc::new(AtomicBool::new(false)));
103+
LazyLock::new(|| Arc::new(AtomicBool::new(true)));
104+
105+
static RESOURCE_HISTORY: LazyLock<Arc<Mutex<ResourceHistory>>> =
106+
LazyLock::new(|| Arc::new(Mutex::new(ResourceHistory::new(Duration::from_secs(120)))));
39107

40108
/// Spawn a background task to monitor system resources
41109
pub fn spawn_resource_monitor(shutdown_rx: tokio::sync::oneshot::Receiver<()>) {
@@ -48,9 +116,13 @@ pub fn spawn_resource_monitor(shutdown_rx: tokio::sync::oneshot::Receiver<()>) {
48116
let memory_threshold = PARSEABLE.options.memory_utilization_threshold;
49117

50118
info!(
51-
"Resource monitor started with thresholds - CPU: {:.1}%, Memory: {:.1}%",
119+
"Resource monitor started with thresholds - CPU: {:.1}%, Memory: {:.1}% (2-minute rolling average)",
52120
cpu_threshold, memory_threshold
53121
);
122+
123+
// Calculate minimum samples needed for a reliable 2-minute average
124+
let min_samples_for_decision = std::cmp::max(1, 120 / resource_check_interval as usize);
125+
54126
loop {
55127
select! {
56128
_ = check_interval.tick() => {
@@ -65,32 +137,61 @@ pub fn spawn_resource_monitor(shutdown_rx: tokio::sync::oneshot::Receiver<()>) {
65137
(used_memory, total_memory, cpu_usage)
66138
}).await.unwrap();
67139

68-
let mut resource_ok = true;
69-
70140
// Calculate memory usage percentage
71141
let memory_usage = if total_memory > 0.0 {
72142
(used_memory / total_memory) * 100.0
73143
} else {
74144
0.0
75145
};
76146

77-
// Log current resource usage every few checks for debugging
78-
info!("Current resource usage - CPU: {:.1}%, Memory: {:.1}% ({:.1}GB/{:.1}GB)",
79-
cpu_usage, memory_usage,
80-
used_memory / 1024.0 / 1024.0 / 1024.0,
81-
total_memory / 1024.0 / 1024.0 / 1024.0);
147+
// Add current sample to history
148+
{
149+
let mut history = RESOURCE_HISTORY.lock().unwrap();
150+
history.add_sample(cpu_usage, memory_usage);
151+
}
82152

83-
// Check memory utilization
84-
if memory_usage > memory_threshold {
85-
warn!("High memory usage detected: {:.1}% (threshold: {:.1}%)",
86-
memory_usage, memory_threshold);
153+
// Get rolling averages
154+
let (avg_cpu, avg_memory, sample_count) = {
155+
let history = RESOURCE_HISTORY.lock().unwrap();
156+
if let Some((cpu_avg, mem_avg)) = history.get_average() {
157+
(cpu_avg, mem_avg, history.sample_count())
158+
} else {
159+
(cpu_usage, memory_usage, 1) // Fallback to current values if no history
160+
}
161+
};
162+
163+
// Log current and average resource usage
164+
info!(
165+
"Resource usage - Current: CPU {:.1}%, Memory {:.1}% | 2-min avg: CPU {:.1}%, Memory {:.1}% (samples: {})",
166+
cpu_usage, memory_usage, avg_cpu, avg_memory, sample_count
167+
);
168+
169+
// Only make decisions based on rolling average if we have enough samples
170+
let (decision_cpu, decision_memory) = if sample_count >= min_samples_for_decision {
171+
(avg_cpu, avg_memory)
172+
} else {
173+
// Until the window holds enough samples, decide on the current (instantaneous) readings
174+
info!("Still warming up resource history (need {} samples, have {})", min_samples_for_decision, sample_count);
175+
(cpu_usage, memory_usage)
176+
};
177+
178+
let mut resource_ok = true;
179+
180+
// Check memory utilization against rolling average
181+
if decision_memory > memory_threshold {
182+
warn!(
183+
"High memory usage detected: 2-min avg {:.1}% (threshold: {:.1}%, current: {:.1}%)",
184+
decision_memory, memory_threshold, memory_usage
185+
);
87186
resource_ok = false;
88187
}
89188

90-
// Check CPU utilization
91-
if cpu_usage > cpu_threshold {
92-
warn!("High CPU usage detected: {:.1}% (threshold: {:.1}%)",
93-
cpu_usage, cpu_threshold);
189+
// Check CPU utilization against rolling average
190+
if decision_cpu > cpu_threshold {
191+
warn!(
192+
"High CPU usage detected: 2-min avg {:.1}% (threshold: {:.1}%, current: {:.1}%)",
193+
decision_cpu, cpu_threshold, cpu_usage
194+
);
94195
resource_ok = false;
95196
}
96197

@@ -100,9 +201,9 @@ pub fn spawn_resource_monitor(shutdown_rx: tokio::sync::oneshot::Receiver<()>) {
100201
// Log state changes
101202
if previous_state != resource_ok {
102203
if resource_ok {
103-
info!("Resource utilization back to normal - requests will be accepted");
204+
info!("Resource utilization back to normal (2-min avg: CPU {:.1}%, Memory {:.1}%) - requests will be accepted", avg_cpu, avg_memory);
104205
} else {
105-
warn!("Resource utilization too high - requests will be rejected");
206+
warn!("Resource utilization too high (2-min avg: CPU {:.1}%, Memory {:.1}%) - requests will be rejected", avg_cpu, avg_memory);
106207
}
107208
}
108209
},
@@ -116,17 +217,17 @@ pub fn spawn_resource_monitor(shutdown_rx: tokio::sync::oneshot::Receiver<()>) {
116217
}
117218

118219
/// Middleware to check system resource utilization before processing requests
119-
/// Returns 503 Service Unavailable if resources are over-utilized
220+
/// Returns 503 Service Unavailable if resources are over-utilized (based on 2-minute rolling average)
120221
pub async fn check_resource_utilization_middleware(
121222
req: ServiceRequest,
122223
next: Next<impl MessageBody>,
123224
) -> Result<ServiceResponse<impl MessageBody>, Error> {
124225
let resource_ok = RESOURCE_CHECK_ENABLED.load(std::sync::atomic::Ordering::SeqCst);
125226

126227
if !resource_ok {
127-
let error_msg = "Server resources over-utilized";
228+
let error_msg = "Server resources over-utilized (based on 2-minute rolling average)";
128229
warn!(
129-
"Rejecting request to {} due to resource constraints",
230+
"Rejecting request to {} due to resource constraints (2-minute average above threshold)",
130231
req.path()
131232
);
132233
return Err(ErrorServiceUnavailable(error_msg));
@@ -135,3 +236,66 @@ pub async fn check_resource_utilization_middleware(
135236
// Continue processing the request if resource utilization is within limits
136237
next.call(req).await
137238
}
239+
240+
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Two samples inside the window are both retained and averaged.
    #[test]
    fn test_resource_history_basic() {
        let mut history = ResourceHistory::new(Duration::from_secs(60));

        history.add_sample(50.0, 60.0);
        history.add_sample(70.0, 80.0);

        let (avg_cpu, avg_memory) = history.get_average().unwrap();
        assert_eq!(avg_cpu, 60.0); // (50 + 70) / 2
        assert_eq!(avg_memory, 70.0); // (60 + 80) / 2
        assert_eq!(history.sample_count(), 2);
    }

    /// Samples older than the window are evicted when a new one arrives.
    #[test]
    fn test_resource_history_window_cleanup() {
        let mut history = ResourceHistory::new(Duration::from_millis(200));

        // Two samples close together, well inside the window.
        history.add_sample(50.0, 60.0);
        std::thread::sleep(Duration::from_millis(20));
        history.add_sample(70.0, 80.0);
        assert_eq!(history.sample_count(), 2);

        // `sleep` waits at least the requested time, so after this pause both
        // earlier samples are strictly older than the 200 ms window. No sample
        // sits on the window edge, which keeps the outcome exact.
        std::thread::sleep(Duration::from_millis(250));
        history.add_sample(90.0, 100.0);

        // Only the newest sample survives, so the average equals it.
        assert_eq!(history.sample_count(), 1);
        let (avg_cpu, avg_memory) = history.get_average().unwrap();
        assert_eq!(avg_cpu, 90.0);
        assert_eq!(avg_memory, 100.0);
    }

    /// An empty history has no average and a zero count.
    #[test]
    fn test_resource_history_empty() {
        let history = ResourceHistory::new(Duration::from_secs(60));
        assert!(history.get_average().is_none());
        assert_eq!(history.sample_count(), 0);
    }

    /// A single sample is returned unchanged as the average.
    #[test]
    fn test_resource_history_single_sample() {
        let mut history = ResourceHistory::new(Duration::from_secs(60));
        history.add_sample(75.5, 85.3);

        let (avg_cpu, avg_memory) = history.get_average().unwrap();
        assert_eq!(avg_cpu, 75.5);
        assert_eq!(avg_memory, 85.3);
        assert_eq!(history.sample_count(), 1);
    }
}

0 commit comments

Comments
 (0)