From ebe923bad0e449c33d166ff38a3de351030cd55e Mon Sep 17 00:00:00 2001
From: Nikhil Sinha
Date: Tue, 12 Aug 2025 23:20:41 -0700
Subject: [PATCH 01/14] improve object store related metrics

---
 src/metrics/storage.rs       | 131 +++----
 src/storage/azure_blob.rs    | 452 +++++++++++++++++++++++-----
 src/storage/gcs.rs           | 472 +++++++++++++++++++++++------
 src/storage/localfs.rs       | 442 +++++++++++++++++++++++----
 src/storage/metrics_layer.rs | 255 +++++++++++-----
 src/storage/s3.rs            | 564 ++++++++++++++++++++++++++++-------
 6 files changed, 1831 insertions(+), 485 deletions(-)

diff --git a/src/metrics/storage.rs b/src/metrics/storage.rs
index f96a317d9..d483d0ee8 100644
--- a/src/metrics/storage.rs
+++ b/src/metrics/storage.rs
@@ -16,150 +16,109 @@
  *
  */
 
+use crate::metrics::METRICS_NAMESPACE;
 use actix_web_prometheus::PrometheusMetrics;
+use once_cell::sync::Lazy;
+use prometheus::{CounterVec, HistogramOpts, HistogramVec, Opts};
+
+// Global storage metric used by all storage providers
+pub static STORAGE_REQUEST_RESPONSE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    HistogramVec::new(
+        HistogramOpts::new("storage_request_response_time", "Storage Request Latency")
+            .namespace(METRICS_NAMESPACE),
+        &["provider", "method", "status"],
+    )
+    .expect("metric can be created")
+});
+
+// Global storage metric for tracking number of files scanned
+pub static STORAGE_FILES_SCANNED: Lazy<CounterVec> = Lazy::new(|| {
+    CounterVec::new(
+        Opts::new(
+            "storage_files_scanned_total",
+            "Total number of files scanned in storage operations",
+        )
+        .namespace(METRICS_NAMESPACE),
+        &["provider", "operation"],
+    )
+    .expect("metric can be created")
+});
 
 pub trait StorageMetrics {
     fn register_metrics(&self, handler: &PrometheusMetrics);
 }
 
 pub mod localfs {
-    use crate::{metrics::METRICS_NAMESPACE, storage::FSConfig};
-    use once_cell::sync::Lazy;
-    use prometheus::{HistogramOpts, HistogramVec};
-
-    use super::StorageMetrics;
+    use crate::storage::FSConfig;
 
-    pub static REQUEST_RESPONSE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-        HistogramVec::new(
-            HistogramOpts::new("local_fs_response_time", "FileSystem Request Latency")
-                .namespace(METRICS_NAMESPACE),
-            &["method", "status"],
-        )
-        .expect("metric can be created")
-    });
+    use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics};
 
     impl StorageMetrics for FSConfig {
         fn register_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) {
             handler
                 .registry
-                .register(Box::new(REQUEST_RESPONSE_TIME.clone()))
+                .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone()))
+                .expect("metric can be registered");
+            handler
+                .registry
+                .register(Box::new(STORAGE_FILES_SCANNED.clone()))
                 .expect("metric can be registered");
         }
     }
 }
 
 pub mod s3 {
-    use crate::{metrics::METRICS_NAMESPACE, storage::S3Config};
-    use once_cell::sync::Lazy;
-    use prometheus::{HistogramOpts, HistogramVec};
+    use crate::storage::S3Config;
 
-    use super::StorageMetrics;
-
-    pub static REQUEST_RESPONSE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-        HistogramVec::new(
-            HistogramOpts::new("s3_response_time", "S3 Request Latency")
-                .namespace(METRICS_NAMESPACE),
-            &["method", "status"],
-        )
-        .expect("metric can be created")
-    });
-
-    pub static QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-        HistogramVec::new(
-            HistogramOpts::new("query_s3_response_time", "S3 Request Latency")
-                .namespace(METRICS_NAMESPACE),
-            &["method", "status"],
-        )
-        .expect("metric can be created")
-    });
+    use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics};
 
     impl StorageMetrics for S3Config {
         fn register_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) {
             handler
                 .registry
-                .register(Box::new(REQUEST_RESPONSE_TIME.clone()))
+                .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone()))
                 .expect("metric can be registered");
             handler
                 .registry
-                .register(Box::new(QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME.clone()))
+                .register(Box::new(STORAGE_FILES_SCANNED.clone()))
                 .expect("metric can be registered");
         }
     }
 }
 
 pub mod azureblob {
-    use crate::{metrics::METRICS_NAMESPACE, storage::AzureBlobConfig};
-    use once_cell::sync::Lazy;
-    use prometheus::{HistogramOpts, HistogramVec};
-
-    use super::StorageMetrics;
+    use crate::storage::AzureBlobConfig;
 
-    pub static REQUEST_RESPONSE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-        HistogramVec::new(
-            HistogramOpts::new("azr_blob_response_time", "AzureBlob Request Latency")
-                .namespace(METRICS_NAMESPACE),
-            &["method", "status"],
-        )
-        .expect("metric can be created")
-    });
-
-    pub static QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-        HistogramVec::new(
-            HistogramOpts::new("query_azr_blob_response_time", "AzureBlob Request Latency")
-                .namespace(METRICS_NAMESPACE),
-            &["method", "status"],
-        )
-        .expect("metric can be created")
-    });
+    use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics};
 
     impl StorageMetrics for AzureBlobConfig {
         fn register_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) {
             handler
                 .registry
-                .register(Box::new(REQUEST_RESPONSE_TIME.clone()))
+                .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone()))
                 .expect("metric can be registered");
             handler
                 .registry
-                .register(Box::new(QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME.clone()))
+                .register(Box::new(STORAGE_FILES_SCANNED.clone()))
                 .expect("metric can be registered");
         }
     }
 }
 
 pub mod gcs {
-    use crate::{metrics::METRICS_NAMESPACE, storage::GcsConfig};
-    use once_cell::sync::Lazy;
-    use prometheus::{HistogramOpts, HistogramVec};
+    use crate::storage::GcsConfig;
 
-    use super::StorageMetrics;
-
-    pub static REQUEST_RESPONSE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-        HistogramVec::new(
-            HistogramOpts::new("gcs_response_time", "GCS Request Latency")
-                .namespace(METRICS_NAMESPACE),
-            &["method", "status"],
-        )
-        .expect("metric can be created")
-    });
-
-    pub static QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-        HistogramVec::new(
-            HistogramOpts::new("query_gcs_response_time", "GCS Request Latency")
-                .namespace(METRICS_NAMESPACE),
-            &["method", "status"],
-        )
-        .expect("metric can be created")
-    });
+    use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics};
 
     impl StorageMetrics for GcsConfig {
         fn register_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) {
             handler
                 .registry
-                .register(Box::new(REQUEST_RESPONSE_TIME.clone()))
+                .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone()))
                 .expect("metric can be registered");
             handler
                 .registry
-                .register(Box::new(QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME.clone()))
+                .register(Box::new(STORAGE_FILES_SCANNED.clone()))
                 .expect("metric can be registered");
         }
     }
diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs
index acd867361..1b2f7b783 100644
--- a/src/storage/azure_blob.rs
+++ b/src/storage/azure_blob.rs
@@ -46,15 +46,15 @@ use tracing::{error, info};
 use url::Url;
 
 use crate::{
-    metrics::storage::{StorageMetrics, azureblob::REQUEST_RESPONSE_TIME},
+    metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics},
     parseable::LogStream,
 };
 
 use super::{
     CONNECT_TIMEOUT_SECS,
MIN_MULTIPART_UPLOAD_SIZE, ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, REQUEST_TIMEOUT_SECS, - STREAM_METADATA_FILE_NAME, metrics_layer::MetricLayer, object_storage::parseable_json_path, - to_object_store_path, + STREAM_METADATA_FILE_NAME, metrics_layer::MetricLayer, metrics_layer::error_to_status_code, + object_storage::parseable_json_path, to_object_store_path, }; #[derive(Debug, Clone, clap::Args)] @@ -166,7 +166,7 @@ impl ObjectStorageProvider for AzureBlobConfig { let azure = self.get_default_builder().build().unwrap(); // limit objectstore to a concurrent request limit let azure = LimitStore::new(azure, super::MAX_OBJECT_STORE_REQUESTS); - let azure = MetricLayer::new(azure); + let azure = MetricLayer::new(azure, "azure_blob"); let object_store_registry = DefaultObjectStoreRegistry::new(); let url = ObjectStoreUrl::parse(format!("https://{}.blob.core.windows.net", self.account)) @@ -211,21 +211,21 @@ impl BlobStore { async fn _get_object(&self, path: &RelativePath) -> Result { let instant = Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; + let elapsed = instant.elapsed().as_secs_f64(); match resp { Ok(resp) => { - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); let body = resp.bytes().await.unwrap(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "GET", "200"]) + .observe(elapsed); Ok(body) } Err(err) => { - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "400"]) - .observe(time); + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "GET", status_code]) + .observe(elapsed); Err(err.into()) } } @@ -236,36 +236,66 @@ impl BlobStore { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let time = Instant::now(); + let instant = Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let status = if resp.is_ok() { "200" } else { "400" }; - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["PUT", status]) - .observe(time); - - if let Err(object_store::Error::NotFound { source, .. }) = &resp { - return Err(ObjectStorageError::Custom( - format!("Failed to upload, error: {source:?}").to_string(), - )); + let elapsed = instant.elapsed().as_secs_f64(); + match resp { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", "200"]) + .observe(elapsed); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", status_code]) + .observe(elapsed); + + if let object_store::Error::NotFound { source, .. 
} = &err { + return Err(ObjectStorageError::Custom( + format!("Failed to upload, error: {source:?}").to_string(), + )); + } + Err(err.into()) + } } - - resp.map(|_| ()).map_err(|err| err.into()) } async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); object_stream .for_each_concurrent(None, |x| async { match x { Ok(obj) => { - if (self.client.delete(&obj.location).await).is_err() { - error!("Failed to fetch object during delete stream"); + // Track individual DELETE operation + let delete_start = Instant::now(); + match self.client.delete(&obj.location).await { + Ok(_) => { + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "DELETE", "200"]) + .observe(delete_elapsed); + } + Err(err) => { + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "DELETE", status_code]) + .observe(delete_elapsed); + error!("Failed to delete object during delete stream: {:?}", err); + } } } - Err(_) => { - error!("Failed to fetch object during delete stream"); + Err(err) => { + error!("Failed to fetch object during delete stream: {:?}", err); } }; }) @@ -275,10 +305,29 @@ impl BlobStore { } async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let resp = self .client .list_with_delimiter(Some(&(stream.into()))) - .await?; + .await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; let common_prefixes = resp.common_prefixes; @@ -293,31 +342,39 @@ impl BlobStore { } async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - let instant = Instant::now(); - // // TODO: Uncomment this when multipart is fixed // let should_multipart = std::fs::metadata(path)?.len() > MULTIPART_UPLOAD_SIZE as u64; let should_multipart = false; - let res = if should_multipart { + if should_multipart { // self._upload_multipart(key, path).await // this branch will never get executed Ok(()) } else { let bytes = tokio::fs::read(path).await?; - let result = self.client.put(&key.into(), bytes.into()).await?; - info!("Uploaded file to Azure Blob Storage: {:?}", result); - Ok(()) - }; - - let status = if res.is_ok() { "200" } else { "400" }; - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["UPLOAD_PARQUET", status]) - .observe(time); - res + let put_start = Instant::now(); + let result = self.client.put(&key.into(), bytes.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match result { + Ok(result) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", "200"]) + .observe(put_elapsed); + info!("Uploaded file to Azure Blob Storage: {:?}", result); + 
Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", status_code]) + .observe(put_elapsed); + Err(err.into()) + } + } + } } async fn _upload_multipart( @@ -328,14 +385,52 @@ impl BlobStore { let mut file = OpenOptions::new().read(true).open(path).await?; let location = &to_object_store_path(key); - let mut async_writer = self.client.put_multipart(location).await?; + // Track multipart initiation + let multipart_start = Instant::now(); + let async_writer = self.client.put_multipart(location).await; + let multipart_elapsed = multipart_start.elapsed().as_secs_f64(); + + let mut async_writer = match async_writer { + Ok(writer) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT_MULTIPART_INIT", "200"]) + .observe(multipart_elapsed); + writer + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT_MULTIPART_INIT", status_code]) + .observe(multipart_elapsed); + return Err(err.into()); + } + }; let meta = file.metadata().await?; let total_size = meta.len() as usize; if total_size < MIN_MULTIPART_UPLOAD_SIZE { let mut data = Vec::new(); file.read_to_end(&mut data).await?; - self.client.put(location, data.into()).await?; + + // Track single PUT operation for small files + let put_start = Instant::now(); + let result = self.client.put(location, data.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", "200"]) + .observe(put_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", status_code]) + .observe(put_elapsed); + return Err(err.into()); + } + } // async_writer.put_part(data.into()).await?; // async_writer.complete().await?; return Ok(()); @@ -349,7 +444,7 @@ impl BlobStore { let num_full_parts = total_size / MIN_MULTIPART_UPLOAD_SIZE; let total_parts = num_full_parts + if has_final_partial_part { 1 } else { 0 }; - // Upload each part + // Upload each part with metrics for part_number in 0..(total_parts) { let start_pos = part_number * MIN_MULTIPART_UPLOAD_SIZE; let end_pos = if part_number == num_full_parts && has_final_partial_part { @@ -363,15 +458,47 @@ impl BlobStore { // Extract this part's data let part_data = data[start_pos..end_pos].to_vec(); - // Upload the part - async_writer.put_part(part_data.into()).await?; + // Track individual part upload + let part_start = Instant::now(); + let result = async_writer.put_part(part_data.into()).await; + let part_elapsed = part_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT_MULTIPART_PART", "200"]) + .observe(part_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT_MULTIPART_PART", status_code]) + .observe(part_elapsed); + return Err(err.into()); + } + } // upload_parts.push(part_number as u64 + 1); } - if let Err(err) = async_writer.complete().await { + + // Track multipart completion + let complete_start = Instant::now(); + let complete_result = async_writer.complete().await; + let complete_elapsed = complete_start.elapsed().as_secs_f64(); + + if let Err(err) = complete_result { + let status_code = error_to_status_code(&err); + 
STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT_MULTIPART_COMPLETE", status_code]) + .observe(complete_elapsed); error!("Failed to complete multipart upload. {:?}", err); async_writer.abort().await?; - }; + return Err(err.into()); + } else { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT_MULTIPART_COMPLETE", "200"]) + .observe(complete_elapsed); + } } Ok(()) } @@ -441,6 +568,11 @@ impl ObjectStorage for BlobStore { ))) } async fn head(&self, _path: &RelativePath) -> Result { + // Record attempt to access file (even though operation not implemented) + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "HEAD"]) + .inc(); + Err(ObjectStorageError::UnhandledError(Box::new( std::io::Error::new( std::io::ErrorKind::Unsupported, @@ -450,7 +582,14 @@ impl ObjectStorage for BlobStore { } async fn get_object(&self, path: &RelativePath) -> Result { - Ok(self._get_object(path).await?) + let result = self._get_object(path).await?; + + // Record single file accessed + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "GET"]) + .inc(); + + Ok(result) } async fn get_objects( @@ -458,19 +597,39 @@ impl ObjectStorage for BlobStore { base_path: Option<&RelativePath>, filter_func: Box bool + Send>, ) -> Result, ObjectStorageError> { - let instant = Instant::now(); - let prefix = if let Some(base_path) = base_path { to_object_store_path(base_path) } else { self.root.clone() }; + // Track list operation + let list_start = Instant::now(); let mut list_stream = self.client.list(Some(&prefix)); let mut res = vec![]; + let mut files_scanned = 0; + + while let Some(meta_result) = list_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); - while let Some(meta) = list_stream.next().await.transpose()? { + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + files_scanned += 1; let ingestor_file = filter_func(meta.location.filename().unwrap().to_string()); if !ingestor_file { @@ -487,10 +646,10 @@ impl ObjectStorage for BlobStore { res.push(byts); } - let instant = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(instant); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "GET"]) + .inc_by(files_scanned as f64); Ok(res) } @@ -498,11 +657,33 @@ impl ObjectStorage for BlobStore { async fn get_ingestor_meta_file_paths( &self, ) -> Result, ObjectStorageError> { - let time = Instant::now(); let mut path_arr = vec![]; + let mut files_scanned = 0; + + // Track list operation + let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&self.root)); - while let Some(meta) = object_stream.next().await.transpose()? 
{ + while let Some(meta_result) = object_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + files_scanned += 1; let flag = meta.location.filename().unwrap().starts_with("ingestor"); if flag { @@ -510,10 +691,10 @@ impl ObjectStorage for BlobStore { } } - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -527,6 +708,11 @@ impl ObjectStorage for BlobStore { .await .map_err(|err| ObjectStorageError::ConnectionError(Box::new(err)))?; + // Record single file written + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "PUT"]) + .inc(); + Ok(()) } @@ -537,15 +723,54 @@ impl ObjectStorage for BlobStore { } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { - Ok(self.client.delete(&to_object_store_path(path)).await?) + let delete_start = Instant::now(); + let result = self.client.delete(&to_object_store_path(path)).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "DELETE", "200"]) + .observe(delete_elapsed); + // Record single file deleted + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "DELETE"]) + .inc(); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "DELETE", status_code]) + .observe(delete_elapsed); + } + } + + Ok(result?) } async fn check(&self) -> Result<(), ObjectStorageError> { - Ok(self + let head_start = Instant::now(); + let result = self .client .head(&to_object_store_path(&parseable_json_path())) - .await - .map(|_| ())?) + .await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "HEAD", "200"]) + .observe(head_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + Ok(result.map(|_| ())?) 
} async fn delete_stream(&self, stream_name: &str) -> Result<(), ObjectStorageError> { @@ -556,9 +781,24 @@ impl ObjectStorage for BlobStore { async fn try_delete_node_meta(&self, node_filename: String) -> Result<(), ObjectStorageError> { let file = RelativePathBuf::from(&node_filename); - match self.client.delete(&to_object_store_path(&file)).await { - Ok(_) => Ok(()), + + let delete_start = Instant::now(); + let result = self.client.delete(&to_object_store_path(&file)).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "DELETE", "200"]) + .observe(delete_elapsed); + Ok(()) + } Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "DELETE", status_code]) + .observe(delete_elapsed); + // if the object is not found, it is not an error // the given url path was incorrect if matches!(err, object_store::Error::NotFound { .. }) { @@ -580,7 +820,13 @@ impl ObjectStorage for BlobStore { } async fn list_old_streams(&self) -> Result, ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(None).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs @@ -596,7 +842,27 @@ impl ObjectStorage for BlobStore { for dir in &dirs { let key = format!("{dir}/{STREAM_METADATA_FILE_NAME}"); - let task = async move { self.client.head(&StorePath::from(key)).await.map(|_| ()) }; + let task = async move { + let head_start = Instant::now(); + let result = self.client.head(&StorePath::from(key)).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "HEAD", "200"]) + .observe(head_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + result.map(|_| ()) + }; stream_json_check.push(task); } @@ -708,7 +974,26 @@ impl ObjectStorage for BlobStore { async fn list_dirs(&self) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from("/"); - let resp = self.client.list_with_delimiter(Some(&pre)).await?; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&pre)).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; Ok(resp .common_prefixes @@ -723,7 +1008,26 @@ impl ObjectStorage for BlobStore { relative_path: &RelativePath, ) -> Result, ObjectStorageError> { let prefix = object_store::path::Path::from(relative_path.as_str()); - let resp = self.client.list_with_delimiter(Some(&prefix)).await?; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&prefix)).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + 
STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; Ok(resp .common_prefixes diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index 307359b51..b98c1d873 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -24,7 +24,7 @@ use std::{ }; use crate::{ - metrics::storage::{StorageMetrics, gcs::REQUEST_RESPONSE_TIME}, + metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, parseable::LogStream, }; use async_trait::async_trait; @@ -51,8 +51,8 @@ use tracing::{error, info}; use super::{ CONNECT_TIMEOUT_SECS, MIN_MULTIPART_UPLOAD_SIZE, ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, REQUEST_TIMEOUT_SECS, - STREAM_METADATA_FILE_NAME, metrics_layer::MetricLayer, object_storage::parseable_json_path, - to_object_store_path, + STREAM_METADATA_FILE_NAME, metrics_layer::MetricLayer, metrics_layer::error_to_status_code, + object_storage::parseable_json_path, to_object_store_path, }; #[derive(Debug, Clone, clap::Args)] @@ -128,7 +128,7 @@ impl ObjectStorageProvider for GcsConfig { // limit objectstore to a concurrent request limit let gcs = LimitStore::new(gcs, super::MAX_OBJECT_STORE_REQUESTS); - let gcs = MetricLayer::new(gcs); + let gcs = MetricLayer::new(gcs, "gcs"); let object_store_registry = DefaultObjectStoreRegistry::new(); // Register GCS client under the "gs://" scheme so DataFusion can route @@ -174,24 +174,23 @@ pub struct Gcs { impl Gcs { async fn _get_object(&self, path: &RelativePath) -> Result { - let instant = Instant::now(); - + let get_start = Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; + let get_elapsed = get_start.elapsed().as_secs_f64(); match resp { Ok(resp) => { - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); let body = resp.bytes().await.unwrap(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "GET", "200"]) + .observe(get_elapsed); Ok(body) } Err(err) => { - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "400"]) - .observe(time); + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "GET", status_code]) + .observe(get_elapsed); Err(err.into()) } } @@ -202,39 +201,73 @@ impl Gcs { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let time = Instant::now(); + let put_start = Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let status = if resp.is_ok() { "200" } else { "400" }; - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["PUT", status]) - .observe(time); + let put_elapsed = put_start.elapsed().as_secs_f64(); if let Err(object_store::Error::NotFound { source, .. 
}) = &resp { let source_str = source.to_string(); if source_str.contains("NoSuchBucket") { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", "404"]) + .observe(put_elapsed); return Err(ObjectStorageError::Custom( format!("Bucket '{}' does not exist in GCS.", self.bucket).to_string(), )); } } - resp.map(|_| ()).map_err(|err| err.into()) + match resp { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", "200"]) + .observe(put_elapsed); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", status_code]) + .observe(put_elapsed); + Err(err.into()) + } + } } async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); object_stream .for_each_concurrent(None, |x| async { match x { Ok(obj) => { - if (self.client.delete(&obj.location).await).is_err() { - error!("Failed to fetch object during delete stream"); + // Track individual DELETE operation + let delete_start = Instant::now(); + match self.client.delete(&obj.location).await { + Ok(_) => { + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "DELETE", "200"]) + .observe(delete_elapsed); + } + Err(err) => { + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "DELETE", status_code]) + .observe(delete_elapsed); + error!("Failed to delete object during delete stream: {:?}", err); + } } } - Err(_) => { - error!("Failed to fetch object during delete stream"); + Err(err) => { + error!("Failed to fetch object during delete stream: {:?}", err); } }; }) @@ -243,39 +276,30 @@ impl Gcs { Ok(()) } - // async fn _list_streams(&self) -> Result, ObjectStorageError> { - // let mut result_file_list = HashSet::new(); - // let resp = self.client.list_with_delimiter(None).await?; - - // let streams = resp - // .common_prefixes - // .iter() - // .flat_map(|path| path.parts()) - // .map(|name| name.as_ref().to_string()) - // .filter(|name| name != PARSEABLE_ROOT_DIRECTORY && name != USERS_ROOT_DIR) - // .collect::>(); - - // for stream in streams { - // let stream_path = - // object_store::path::Path::from(format!("{}/{}", &stream, STREAM_ROOT_DIRECTORY)); - // let resp = self.client.list_with_delimiter(Some(&stream_path)).await?; - // if resp - // .objects - // .iter() - // .any(|name| name.location.filename().unwrap().ends_with("stream.json")) - // { - // result_file_list.insert(stream); - // } - // } - - // Ok(result_file_list) - // } - async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let resp = self .client .list_with_delimiter(Some(&(stream.into()))) - .await?; + .await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + 
return Err(err.into()); + } + }; let common_prefixes = resp.common_prefixes; @@ -288,19 +312,30 @@ impl Gcs { Ok(dates) } - async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - let instant = Instant::now(); + async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { let bytes = tokio::fs::read(path).await?; - let result = self.client.put(&key.into(), bytes.into()).await?; - info!("Uploaded file to GCS: {:?}", result); - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["UPLOAD_PARQUET", "200"]) - .observe(time); - - Ok(()) + let put_start = Instant::now(); + let result = self.client.put(&key.into(), bytes.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match result { + Ok(result) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", "200"]) + .observe(put_elapsed); + info!("Uploaded file to GCS: {:?}", result); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", status_code]) + .observe(put_elapsed); + Err(err.into()) + } + } } async fn _upload_multipart( @@ -311,14 +346,52 @@ impl Gcs { let mut file = OpenOptions::new().read(true).open(path).await?; let location = &to_object_store_path(key); - let mut async_writer = self.client.put_multipart(location).await?; + // Track multipart initiation + let multipart_start = Instant::now(); + let async_writer = self.client.put_multipart(location).await; + let multipart_elapsed = multipart_start.elapsed().as_secs_f64(); + + let mut async_writer = match async_writer { + Ok(writer) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT_MULTIPART_INIT", "200"]) + .observe(multipart_elapsed); + writer + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT_MULTIPART_INIT", status_code]) + .observe(multipart_elapsed); + return Err(err.into()); + } + }; let meta = file.metadata().await?; let total_size = meta.len() as usize; if total_size < MIN_MULTIPART_UPLOAD_SIZE { let mut data = Vec::new(); file.read_to_end(&mut data).await?; - self.client.put(location, data.into()).await?; + + // Track single PUT operation for small files + let put_start = Instant::now(); + let result = self.client.put(location, data.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", "200"]) + .observe(put_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", status_code]) + .observe(put_elapsed); + return Err(err.into()); + } + } return Ok(()); } else { let mut data = Vec::new(); @@ -328,7 +401,7 @@ impl Gcs { let num_full_parts = total_size / MIN_MULTIPART_UPLOAD_SIZE; let total_parts = num_full_parts + if has_final_partial_part { 1 } else { 0 }; - // Upload each part + // Upload each part with metrics for part_number in 0..(total_parts) { let start_pos = part_number * MIN_MULTIPART_UPLOAD_SIZE; let end_pos = if part_number == num_full_parts && has_final_partial_part { @@ -342,10 +415,37 @@ impl Gcs { // Extract this part's data let part_data = data[start_pos..end_pos].to_vec(); - // Upload the part - async_writer.put_part(part_data.into()).await?; + // Track individual part upload + let part_start = Instant::now(); + let result = 
async_writer.put_part(part_data.into()).await; + let part_elapsed = part_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT_MULTIPART_PART", "200"]) + .observe(part_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT_MULTIPART_PART", status_code]) + .observe(part_elapsed); + return Err(err.into()); + } + } } - if let Err(err) = async_writer.complete().await { + + // Track multipart completion + let complete_start = Instant::now(); + let complete_result = async_writer.complete().await; + let complete_elapsed = complete_start.elapsed().as_secs_f64(); + + if let Err(err) = complete_result { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT_MULTIPART_COMPLETE", status_code]) + .observe(complete_elapsed); if let Err(abort_err) = async_writer.abort().await { error!( "Failed to abort multipart upload after completion failure: {:?}", @@ -353,7 +453,11 @@ impl Gcs { ); } return Err(err.into()); - }; + } else { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT_MULTIPART_COMPLETE", "200"]) + .observe(complete_elapsed); + } } Ok(()) } @@ -366,7 +470,26 @@ impl ObjectStorage for Gcs { path: &RelativePath, ) -> Result { let path = &to_object_store_path(path); - let meta = self.client.head(path).await?; + + let head_start = Instant::now(); + let meta = self.client.head(path).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + let meta = match meta { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", "200"]) + .observe(head_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", status_code]) + .observe(head_elapsed); + return Err(err.into()); + } + }; let store: Arc = self.client.clone(); let buf = object_store::buffered::BufReader::new(store, &meta); @@ -380,11 +503,40 @@ impl ObjectStorage for Gcs { self._upload_multipart(key, path).await } async fn head(&self, path: &RelativePath) -> Result { - Ok(self.client.head(&to_object_store_path(path)).await?) + let head_start = Instant::now(); + let result = self.client.head(&to_object_store_path(path)).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", "200"]) + .observe(head_elapsed); + // Record single file accessed + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "HEAD"]) + .inc(); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + Ok(result?) } async fn get_object(&self, path: &RelativePath) -> Result { - Ok(self._get_object(path).await?) 
+ let result = self._get_object(path).await?; + + // Record single file accessed + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "GET"]) + .inc(); + + Ok(result) } async fn get_objects( @@ -392,19 +544,40 @@ impl ObjectStorage for Gcs { base_path: Option<&RelativePath>, filter_func: Box bool + Send>, ) -> Result, ObjectStorageError> { - let instant = Instant::now(); - let prefix = if let Some(base_path) = base_path { to_object_store_path(base_path) } else { self.root.clone() }; + // Track list operation + let list_start = Instant::now(); let mut list_stream = self.client.list(Some(&prefix)); let mut res = vec![]; + let mut files_scanned = 0; + + // Note: We track each streaming list item retrieval + while let Some(meta_result) = list_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; - while let Some(meta) = list_stream.next().await.transpose()? { + files_scanned += 1; let ingestor_file = filter_func(meta.location.filename().unwrap().to_string()); if !ingestor_file { @@ -421,10 +594,10 @@ impl ObjectStorage for Gcs { res.push(byts); } - let instant = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(instant); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "GET"]) + .inc_by(files_scanned as f64); Ok(res) } @@ -432,11 +605,33 @@ impl ObjectStorage for Gcs { async fn get_ingestor_meta_file_paths( &self, ) -> Result, ObjectStorageError> { - let time = Instant::now(); let mut path_arr = vec![]; + let mut files_scanned = 0; + + // Track list operation + let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&self.root)); - while let Some(meta) = object_stream.next().await.transpose()? { + while let Some(meta_result) = object_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + files_scanned += 1; let flag = meta.location.filename().unwrap().starts_with("ingestor"); if flag { @@ -444,10 +639,10 @@ impl ObjectStorage for Gcs { } } - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -461,6 +656,11 @@ impl ObjectStorage for Gcs { .await .map_err(|err| ObjectStorageError::ConnectionError(Box::new(err)))?; + // Record single file written + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "PUT"]) + .inc(); + Ok(()) } @@ -471,7 +671,14 @@ impl ObjectStorage for Gcs { } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { - Ok(self.client.delete(&to_object_store_path(path)).await?) 
+ let result = self.client.delete(&to_object_store_path(path)).await?; + + // Record single file deleted + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "DELETE"]) + .inc(); + + Ok(result) } async fn check(&self) -> Result<(), ObjectStorageError> { @@ -490,9 +697,24 @@ impl ObjectStorage for Gcs { async fn try_delete_node_meta(&self, node_filename: String) -> Result<(), ObjectStorageError> { let file = RelativePathBuf::from(&node_filename); - match self.client.delete(&to_object_store_path(&file)).await { - Ok(_) => Ok(()), + + let delete_start = Instant::now(); + let result = self.client.delete(&to_object_store_path(&file)).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "DELETE", "200"]) + .observe(delete_elapsed); + Ok(()) + } Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "DELETE", status_code]) + .observe(delete_elapsed); + // if the object is not found, it is not an error // the given url path was incorrect if matches!(err, object_store::Error::NotFound { .. }) { @@ -514,7 +736,13 @@ impl ObjectStorage for Gcs { } async fn list_old_streams(&self) -> Result, ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(None).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs @@ -530,7 +758,27 @@ impl ObjectStorage for Gcs { for dir in &dirs { let key = format!("{dir}/{STREAM_METADATA_FILE_NAME}"); - let task = async move { self.client.head(&StorePath::from(key)).await.map(|_| ()) }; + let task = async move { + let head_start = Instant::now(); + let result = self.client.head(&StorePath::from(key)).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", "200"]) + .observe(head_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + result.map(|_| ()) + }; stream_json_check.push(task); } @@ -629,7 +877,26 @@ impl ObjectStorage for Gcs { async fn list_dirs(&self) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from("/"); - let resp = self.client.list_with_delimiter(Some(&pre)).await?; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&pre)).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; Ok(resp .common_prefixes @@ -644,7 +911,26 @@ impl ObjectStorage for Gcs { relative_path: &RelativePath, ) -> Result, ObjectStorageError> { let prefix = object_store::path::Path::from(relative_path.as_str()); - let resp = self.client.list_with_delimiter(Some(&prefix)).await?; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&prefix)).await; + let 
list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; Ok(resp .common_prefixes diff --git a/src/storage/localfs.rs b/src/storage/localfs.rs index 8157c2b41..5eeb90930 100644 --- a/src/storage/localfs.rs +++ b/src/storage/localfs.rs @@ -38,7 +38,7 @@ use tokio_stream::wrappers::ReadDirStream; use crate::{ handlers::http::users::USERS_ROOT_DIR, - metrics::storage::{StorageMetrics, azureblob::REQUEST_RESPONSE_TIME}, + metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, option::validation, parseable::LogStream, storage::SETTINGS_ROOT_DIRECTORY, @@ -130,6 +130,11 @@ impl ObjectStorage for LocalFS { ))) } async fn head(&self, _path: &RelativePath) -> Result { + // Record attempt to access file (even though operation not implemented) + STORAGE_FILES_SCANNED + .with_label_values(&["localfs", "HEAD"]) + .inc(); + Err(ObjectStorageError::UnhandledError(Box::new( std::io::Error::new( std::io::ErrorKind::Unsupported, @@ -138,8 +143,6 @@ impl ObjectStorage for LocalFS { ))) } async fn get_object(&self, path: &RelativePath) -> Result { - let time = Instant::now(); - let file_path; // this is for the `get_manifest()` function because inside a snapshot, we store the absolute path (without `/`) on linux based OS @@ -163,33 +166,68 @@ impl ObjectStorage for LocalFS { }; } - let res: Result = match fs::read(file_path).await { - Ok(x) => Ok(x.into()), + let get_start = Instant::now(); + let file_result = fs::read(file_path).await; + let get_elapsed = get_start.elapsed().as_secs_f64(); + + let res: Result = match file_result { + Ok(x) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "GET", "200"]) + .observe(get_elapsed); + // Record single file accessed successfully + STORAGE_FILES_SCANNED + .with_label_values(&["localfs", "GET"]) + .inc(); + Ok(x.into()) + } Err(e) => match e.kind() { std::io::ErrorKind::NotFound => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "GET", "404"]) + .observe(get_elapsed); Err(ObjectStorageError::NoSuchKey(path.to_string())) } - _ => Err(ObjectStorageError::UnhandledError(Box::new(e))), + _ => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "GET", "500"]) + .observe(get_elapsed); + Err(ObjectStorageError::UnhandledError(Box::new(e))) + } }, }; - let status = if res.is_ok() { "200" } else { "400" }; - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", status]) - .observe(time); res } async fn get_ingestor_meta_file_paths( &self, ) -> Result, ObjectStorageError> { - let time = Instant::now(); - let mut path_arr = vec![]; - let mut entries = fs::read_dir(&self.root).await?; + let mut files_scanned = 0u64; + + // Track list operation + let list_start = Instant::now(); + let entries_result = fs::read_dir(&self.root).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let mut entries = match entries_result { + Ok(entries) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + entries + } + Err(err) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "404"]) + .observe(list_elapsed); + 
return Err(err.into()); + } + }; while let Some(entry) = entries.next_entry().await? { + files_scanned += 1; let flag = entry .path() .file_name() @@ -206,10 +244,10 @@ impl ObjectStorage for LocalFS { } } - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) // this might not be the right status code - .observe(time); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["localfs", "LIST"]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -220,16 +258,33 @@ impl ObjectStorage for LocalFS { base_path: Option<&RelativePath>, filter_func: Box bool + std::marker::Send + 'static>, ) -> Result, ObjectStorageError> { - let time = Instant::now(); - + let list_start = Instant::now(); let prefix = if let Some(path) = base_path { path.to_path(&self.root) } else { self.root.clone() }; - let mut entries = fs::read_dir(&prefix).await?; + let entries_result = fs::read_dir(&prefix).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let mut entries = match entries_result { + Ok(entries) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + entries + } + Err(err) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "404"]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + let mut res = Vec::new(); + let mut files_scanned = 0; while let Some(entry) = entries.next_entry().await? { let path = entry .path() @@ -240,22 +295,40 @@ impl ObjectStorage for LocalFS { .to_str() .expect("file name is parseable to str") .to_owned(); + + files_scanned += 1; let ingestor_file = filter_func(path); if !ingestor_file { continue; } - let file = fs::read(entry.path()).await?; - res.push(file.into()); + let file_read_start = Instant::now(); + let file_result = fs::read(entry.path()).await; + let file_read_elapsed = file_read_start.elapsed().as_secs_f64(); + + match file_result { + Ok(file) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "GET", "200"]) + .observe(file_read_elapsed); + res.push(file.into()); + } + Err(err) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "GET", "404"]) + .observe(file_read_elapsed); + return Err(err.into()); + } + } } + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["localfs", "GET"]) + .inc_by(files_scanned as f64); + // maybe change the return code - let status = if res.is_empty() { "200" } else { "400" }; - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", status]) - .observe(time); Ok(res) } @@ -265,52 +338,182 @@ impl ObjectStorage for LocalFS { path: &RelativePath, resource: Bytes, ) -> Result<(), ObjectStorageError> { - let time = Instant::now(); - let path = self.path_in_root(path); if let Some(parent) = path.parent() { fs::create_dir_all(parent).await?; } - let res = fs::write(path, resource).await; - let status = if res.is_ok() { "200" } else { "400" }; - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["PUT", status]) - .observe(time); + let put_start = Instant::now(); + let res = fs::write(path, resource).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match &res { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "PUT", "200"]) + .observe(put_elapsed); + // Record single file written successfully + STORAGE_FILES_SCANNED + .with_label_values(&["localfs", "PUT"]) + .inc(); + } + Err(_) => { 
+ STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "PUT", "500"]) + .observe(put_elapsed); + } + } res.map_err(Into::into) } async fn delete_prefix(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { let path = self.path_in_root(path); - tokio::fs::remove_dir_all(path).await?; + + let delete_start = Instant::now(); + let result = tokio::fs::remove_dir_all(path).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", "200"]) + .observe(delete_elapsed); + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", status_code]) + .observe(delete_elapsed); + } + } + + result?; Ok(()) } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { let path = self.path_in_root(path); - tokio::fs::remove_file(path).await?; + + let delete_start = Instant::now(); + let result = tokio::fs::remove_file(path).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", "200"]) + .observe(delete_elapsed); + // Record single file deleted successfully + STORAGE_FILES_SCANNED + .with_label_values(&["localfs", "DELETE"]) + .inc(); + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", status_code]) + .observe(delete_elapsed); + } + } + + result?; Ok(()) } async fn check(&self) -> Result<(), ObjectStorageError> { - fs::create_dir_all(&self.root) - .await - .map_err(|e| ObjectStorageError::UnhandledError(e.into())) + let check_start = Instant::now(); + let result = fs::create_dir_all(&self.root).await; + let check_elapsed = check_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "HEAD", "200"]) + .observe(check_elapsed); + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::PermissionDenied => "403", + std::io::ErrorKind::NotFound => "404", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "HEAD", status_code]) + .observe(check_elapsed); + } + } + + result.map_err(|e| ObjectStorageError::UnhandledError(e.into())) } async fn delete_stream(&self, stream_name: &str) -> Result<(), ObjectStorageError> { let path = self.root.join(stream_name); - Ok(fs::remove_dir_all(path).await?) + + let delete_start = Instant::now(); + let result = fs::remove_dir_all(path).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", "200"]) + .observe(delete_elapsed); + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", status_code]) + .observe(delete_elapsed); + } + } + + Ok(result?) 
} async fn try_delete_node_meta(&self, node_filename: String) -> Result<(), ObjectStorageError> { let path = self.root.join(node_filename); - Ok(fs::remove_file(path).await?) + + let delete_start = Instant::now(); + let result = fs::remove_file(path).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", "200"]) + .observe(delete_elapsed); + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", status_code]) + .observe(delete_elapsed); + } + } + + Ok(result?) } async fn list_streams(&self) -> Result, ObjectStorageError> { + let list_start = Instant::now(); + let ignore_dir = &[ "lost+found", PARSEABLE_ROOT_DIRECTORY, @@ -318,7 +521,30 @@ impl ObjectStorage for LocalFS { ALERTS_ROOT_DIRECTORY, SETTINGS_ROOT_DIRECTORY, ]; - let directories = ReadDirStream::new(fs::read_dir(&self.root).await?); + + let result = fs::read_dir(&self.root).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let directories = match result { + Ok(read_dir) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + ReadDirStream::new(read_dir) + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + let entries: Vec = directories.try_collect().await?; let entries = entries .into_iter() @@ -333,13 +559,38 @@ impl ObjectStorage for LocalFS { } async fn list_old_streams(&self) -> Result, ObjectStorageError> { + let list_start = Instant::now(); + let ignore_dir = &[ "lost+found", PARSEABLE_ROOT_DIRECTORY, ALERTS_ROOT_DIRECTORY, SETTINGS_ROOT_DIRECTORY, ]; - let directories = ReadDirStream::new(fs::read_dir(&self.root).await?); + + let result = fs::read_dir(&self.root).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let directories = match result { + Ok(read_dir) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + ReadDirStream::new(read_dir) + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + let entries: Vec = directories.try_collect().await?; let entries = entries .into_iter() @@ -354,7 +605,31 @@ impl ObjectStorage for LocalFS { } async fn list_dirs(&self) -> Result, ObjectStorageError> { - let dirs = ReadDirStream::new(fs::read_dir(&self.root).await?) 
+ let list_start = Instant::now(); + let result = fs::read_dir(&self.root).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let read_dir = match result { + Ok(read_dir) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + read_dir + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + let dirs = ReadDirStream::new(read_dir) .try_collect::>() .await? .into_iter() @@ -375,7 +650,32 @@ impl ObjectStorage for LocalFS { relative_path: &RelativePath, ) -> Result, ObjectStorageError> { let root = self.root.join(relative_path.as_str()); - let dirs = ReadDirStream::new(fs::read_dir(root).await?) + + let list_start = Instant::now(); + let result = fs::read_dir(root).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let read_dir = match result { + Ok(read_dir) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + read_dir + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + let dirs = ReadDirStream::new(read_dir) .try_collect::>() .await? .into_iter() @@ -393,7 +693,32 @@ impl ObjectStorage for LocalFS { async fn list_dates(&self, stream_name: &str) -> Result, ObjectStorageError> { let path = self.root.join(stream_name); - let directories = ReadDirStream::new(fs::read_dir(&path).await?); + + let list_start = Instant::now(); + let result = fs::read_dir(&path).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let read_dir = match result { + Ok(read_dir) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + read_dir + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + let directories = ReadDirStream::new(read_dir); let entries: Vec = directories.try_collect().await?; let entries = entries.into_iter().map(dir_name); let dates: Vec<_> = FuturesUnordered::from_iter(entries).try_collect().await?; @@ -439,6 +764,7 @@ impl ObjectStorage for LocalFS { } async fn upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { + let upload_start = Instant::now(); let op = CopyOptions { overwrite: true, skip_exist: true, @@ -448,8 +774,24 @@ impl ObjectStorage for LocalFS { if let Some(path) = to_path.parent() { fs::create_dir_all(path).await?; } - let _ = fs_extra::file::copy(path, to_path, &op)?; - Ok(()) + + let result = fs_extra::file::copy(path, to_path, &op); + let upload_elapsed = upload_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "PUT", "200"]) + .observe(upload_elapsed); + Ok(()) + } + Err(err) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "PUT", "500"]) + 
.observe(upload_elapsed); + Err(err.into()) + } + } } fn absolute_url(&self, prefix: &RelativePath) -> object_store::path::Path { diff --git a/src/storage/metrics_layer.rs b/src/storage/metrics_layer.rs index cfaaeb6d2..6de1d9e64 100644 --- a/src/storage/metrics_layer.rs +++ b/src/storage/metrics_layer.rs @@ -34,16 +34,49 @@ use object_store::{ use object_store::{MultipartUpload, PutMultipartOpts, PutPayload} */ -use crate::metrics::storage::s3::QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME; +use crate::metrics::storage::STORAGE_REQUEST_RESPONSE_TIME; + +// Public helper function to map object_store errors to HTTP status codes +pub fn error_to_status_code(err: &object_store::Error) -> &'static str { + match err { + // 400 Bad Request - Client errors + object_store::Error::Generic { .. } => "400", + + // 401 Unauthorized - Authentication required + object_store::Error::Unauthenticated { .. } => "401", + + // 404 Not Found - Resource doesn't exist + object_store::Error::NotFound { .. } => "404", + + // 409 Conflict - Resource already exists + object_store::Error::AlreadyExists { .. } => "409", + + // 412 Precondition Failed - If-Match, If-None-Match, etc. failed + object_store::Error::Precondition { .. } => "412", + + // 304 Not Modified + object_store::Error::NotModified { .. } => "304", + + // 501 Not Implemented - Feature not supported + object_store::Error::NotSupported { .. } => "501", + + // 500 Internal Server Error - All other errors + _ => "500", + } +} #[derive(Debug)] pub struct MetricLayer { inner: T, + provider: String, } impl MetricLayer { - pub fn new(inner: T) -> Self { - Self { inner } + pub fn new(inner: T, provider: &str) -> Self { + Self { + inner, + provider: provider.to_string(), + } } } @@ -62,12 +95,18 @@ impl ObjectStore for MetricLayer { bytes: PutPayload, /* PutPayload */ ) -> ObjectStoreResult { let time = time::Instant::now(); - let put_result = self.inner.put(location, bytes).await?; + let put_result = self.inner.put(location, bytes).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["PUT", "200"]) + + let status = match &put_result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "PUT", status]) .observe(elapsed); - return Ok(put_result); + put_result } async fn put_opts( @@ -77,12 +116,18 @@ impl ObjectStore for MetricLayer { opts: PutOptions, ) -> ObjectStoreResult { let time = time::Instant::now(); - let put_result = self.inner.put_opts(location, payload, opts).await?; + let put_result = self.inner.put_opts(location, payload, opts).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["PUT_OPTS", "200"]) + + let status = match &put_result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "PUT_OPTS", status]) .observe(elapsed); - return Ok(put_result); + put_result } // // ! 
removed in object_store 0.10.0 @@ -94,7 +139,7 @@ impl ObjectStore for MetricLayer { // let time = time::Instant::now(); // let elapsed = time.elapsed().as_secs_f64(); // self.inner.abort_multipart(location, multipart_id).await?; - // QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME + // STORAGE_REQUEST_RESPONSE_TIME // .with_label_values(&["PUT_MULTIPART_ABORT", "200"]) // .observe(elapsed); // Ok(()) @@ -107,56 +152,84 @@ impl ObjectStore for MetricLayer { opts: PutMultipartOpts, ) -> ObjectStoreResult> { let time = time::Instant::now(); - let multipart_upload = self.inner.put_multipart_opts(location, opts).await?; + let result = self.inner.put_multipart_opts(location, opts).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["PUT_MULTIPART_OPTS", "200"]) - .observe(elapsed); - Ok(multipart_upload) + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "PUT_MULTIPART_OPTS", status]) + .observe(elapsed); + result } // todo completly tracking multipart upload async fn put_multipart(&self, location: &Path) -> ObjectStoreResult> /* ObjectStoreResult<(MultipartId, Box)> */ { let time = time::Instant::now(); - let multipart_upload = self.inner.put_multipart(location).await?; + let result = self.inner.put_multipart(location).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["PUT_MULTIPART", "200"]) - .observe(elapsed); - Ok(multipart_upload) + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "PUT_MULTIPART", status]) + .observe(elapsed); + result } async fn get(&self, location: &Path) -> ObjectStoreResult { let time = time::Instant::now(); - let res = self.inner.get(location).await?; + let get_result = self.inner.get(location).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) + + let status = match &get_result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "GET", status]) .observe(elapsed); - Ok(res) + get_result } async fn get_opts(&self, location: &Path, options: GetOptions) -> ObjectStoreResult { let time = time::Instant::now(); - let res = self.inner.get_opts(location, options).await?; + let result = self.inner.get_opts(location, options).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["GET_OPTS", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "GET_OPTS", status]) .observe(elapsed); - Ok(res) + result } async fn get_range(&self, location: &Path, range: Range) -> ObjectStoreResult { let time = time::Instant::now(); - let res = self.inner.get_range(location, range).await?; + let result = self.inner.get_range(location, range).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["GET_RANGE", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "GET_RANGE", status]) .observe(elapsed); - 
Ok(res) + result } async fn get_ranges( @@ -165,32 +238,50 @@ impl ObjectStore for MetricLayer { ranges: &[Range], ) -> ObjectStoreResult> { let time = time::Instant::now(); - let res = self.inner.get_ranges(location, ranges).await?; + let result = self.inner.get_ranges(location, ranges).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["GET_RANGES", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "GET_RANGES", status]) .observe(elapsed); - Ok(res) + result } async fn head(&self, location: &Path) -> ObjectStoreResult { let time = time::Instant::now(); - let res = self.inner.head(location).await?; + let result = self.inner.head(location).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["HEAD", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "HEAD", status]) .observe(elapsed); - Ok(res) + result } async fn delete(&self, location: &Path) -> ObjectStoreResult<()> { let time = time::Instant::now(); - let res = self.inner.delete(location).await?; + let result = self.inner.delete(location).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["DELETE", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "DELETE", status]) .observe(elapsed); - Ok(res) + result } fn delete_stream<'a>( @@ -229,52 +320,82 @@ impl ObjectStore for MetricLayer { async fn list_with_delimiter(&self, prefix: Option<&Path>) -> ObjectStoreResult { let time = time::Instant::now(); - let res = self.inner.list_with_delimiter(prefix).await?; + let result = self.inner.list_with_delimiter(prefix).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["LIST_DELIM", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "LIST_DELIM", status]) .observe(elapsed); - Ok(res) + result } async fn copy(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { let time = time::Instant::now(); - let res = self.inner.copy(from, to).await?; + let result = self.inner.copy(from, to).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["COPY", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "COPY", status]) .observe(elapsed); - Ok(res) + result } async fn rename(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { let time = time::Instant::now(); - let res = self.inner.rename(from, to).await?; + let result = self.inner.rename(from, to).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["RENAME", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "RENAME", status]) .observe(elapsed); - Ok(res) + result 
} async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { let time = time::Instant::now(); - let res = self.inner.copy_if_not_exists(from, to).await?; + let result = self.inner.copy_if_not_exists(from, to).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["COPY_IF", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "COPY_IF", status]) .observe(elapsed); - Ok(res) + result } async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { let time = time::Instant::now(); - let res = self.inner.rename_if_not_exists(from, to).await?; + let result = self.inner.rename_if_not_exists(from, to).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["RENAME_IF", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "RENAME_IF", status]) .observe(elapsed); - Ok(res) + result } } @@ -293,7 +414,7 @@ impl Stream for StreamMetricWrapper<'_, N, T> { ) -> Poll> { match self.inner.poll_next_unpin(cx) { t @ Poll::Ready(None) => { - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME + STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&self.labels) .observe(self.time.elapsed().as_secs_f64()); t diff --git a/src/storage/s3.rs b/src/storage/s3.rs index bc8d57a63..3aeb076f5 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -21,7 +21,10 @@ use std::{ fmt::Display, path::Path, str::FromStr, - sync::Arc, + sync::{ + Arc, + atomic::{AtomicU64, Ordering}, + }, time::{Duration, Instant}, }; @@ -47,15 +50,15 @@ use tokio::{fs::OpenOptions, io::AsyncReadExt}; use tracing::{error, info}; use crate::{ - metrics::storage::{StorageMetrics, azureblob::REQUEST_RESPONSE_TIME}, + metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, parseable::LogStream, }; use super::{ CONNECT_TIMEOUT_SECS, MIN_MULTIPART_UPLOAD_SIZE, ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, REQUEST_TIMEOUT_SECS, - STREAM_METADATA_FILE_NAME, metrics_layer::MetricLayer, object_storage::parseable_json_path, - to_object_store_path, + STREAM_METADATA_FILE_NAME, metrics_layer::MetricLayer, metrics_layer::error_to_status_code, + object_storage::parseable_json_path, to_object_store_path, }; // in bytes @@ -299,7 +302,7 @@ impl ObjectStorageProvider for S3Config { // limit objectstore to a concurrent request limit let s3 = LimitStore::new(s3, super::MAX_OBJECT_STORE_REQUESTS); - let s3 = MetricLayer::new(s3); + let s3 = MetricLayer::new(s3, "s3"); let object_store_registry = DefaultObjectStoreRegistry::new(); let url = ObjectStoreUrl::parse(format!("s3://{}", &self.bucket_name)).unwrap(); @@ -336,24 +339,24 @@ pub struct S3 { impl S3 { async fn _get_object(&self, path: &RelativePath) -> Result { - let instant = Instant::now(); - + let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; + let elapsed = time.elapsed().as_secs_f64(); + match resp { Ok(resp) => { - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); let body = resp.bytes().await?; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "GET", "200"]) + .observe(elapsed); Ok(body) } 
Err(err) => { - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "400"]) - .observe(time); + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "GET", status_code]) + .observe(elapsed); Err(err.into()) } } @@ -364,55 +367,127 @@ impl S3 { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let time = Instant::now(); + let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let status = if resp.is_ok() { "200" } else { "400" }; - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["PUT", status]) - .observe(time); - - if let Err(object_store::Error::NotFound { source, .. }) = &resp { - let source_str = source.to_string(); - if source_str.contains("NoSuchBucket") { - return Err(ObjectStorageError::Custom( - format!("Bucket '{}' does not exist in S3.", self.bucket).to_string(), - )); + + let elapsed = time.elapsed().as_secs_f64(); + + match resp { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", "200"]) + .observe(elapsed); + Ok(()) + } + Err(err) => { + let status_code = match &err { + object_store::Error::NotFound { .. } => { + // Check for specific S3 bucket not found error + let source_str = err.to_string(); + if source_str.contains("NoSuchBucket") { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", "404"]) + .observe(elapsed); + return Err(ObjectStorageError::Custom( + format!("Bucket '{}' does not exist in S3.", self.bucket) + .to_string(), + )); + } + "404" + } + _ => error_to_status_code(&err), + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", status_code]) + .observe(elapsed); + Err(err.into()) } } - - resp.map(|_| ()).map_err(|err| err.into()) } async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { + let files_scanned = Arc::new(AtomicU64::new(0)); + + // Track LIST operation + let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); + let files_scanned_clone = files_scanned.clone(); object_stream .for_each_concurrent(None, |x| async { match x { Ok(obj) => { - if (self.client.delete(&obj.location).await).is_err() { - error!("Failed to fetch object during delete stream"); + files_scanned_clone.fetch_add(1, Ordering::Relaxed); + // Track individual DELETE operation + let delete_start = Instant::now(); + match self.client.delete(&obj.location).await { + Ok(_) => { + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "DELETE", "200"]) + .observe(delete_elapsed); + } + Err(err) => { + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "DELETE", status_code]) + .observe(delete_elapsed); + error!("Failed to delete object during delete stream: {:?}", err); + } } } - Err(_) => { - error!("Failed to fetch object during delete stream"); + Err(err) => { + error!("Failed to fetch object during delete stream: {:?}", err); } }; }) .await; + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(files_scanned.load(Ordering::Relaxed) as f64); + Ok(()) } async 
fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let resp = self .client .list_with_delimiter(Some(&(stream.into()))) - .await?; + .await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; let common_prefixes = resp.common_prefixes; + // Record files scanned (prefixes/directories count as files scanned) + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(common_prefixes.len() as f64); + // return prefixes at the root level let dates: Vec<_> = common_prefixes .iter() @@ -423,66 +498,40 @@ impl S3 { Ok(dates) } - // async fn _list_manifest_files( - // &self, - // stream: &str, - // ) -> Result>, ObjectStorageError> { - // let mut result_file_list: BTreeMap> = BTreeMap::new(); - // let resp = self - // .client - // .list_with_delimiter(Some(&(stream.into()))) - // .await?; - // warn!(resp=?resp); - // let dates = resp - // .common_prefixes - // .iter() - // .flat_map(|path| path.parts()) - // .filter(|name| name.as_ref() != stream && name.as_ref() != STREAM_ROOT_DIRECTORY) - // .map(|name| name.as_ref().to_string()) - // .collect::>(); - // warn!(dates=?dates); - - // for date in dates { - // let date_path = object_store::path::Path::from(format!("{}/{}", stream, &date)); - // let resp = self.client.list_with_delimiter(Some(&date_path)).await?; - // warn!(date_path=?resp); - // let manifests: Vec = resp - // .objects - // .iter() - // .filter(|name| name.location.filename().unwrap().ends_with("manifest.json")) - // .map(|name| name.location.to_string()) - // .collect(); - // result_file_list.entry(date).or_default().extend(manifests); - // } - // Ok(result_file_list) - // } - async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - let instant = Instant::now(); - // // TODO: Uncomment this when multipart is fixed // let should_multipart = std::fs::metadata(path)?.len() > MULTIPART_UPLOAD_SIZE as u64; let should_multipart = false; - let res = if should_multipart { + if should_multipart { // self._upload_multipart(key, path).await // this branch will never get executed Ok(()) } else { let bytes = tokio::fs::read(path).await?; - let result = self.client.put(&key.into(), bytes.into()).await?; - info!("Uploaded file to S3: {:?}", result); - Ok(()) - }; - - let status = if res.is_ok() { "200" } else { "400" }; - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["UPLOAD_PARQUET", status]) - .observe(time); - res + let put_start = Instant::now(); + let result = self.client.put(&key.into(), bytes.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match result { + Ok(result) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", "200"]) + .observe(put_elapsed); + info!("Uploaded file to S3: {:?}", result); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", status_code]) + .observe(put_elapsed); + Err(err.into()) + } + } + } } async fn _upload_multipart( @@ -493,14 +542,53 @@ impl S3 { let mut file = 
OpenOptions::new().read(true).open(path).await?; let location = &to_object_store_path(key); - let mut async_writer = self.client.put_multipart(location).await?; + // Track multipart initiation + let multipart_start = Instant::now(); + let async_writer = self.client.put_multipart(location).await; + let multipart_elapsed = multipart_start.elapsed().as_secs_f64(); + + let mut async_writer = match async_writer { + Ok(writer) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT_MULTIPART_INIT", "200"]) + .observe(multipart_elapsed); + writer + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT_MULTIPART_INIT", status_code]) + .observe(multipart_elapsed); + return Err(err.into()); + } + }; let meta = file.metadata().await?; let total_size = meta.len() as usize; if total_size < MIN_MULTIPART_UPLOAD_SIZE { let mut data = Vec::new(); file.read_to_end(&mut data).await?; - self.client.put(location, data.into()).await?; + + // Track single PUT operation for small files + let put_start = Instant::now(); + let result = self.client.put(location, data.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", "200"]) + .observe(put_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", status_code]) + .observe(put_elapsed); + return Err(err.into()); + } + } + // async_writer.put_part(data.into()).await?; // async_writer.complete().await?; return Ok(()); @@ -514,7 +602,7 @@ impl S3 { let num_full_parts = total_size / MIN_MULTIPART_UPLOAD_SIZE; let total_parts = num_full_parts + if has_final_partial_part { 1 } else { 0 }; - // Upload each part + // Upload each part with metrics for part_number in 0..(total_parts) { let start_pos = part_number * MIN_MULTIPART_UPLOAD_SIZE; let end_pos = if part_number == num_full_parts && has_final_partial_part { @@ -528,15 +616,47 @@ impl S3 { // Extract this part's data let part_data = data[start_pos..end_pos].to_vec(); - // Upload the part - async_writer.put_part(part_data.into()).await?; + // Track individual part upload + let part_start = Instant::now(); + let result = async_writer.put_part(part_data.into()).await; + let part_elapsed = part_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT_MULTIPART_PART", "200"]) + .observe(part_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT_MULTIPART_PART", status_code]) + .observe(part_elapsed); + return Err(err.into()); + } + } // upload_parts.push(part_number as u64 + 1); } - if let Err(err) = async_writer.complete().await { + + // Track multipart completion + let complete_start = Instant::now(); + let complete_result = async_writer.complete().await; + let complete_elapsed = complete_start.elapsed().as_secs_f64(); + + if let Err(err) = complete_result { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT_MULTIPART_COMPLETE", status_code]) + .observe(complete_elapsed); error!("Failed to complete multipart upload. 
{:?}", err); async_writer.abort().await?; - }; + return Err(err.into()); + } else { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT_MULTIPART_COMPLETE", "200"]) + .observe(complete_elapsed); + } } Ok(()) } @@ -549,7 +669,26 @@ impl ObjectStorage for S3 { path: &RelativePath, ) -> Result { let path = &to_object_store_path(path); - let meta = self.client.head(path).await?; + + let head_start = Instant::now(); + let meta = self.client.head(path).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + let meta = match meta { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", "200"]) + .observe(head_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", status_code]) + .observe(head_elapsed); + return Err(err.into()); + } + }; let store: Arc = Arc::new(self.client.clone()); let buf = object_store::buffered::BufReader::new(store, &meta); @@ -563,11 +702,40 @@ impl ObjectStorage for S3 { self._upload_multipart(key, path).await } async fn head(&self, path: &RelativePath) -> Result { - Ok(self.client.head(&to_object_store_path(path)).await?) + let head_start = Instant::now(); + let result = self.client.head(&to_object_store_path(path)).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", "200"]) + .observe(head_elapsed); + // Record single file accessed + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "HEAD"]) + .inc(); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + Ok(result?) } async fn get_object(&self, path: &RelativePath) -> Result { - Ok(self._get_object(path).await?) + let result = self._get_object(path).await?; + + // Record single file accessed + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "GET"]) + .inc(); + + Ok(result) } async fn get_objects( @@ -575,19 +743,40 @@ impl ObjectStorage for S3 { base_path: Option<&RelativePath>, filter_func: Box bool + Send>, ) -> Result, ObjectStorageError> { - let instant = Instant::now(); - let prefix = if let Some(base_path) = base_path { to_object_store_path(base_path) } else { self.root.clone() }; + // Track list operation + let list_start = Instant::now(); let mut list_stream = self.client.list(Some(&prefix)); let mut res = vec![]; + let mut files_scanned = 0; + + // Note: We track each streaming list item retrieval + while let Some(meta_result) = list_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; - while let Some(meta) = list_stream.next().await.transpose()? 
{ + files_scanned += 1; let ingestor_file = filter_func(meta.location.filename().unwrap().to_string()); if !ingestor_file { @@ -604,10 +793,10 @@ impl ObjectStorage for S3 { res.push(byts); } - let instant = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(instant); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "GET"]) + .inc_by(files_scanned as f64); Ok(res) } @@ -615,11 +804,33 @@ impl ObjectStorage for S3 { async fn get_ingestor_meta_file_paths( &self, ) -> Result, ObjectStorageError> { - let time = Instant::now(); let mut path_arr = vec![]; + let mut files_scanned = 0; + + // Track list operation + let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&self.root)); - while let Some(meta) = object_stream.next().await.transpose()? { + while let Some(meta_result) = object_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + files_scanned += 1; let flag = meta.location.filename().unwrap().starts_with("ingestor"); if flag { @@ -627,10 +838,10 @@ impl ObjectStorage for S3 { } } - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -644,6 +855,11 @@ impl ObjectStorage for S3 { .await .map_err(|err| ObjectStorageError::ConnectionError(Box::new(err)))?; + // Record single file written + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "PUT"]) + .inc(); + Ok(()) } @@ -654,15 +870,54 @@ impl ObjectStorage for S3 { } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { - Ok(self.client.delete(&to_object_store_path(path)).await?) + let delete_start = Instant::now(); + let result = self.client.delete(&to_object_store_path(path)).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "DELETE", "200"]) + .observe(delete_elapsed); + // Record single file deleted + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "DELETE"]) + .inc(); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "DELETE", status_code]) + .observe(delete_elapsed); + } + } + + Ok(result?) } async fn check(&self) -> Result<(), ObjectStorageError> { - Ok(self + let head_start = Instant::now(); + let result = self .client .head(&to_object_store_path(&parseable_json_path())) - .await - .map(|_| ())?) + .await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", "200"]) + .observe(head_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + Ok(result.map(|_| ())?) 
} async fn delete_stream(&self, stream_name: &str) -> Result<(), ObjectStorageError> { @@ -673,9 +928,24 @@ impl ObjectStorage for S3 { async fn try_delete_node_meta(&self, node_filename: String) -> Result<(), ObjectStorageError> { let file = RelativePathBuf::from(&node_filename); - match self.client.delete(&to_object_store_path(&file)).await { - Ok(_) => Ok(()), + + let delete_start = Instant::now(); + let result = self.client.delete(&to_object_store_path(&file)).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "DELETE", "200"]) + .observe(delete_elapsed); + Ok(()) + } Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "DELETE", status_code]) + .observe(delete_elapsed); + // if the object is not found, it is not an error // the given url path was incorrect if matches!(err, object_store::Error::NotFound { .. }) { @@ -697,7 +967,13 @@ impl ObjectStorage for S3 { } async fn list_old_streams(&self) -> Result, ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(None).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs @@ -713,7 +989,27 @@ impl ObjectStorage for S3 { for dir in &dirs { let key = format!("{dir}/{STREAM_METADATA_FILE_NAME}"); - let task = async move { self.client.head(&StorePath::from(key)).await.map(|_| ()) }; + let task = async move { + let head_start = Instant::now(); + let result = self.client.head(&StorePath::from(key)).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", "200"]) + .observe(head_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + result.map(|_| ()) + }; stream_json_check.push(task); } @@ -821,7 +1117,26 @@ impl ObjectStorage for S3 { async fn list_dirs(&self) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from("/"); - let resp = self.client.list_with_delimiter(Some(&pre)).await?; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&pre)).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; Ok(resp .common_prefixes @@ -836,7 +1151,26 @@ impl ObjectStorage for S3 { relative_path: &RelativePath, ) -> Result, ObjectStorageError> { let prefix = object_store::path::Path::from(relative_path.as_str()); - let resp = self.client.list_with_delimiter(Some(&prefix)).await?; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&prefix)).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + 
.observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; Ok(resp .common_prefixes From 255912a3da2aff8ffd2204ff10f55eb27e687a04 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Sun, 24 Aug 2025 10:19:51 -0700 Subject: [PATCH 02/14] improve metrics collection --- src/catalog/mod.rs | 6 +- src/metadata.rs | 17 +- src/metrics/mod.rs | 95 ++++++++--- src/query/listing_table_builder.rs | 80 +++------ src/query/stream_schema_provider.rs | 58 +++---- src/stats.rs | 11 +- src/storage/azure_blob.rs | 245 ++++++++++++++-------------- src/storage/gcs.rs | 223 ++++++++++++++----------- src/storage/object_storage.rs | 12 +- src/storage/s3.rs | 207 ++++++++++------------- 10 files changed, 491 insertions(+), 463 deletions(-) diff --git a/src/catalog/mod.rs b/src/catalog/mod.rs index 5c8c411a2..457a48702 100644 --- a/src/catalog/mod.rs +++ b/src/catalog/mod.rs @@ -193,17 +193,17 @@ fn extract_partition_metrics(stream_name: &str, partition_lower: DateTime) let events_ingested = EVENTS_INGESTED_DATE .get_metric_with_label_values(&event_labels) - .map(|metric| metric.get() as u64) + .map(|metric| metric.get()) .unwrap_or(0); let ingestion_size = EVENTS_INGESTED_SIZE_DATE .get_metric_with_label_values(&event_labels) - .map(|metric| metric.get() as u64) + .map(|metric| metric.get()) .unwrap_or(0); let storage_size = EVENTS_STORAGE_SIZE_DATE .get_metric_with_label_values(&storage_labels) - .map(|metric| metric.get() as u64) + .map(|metric| metric.get()) .unwrap_or(0); (events_ingested, ingestion_size, storage_size) diff --git a/src/metadata.rs b/src/metadata.rs index 1e7061bfb..34b5880b4 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -29,6 +29,7 @@ use crate::handlers::TelemetryType; use crate::metrics::{ EVENTS_INGESTED, EVENTS_INGESTED_DATE, EVENTS_INGESTED_SIZE, EVENTS_INGESTED_SIZE_DATE, EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_INGESTED, LIFETIME_EVENTS_INGESTED_SIZE, + TOTAL_EVENTS_INGESTED_DATE, TOTAL_EVENTS_INGESTED_SIZE_DATE, }; use crate::storage::StreamType; use crate::storage::retention::Retention; @@ -46,19 +47,25 @@ pub fn update_stats( .add(num_rows as i64); EVENTS_INGESTED_DATE .with_label_values(&[stream_name, origin, &parsed_date]) - .add(num_rows as i64); + .inc_by(num_rows as u64); EVENTS_INGESTED_SIZE .with_label_values(&[stream_name, origin]) .add(size as i64); EVENTS_INGESTED_SIZE_DATE .with_label_values(&[stream_name, origin, &parsed_date]) - .add(size as i64); + .inc_by(size); LIFETIME_EVENTS_INGESTED .with_label_values(&[stream_name, origin]) .add(num_rows as i64); LIFETIME_EVENTS_INGESTED_SIZE .with_label_values(&[stream_name, origin]) .add(size as i64); + TOTAL_EVENTS_INGESTED_DATE + .with_label_values(&[origin, &parsed_date]) + .inc_by(num_rows as u64); + TOTAL_EVENTS_INGESTED_SIZE_DATE + .with_label_values(&[origin, &parsed_date]) + .inc_by(size); } /// In order to support backward compatability with streams created before v1.6.4, @@ -173,12 +180,12 @@ pub fn load_daily_metrics(manifests: &Vec, stream_name: &str) { let storage_size = manifest.storage_size; EVENTS_INGESTED_DATE .with_label_values(&[stream_name, "json", &manifest_date]) - .set(events_ingested as i64); + .inc_by(events_ingested); EVENTS_INGESTED_SIZE_DATE .with_label_values(&[stream_name, "json", &manifest_date]) - .set(ingestion_size as i64); + .inc_by(ingestion_size); EVENTS_STORAGE_SIZE_DATE .with_label_values(&["data", 
stream_name, "parquet", &manifest_date]) - .set(storage_size as i64); + .inc_by(storage_size); } } diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index 1896bce0c..81809eefd 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -30,7 +30,7 @@ pub const METRICS_NAMESPACE: &str = env!("CARGO_PKG_NAME"); pub static EVENTS_INGESTED: Lazy = Lazy::new(|| { IntGaugeVec::new( - Opts::new("events_ingested", "Events ingested").namespace(METRICS_NAMESPACE), + Opts::new("events_ingested", "Events ingested for a stream").namespace(METRICS_NAMESPACE), &["stream", "format"], ) .expect("metric can be created") @@ -38,8 +38,11 @@ pub static EVENTS_INGESTED: Lazy = Lazy::new(|| { pub static EVENTS_INGESTED_SIZE: Lazy = Lazy::new(|| { IntGaugeVec::new( - Opts::new("events_ingested_size", "Events ingested size bytes") - .namespace(METRICS_NAMESPACE), + Opts::new( + "events_ingested_size", + "Events ingested size bytes for a stream", + ) + .namespace(METRICS_NAMESPACE), &["stream", "format"], ) .expect("metric can be created") @@ -47,7 +50,7 @@ pub static EVENTS_INGESTED_SIZE: Lazy = Lazy::new(|| { pub static STORAGE_SIZE: Lazy = Lazy::new(|| { IntGaugeVec::new( - Opts::new("storage_size", "Storage size bytes").namespace(METRICS_NAMESPACE), + Opts::new("storage_size", "Storage size bytes for a stream").namespace(METRICS_NAMESPACE), &["type", "stream", "format"], ) .expect("metric can be created") @@ -55,7 +58,7 @@ pub static STORAGE_SIZE: Lazy = Lazy::new(|| { pub static EVENTS_DELETED: Lazy = Lazy::new(|| { IntGaugeVec::new( - Opts::new("events_deleted", "Events deleted").namespace(METRICS_NAMESPACE), + Opts::new("events_deleted", "Events deleted for a stream").namespace(METRICS_NAMESPACE), &["stream", "format"], ) .expect("metric can be created") @@ -63,7 +66,11 @@ pub static EVENTS_DELETED: Lazy = Lazy::new(|| { pub static EVENTS_DELETED_SIZE: Lazy = Lazy::new(|| { IntGaugeVec::new( - Opts::new("events_deleted_size", "Events deleted size bytes").namespace(METRICS_NAMESPACE), + Opts::new( + "events_deleted_size", + "Events deleted size bytes for a stream", + ) + .namespace(METRICS_NAMESPACE), &["stream", "format"], ) .expect("metric can be created") @@ -73,7 +80,7 @@ pub static DELETED_EVENTS_STORAGE_SIZE: Lazy = Lazy::new(|| { IntGaugeVec::new( Opts::new( "deleted_events_storage_size", - "Deleted events storage size bytes", + "Deleted events storage size bytes for a stream", ) .namespace(METRICS_NAMESPACE), &["type", "stream", "format"], @@ -83,8 +90,11 @@ pub static DELETED_EVENTS_STORAGE_SIZE: Lazy = Lazy::new(|| { pub static LIFETIME_EVENTS_INGESTED: Lazy = Lazy::new(|| { IntGaugeVec::new( - Opts::new("lifetime_events_ingested", "Lifetime events ingested") - .namespace(METRICS_NAMESPACE), + Opts::new( + "lifetime_events_ingested", + "Lifetime events ingested for a stream", + ) + .namespace(METRICS_NAMESPACE), &["stream", "format"], ) .expect("metric can be created") @@ -94,7 +104,7 @@ pub static LIFETIME_EVENTS_INGESTED_SIZE: Lazy = Lazy::new(|| { IntGaugeVec::new( Opts::new( "lifetime_events_ingested_size", - "Lifetime events ingested size bytes", + "Lifetime events ingested size bytes for a stream", ) .namespace(METRICS_NAMESPACE), &["stream", "format"], @@ -106,7 +116,7 @@ pub static LIFETIME_EVENTS_STORAGE_SIZE: Lazy = Lazy::new(|| { IntGaugeVec::new( Opts::new( "lifetime_events_storage_size", - "Lifetime events storage size bytes", + "Lifetime events storage size bytes for a stream", ) .namespace(METRICS_NAMESPACE), &["type", "stream", "format"], @@ -114,11 +124,11 @@ pub static 
LIFETIME_EVENTS_STORAGE_SIZE: Lazy = Lazy::new(|| { .expect("metric can be created") }); -pub static EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( +pub static EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( Opts::new( "events_ingested_date", - "Events ingested on a particular date", + "Events ingested for a stream on a particular date", ) .namespace(METRICS_NAMESPACE), &["stream", "format", "date"], @@ -126,11 +136,11 @@ pub static EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| { .expect("metric can be created") }); -pub static EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( +pub static EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( Opts::new( "events_ingested_size_date", - "Events ingested size in bytes on a particular date", + "Events ingested size in bytes for a stream on a particular date", ) .namespace(METRICS_NAMESPACE), &["stream", "format", "date"], @@ -138,11 +148,11 @@ pub static EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| { .expect("metric can be created") }); -pub static EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( +pub static EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( Opts::new( "events_storage_size_date", - "Events storage size in bytes on a particular date", + "Events storage size in bytes for a stream on a particular date", ) .namespace(METRICS_NAMESPACE), &["type", "stream", "format", "date"], @@ -150,6 +160,42 @@ pub static EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { .expect("metric can be created") }); +pub static TOTAL_EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_events_ingested_date", + "total events ingested on a particular date", + ) + .namespace(METRICS_NAMESPACE), + &["format", "date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_events_ingested_size_date", + "Total events ingested size in bytes on a particular date", + ) + .namespace(METRICS_NAMESPACE), + &["format", "date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_events_storage_size_date", + "Total events storage size in bytes on a particular date", + ) + .namespace(METRICS_NAMESPACE), + &["format", "date"], + ) + .expect("metric can be created") +}); + pub static STAGING_FILES: Lazy = Lazy::new(|| { IntGaugeVec::new( Opts::new("staging_files", "Active Staging files").namespace(METRICS_NAMESPACE), @@ -219,6 +265,15 @@ fn custom_metrics(registry: &Registry) { registry .register(Box::new(EVENTS_STORAGE_SIZE_DATE.clone())) .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_EVENTS_INGESTED_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_EVENTS_INGESTED_SIZE_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_EVENTS_STORAGE_SIZE_DATE.clone())) + .expect("metric can be registered"); registry .register(Box::new(STAGING_FILES.clone())) .expect("metric can be registered"); diff --git a/src/query/listing_table_builder.rs b/src/query/listing_table_builder.rs index 9ca484ead..a2087d2cc 100644 --- a/src/query/listing_table_builder.rs +++ b/src/query/listing_table_builder.rs @@ -16,7 +16,7 @@ * */ -use std::{collections::HashMap, ops::Bound, pin::Pin, sync::Arc}; +use std::{ops::Bound, sync::Arc}; use arrow_schema::Schema; 
use datafusion::{ @@ -27,9 +27,7 @@ use datafusion::{ error::DataFusionError, logical_expr::col, }; -use futures_util::{Future, TryStreamExt, stream::FuturesUnordered}; use itertools::Itertools; -use object_store::{ObjectMeta, ObjectStore, path::Path}; use crate::{ OBJECT_STORE_DATA_GRANULARITY, event::DEFAULT_TIMESTAMP_KEY, storage::ObjectStorage, @@ -56,7 +54,6 @@ impl ListingTableBuilder { pub async fn populate_via_listing( self, storage: Arc, - client: Arc, time_filters: &[PartialTimeFilter], ) -> Result { // Extract the minimum start time from the time filters. @@ -90,67 +87,28 @@ impl ListingTableBuilder { let prefixes = TimeRange::new(start_time.and_utc(), end_time.and_utc()) .generate_prefixes(OBJECT_STORE_DATA_GRANULARITY); - // Categorizes prefixes into "minute" and general resolve lists. - let mut minute_resolve = HashMap::>::new(); - let mut all_resolve = Vec::new(); + // Build all prefixes as relative paths + let prefixes: Vec<_> = prefixes + .into_iter() + .map(|prefix| { + relative_path::RelativePathBuf::from(format!("{}/{}", &self.stream, prefix)) + }) + .collect(); + + // Use storage.list_dirs_relative for all prefixes and flatten results + let mut listing = Vec::new(); for prefix in prefixes { - let path = relative_path::RelativePathBuf::from(format!("{}/{}", &self.stream, prefix)); - let prefix = storage.absolute_url(path.as_relative_path()).to_string(); - if let Some(pos) = prefix.rfind("minute") { - let hour_prefix = &prefix[..pos]; - minute_resolve - .entry(hour_prefix.to_owned()) - .or_default() - .push(prefix); - } else { - all_resolve.push(prefix); + match storage.list_dirs_relative(&prefix).await { + Ok(paths) => { + listing.extend(paths.into_iter().map(|p| p.to_string())); + } + Err(e) => { + return Err(DataFusionError::External(Box::new(e))); + } } } - /// Resolve all prefixes asynchronously and collect the object metadata. - type ResolveFuture = - Pin, object_store::Error>> + Send>>; - let tasks: FuturesUnordered = FuturesUnordered::new(); - for (listing_prefix, prefixes) in minute_resolve { - let client = Arc::clone(&client); - tasks.push(Box::pin(async move { - let path = Path::from(listing_prefix); - let mut objects = client.list(Some(&path)).try_collect::>().await?; - - objects.retain(|obj| { - prefixes.iter().any(|prefix| { - obj.location - .prefix_matches(&object_store::path::Path::from(prefix.as_ref())) - }) - }); - - Ok(objects) - })); - } - - for prefix in all_resolve { - let client = Arc::clone(&client); - tasks.push(Box::pin(async move { - client - .list(Some(&object_store::path::Path::from(prefix))) - .try_collect::>() - .await - })); - } - - let listing = tasks - .try_collect::>>() - .await - .map_err(|err| DataFusionError::External(Box::new(err)))? 
- .into_iter() - .flat_map(|res| { - res.into_iter() - .map(|obj| obj.location.to_string()) - .collect::>() - }) - .sorted() - .rev() - .collect_vec(); + let listing = listing.into_iter().sorted().rev().collect_vec(); Ok(Self { stream: self.stream, diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index aa25c9926..491e7c2fa 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -16,11 +16,10 @@ * */ -use std::{any::Any, collections::HashMap, ops::Bound, sync::Arc}; +use std::{any::Any, collections::HashMap, ops::Bound, path::PathBuf, sync::Arc}; use arrow_array::RecordBatch; use arrow_schema::{Schema, SchemaRef, SortOptions}; -use bytes::Bytes; use chrono::{DateTime, NaiveDateTime, TimeDelta, Timelike, Utc}; use datafusion::{ catalog::{SchemaProvider, Session}, @@ -45,10 +44,9 @@ use datafusion::{ prelude::Expr, scalar::ScalarValue, }; -use futures_util::{StreamExt, TryFutureExt, TryStreamExt, stream::FuturesOrdered}; +use futures_util::TryFutureExt; use itertools::Itertools; -use object_store::{ObjectStore, path::Path}; -use url::Url; +use relative_path::RelativePathBuf; use crate::{ catalog::{ @@ -59,10 +57,10 @@ use crate::{ }, event::DEFAULT_TIMESTAMP_KEY, hottier::HotTierManager, - metrics::QUERY_CACHE_HIT, + metrics::{QUERY_CACHE_HIT, storage::STORAGE_FILES_SCANNED}, option::Mode, parseable::{PARSEABLE, STREAM_EXISTS}, - storage::{ObjectStorage, ObjectStoreFormat}, + storage::{ObjectStorage, ObjectStorageError, ObjectStoreFormat}, }; use super::listing_table_builder::ListingTableBuilder; @@ -91,7 +89,6 @@ impl SchemaProvider for GlobalSchemaProvider { .expect(STREAM_EXISTS) .get_schema(), stream: name.to_owned(), - url: self.storage.store_url(), }))) } else { Ok(None) @@ -108,8 +105,6 @@ struct StandardTableProvider { schema: SchemaRef, // prefix under which to find snapshot stream: String, - // url to find right instance of object store - url: Url, } impl StandardTableProvider { @@ -276,7 +271,6 @@ impl StandardTableProvider { &self, execution_plans: &mut Vec>, glob_storage: Arc, - object_store: Arc, time_filters: &[PartialTimeFilter], state: &dyn Session, projection: Option<&Vec>, @@ -285,7 +279,7 @@ impl StandardTableProvider { time_partition: Option, ) -> Result<(), DataFusionError> { ListingTableBuilder::new(self.stream.to_owned()) - .populate_via_listing(glob_storage.clone(), object_store, time_filters) + .populate_via_listing(glob_storage.clone(), time_filters) .and_then(|builder| async { let table = builder.build( self.schema.clone(), @@ -327,7 +321,7 @@ impl StandardTableProvider { &self, manifest_files: Vec, ) -> (Vec>, datafusion::common::Statistics) { - let target_partition = num_cpus::get(); + let target_partition: usize = num_cpus::get(); let mut partitioned_files = Vec::from_iter((0..target_partition).map(|_| Vec::new())); let mut column_statistics = HashMap::>::new(); let mut count = 0; @@ -487,11 +481,6 @@ impl TableProvider for StandardTableProvider { limit: Option, ) -> Result, DataFusionError> { let mut execution_plans = vec![]; - let object_store = state - .runtime_env() - .object_store_registry - .get_store(&self.url) - .unwrap(); let glob_storage = PARSEABLE.storage.get_object_store(); let object_store_format: ObjectStoreFormat = serde_json::from_slice( @@ -548,7 +537,6 @@ impl TableProvider for StandardTableProvider { self.legacy_listing_table( &mut execution_plans, glob_storage.clone(), - object_store.clone(), &listing_time_filter, state, projection, @@ -594,6 +582,11 @@ impl 
TableProvider for StandardTableProvider { return self.final_plan(execution_plans, projection); } + let parquet_files_to_scan = manifest_files.len(); + STORAGE_FILES_SCANNED + .with_label_values(&[PARSEABLE.storage().name(), "GET"]) + .inc_by(parquet_files_to_scan as f64); + let (partitioned_files, statistics) = self.partitioned_files(manifest_files); self.create_parquet_physical_plan( &mut execution_plans, @@ -864,24 +857,27 @@ fn extract_timestamp_bound( } pub async fn collect_manifest_files( - storage: Arc, + storage: Arc, manifest_urls: Vec, -) -> Result, object_store::Error> { - let tasks = manifest_urls.into_iter().map(|path| { - let path = Path::parse(path).unwrap(); +) -> Result, ObjectStorageError> { + let mut tasks = Vec::new(); + manifest_urls.into_iter().for_each(|path| { + let path = RelativePathBuf::from_path(PathBuf::from(path)).expect("Invalid path"); let storage = Arc::clone(&storage); - async move { storage.get(&path).await } + tasks.push(tokio::task::spawn(async move { + storage.get_object(&path).await + })); }); - let resp = FuturesOrdered::from_iter(tasks) - .and_then(|res| res.bytes()) - .collect::>>() - .await; + let mut op = Vec::new(); + for task in tasks { + let file = task.await??; + op.push(file); + } - Ok(resp + Ok(op .into_iter() - .flat_map(|res| res.ok()) - .map(|bytes| serde_json::from_slice(&bytes).unwrap()) + .map(|res| serde_json::from_slice(&res).expect("Data is invalid for Manifest")) .collect()) } diff --git a/src/stats.rs b/src/stats.rs index 5a167cc39..0c2214043 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -20,16 +20,17 @@ use std::collections::HashMap; use std::sync::Arc; use once_cell::sync::Lazy; -use prometheus::IntGaugeVec; use prometheus::core::Collector; use prometheus::proto::MetricFamily; +use prometheus::{IntCounterVec, IntGaugeVec}; use tracing::warn; use crate::metrics::{ DELETED_EVENTS_STORAGE_SIZE, EVENTS_DELETED, EVENTS_DELETED_SIZE, EVENTS_INGESTED, EVENTS_INGESTED_DATE, EVENTS_INGESTED_SIZE, EVENTS_INGESTED_SIZE_DATE, EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_INGESTED, LIFETIME_EVENTS_INGESTED_SIZE, - LIFETIME_EVENTS_STORAGE_SIZE, STORAGE_SIZE, + LIFETIME_EVENTS_STORAGE_SIZE, STORAGE_SIZE, TOTAL_EVENTS_INGESTED_DATE, + TOTAL_EVENTS_INGESTED_SIZE_DATE, TOTAL_EVENTS_STORAGE_SIZE_DATE, }; use crate::storage::{ObjectStorage, ObjectStorageError, ObjectStoreFormat}; @@ -136,6 +137,10 @@ pub async fn update_deleted_stats( "parquet", &manifest_date, ]); + let _ = TOTAL_EVENTS_INGESTED_DATE.remove_label_values(&["json", &manifest_date]); + let _ = TOTAL_EVENTS_INGESTED_SIZE_DATE.remove_label_values(&["json", &manifest_date]); + let _ = + TOTAL_EVENTS_STORAGE_SIZE_DATE.remove_label_values(&["parquet", &manifest_date]); num_row += manifest.events_ingested as i64; ingestion_size += manifest.ingestion_size as i64; storage_size += manifest.storage_size as i64; @@ -197,7 +202,7 @@ fn remove_label_values(lazy_static: &Lazy, event_labels: &[&str]) { } } -fn delete_with_label_prefix(metrics: &IntGaugeVec, prefix: &[&str]) { +fn delete_with_label_prefix(metrics: &IntCounterVec, prefix: &[&str]) { let families: Vec = metrics.collect().into_iter().collect(); for metric in families.iter().flat_map(|m| m.get_metric()) { let label_map: HashMap<&str, &str> = metric diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 1b2f7b783..1cea29865 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -19,7 +19,10 @@ use std::{ collections::HashSet, path::Path, - sync::Arc, + sync::{ + Arc, + atomic::{AtomicU64, Ordering}, + 
}, time::{Duration, Instant}, }; @@ -209,13 +212,15 @@ pub struct BlobStore { impl BlobStore { async fn _get_object(&self, path: &RelativePath) -> Result { - let instant = Instant::now(); + let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; - let elapsed = instant.elapsed().as_secs_f64(); - + let elapsed = time.elapsed().as_secs_f64(); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "GET"]) + .inc(); match resp { Ok(resp) => { - let body = resp.bytes().await.unwrap(); + let body: Bytes = resp.bytes().await.unwrap(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "GET", "200"]) .observe(elapsed); @@ -236,9 +241,12 @@ impl BlobStore { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let instant = Instant::now(); + let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let elapsed = instant.elapsed().as_secs_f64(); + let elapsed = time.elapsed().as_secs_f64(); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "PUT"]) + .inc(); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -251,18 +259,14 @@ impl BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "PUT", status_code]) .observe(elapsed); - - if let object_store::Error::NotFound { source, .. } = &err { - return Err(ObjectStorageError::Custom( - format!("Failed to upload, error: {source:?}").to_string(), - )); - } Err(err.into()) } } } async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { + let files_scanned = Arc::new(AtomicU64::new(0)); + let files_deleted = Arc::new(AtomicU64::new(0)); // Track LIST operation let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); @@ -273,19 +277,20 @@ impl BlobStore { object_stream .for_each_concurrent(None, |x| async { + files_scanned.fetch_add(1, Ordering::Relaxed); match x { Ok(obj) => { - // Track individual DELETE operation + files_deleted.fetch_add(1, Ordering::Relaxed); let delete_start = Instant::now(); - match self.client.delete(&obj.location).await { + let delete_resp = self.client.delete(&obj.location).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + match delete_resp { Ok(_) => { - let delete_elapsed = delete_start.elapsed().as_secs_f64(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "DELETE", "200"]) .observe(delete_elapsed); } Err(err) => { - let delete_elapsed = delete_start.elapsed().as_secs_f64(); let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "DELETE", status_code]) @@ -301,13 +306,18 @@ impl BlobStore { }) .await; + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(files_scanned.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "DELETE"]) + .inc_by(files_deleted.load(Ordering::Relaxed) as f64); Ok(()) } async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { - // Track LIST operation let list_start = Instant::now(); - let resp = self + let resp: Result = self .client .list_with_delimiter(Some(&(stream.into()))) .await; @@ -331,6 +341,10 @@ impl BlobStore { let common_prefixes = resp.common_prefixes; + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(common_prefixes.len() as f64); + // return prefixes at the root level let dates: Vec<_> = common_prefixes .iter() @@ -342,37 +356,28 @@ impl 
BlobStore { } async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - // // TODO: Uncomment this when multipart is fixed - // let should_multipart = std::fs::metadata(path)?.len() > MULTIPART_UPLOAD_SIZE as u64; - - let should_multipart = false; + let bytes = tokio::fs::read(path).await?; - if should_multipart { - // self._upload_multipart(key, path).await - // this branch will never get executed - Ok(()) - } else { - let bytes = tokio::fs::read(path).await?; - - let put_start = Instant::now(); - let result = self.client.put(&key.into(), bytes.into()).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); - - match result { - Ok(result) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT", "200"]) - .observe(put_elapsed); - info!("Uploaded file to Azure Blob Storage: {:?}", result); - Ok(()) - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT", status_code]) - .observe(put_elapsed); - Err(err.into()) - } + let put_start = Instant::now(); + let result = self.client.put(&key.into(), bytes.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "PUT"]) + .inc(); + match result { + Ok(result) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", "200"]) + .observe(put_elapsed); + info!("Uploaded file to Azure Blob Storage: {:?}", result); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", status_code]) + .observe(put_elapsed); + Err(err.into()) } } } @@ -549,13 +554,6 @@ impl BlobStore { #[async_trait] impl ObjectStorage for BlobStore { - async fn upload_multipart( - &self, - key: &RelativePath, - path: &Path, - ) -> Result<(), ObjectStorageError> { - self._upload_multipart(key, path).await - } async fn get_buffered_reader( &self, _path: &RelativePath, @@ -567,29 +565,42 @@ impl ObjectStorage for BlobStore { ), ))) } - async fn head(&self, _path: &RelativePath) -> Result { - // Record attempt to access file (even though operation not implemented) - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "HEAD"]) - .inc(); - Err(ObjectStorageError::UnhandledError(Box::new( - std::io::Error::new( - std::io::ErrorKind::Unsupported, - "Head operation not implemented for Blob Storage yet", - ), - ))) + async fn upload_multipart( + &self, + key: &RelativePath, + path: &Path, + ) -> Result<(), ObjectStorageError> { + self._upload_multipart(key, path).await } - async fn get_object(&self, path: &RelativePath) -> Result { - let result = self._get_object(path).await?; + async fn head(&self, path: &RelativePath) -> Result { + let head_start = Instant::now(); + let result = self.client.head(&to_object_store_path(path)).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "HEAD", "200"]) + .observe(head_elapsed); + // Record single file accessed + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "HEAD"]) + .inc(); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "HEAD", status_code]) + .observe(head_elapsed); + } + } - // Record single file accessed - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "GET"]) - .inc(); + Ok(result?) 
+ } - Ok(result) + async fn get_object(&self, path: &RelativePath) -> Result { + Ok(self._get_object(path).await?) } async fn get_objects( @@ -610,21 +621,11 @@ impl ObjectStorage for BlobStore { let mut res = vec![]; let mut files_scanned = 0; + // Note: We track each streaming list item retrieval while let Some(meta_result) = list_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -642,13 +643,23 @@ impl ObjectStorage for BlobStore { .map_err(ObjectStorageError::PathError)?, ) .await?; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "GET", "200"]) + .observe(list_start.elapsed().as_secs_f64()); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "GET"]) + .inc(); res.push(byts); } + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); // Record total files scanned STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "GET"]) + .with_label_values(&["azure_blob", "LIST"]) .inc_by(files_scanned as f64); Ok(res) @@ -665,20 +676,9 @@ impl ObjectStorage for BlobStore { let mut object_stream = self.client.list(Some(&self.root)); while let Some(meta_result) = object_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -690,7 +690,10 @@ impl ObjectStorage for BlobStore { path_arr.push(RelativePathBuf::from(meta.location.as_ref())); } } - + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); // Record total files scanned STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) @@ -708,11 +711,6 @@ impl ObjectStorage for BlobStore { .await .map_err(|err| ObjectStorageError::ConnectionError(Box::new(err)))?; - // Record single file written - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "PUT"]) - .inc(); - Ok(()) } @@ -769,6 +767,9 @@ impl ObjectStorage for BlobStore { .observe(head_elapsed); } } + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "HEAD"]) + .inc(); Ok(result.map(|_| ())?) 
} @@ -785,7 +786,9 @@ impl ObjectStorage for BlobStore { let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(&file)).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "DELETE"]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -799,15 +802,7 @@ impl ObjectStorage for BlobStore { .with_label_values(&["azure_blob", "DELETE", status_code]) .observe(delete_elapsed); - // if the object is not found, it is not an error - // the given url path was incorrect - if matches!(err, object_store::Error::NotFound { .. }) { - error!("Node does not exist"); - Err(err.into()) - } else { - error!("Error deleting node meta file: {:?}", err); - Err(err.into()) - } + Err(err.into()) } } } @@ -868,7 +863,7 @@ impl ObjectStorage for BlobStore { stream_json_check.try_collect::<()>().await?; - Ok(dirs.into_iter().collect()) + Ok(dirs) } async fn list_dates(&self, stream_name: &str) -> Result, ObjectStorageError> { @@ -883,9 +878,14 @@ impl ObjectStorage for BlobStore { date: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/", stream_name, date)); + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); - let hours = resp + let hours: Vec = resp .common_prefixes .iter() .filter_map(|path| { @@ -912,9 +912,14 @@ impl ObjectStorage for BlobStore { hour: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/{}/", stream_name, date, hour)); + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); - let minutes = resp + let minutes: Vec = resp .common_prefixes .iter() .filter_map(|path| { @@ -945,9 +950,7 @@ impl ObjectStorage for BlobStore { // } async fn upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - self._upload_file(key, path).await?; - - Ok(()) + Ok(self._upload_file(key, path).await?) 
} fn absolute_url(&self, prefix: &RelativePath) -> object_store::path::Path { diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index b98c1d873..3392bfbdb 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -19,7 +19,10 @@ use std::{ collections::HashSet, path::Path, - sync::Arc, + sync::{ + Arc, + atomic::{AtomicU64, Ordering}, + }, time::{Duration, Instant}, }; @@ -174,23 +177,25 @@ pub struct Gcs { impl Gcs { async fn _get_object(&self, path: &RelativePath) -> Result { - let get_start = Instant::now(); + let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; - let get_elapsed = get_start.elapsed().as_secs_f64(); - + let elapsed = time.elapsed().as_secs_f64(); + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "GET"]) + .inc(); match resp { Ok(resp) => { - let body = resp.bytes().await.unwrap(); + let body: Bytes = resp.bytes().await.unwrap(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "GET", "200"]) - .observe(get_elapsed); + .observe(elapsed); Ok(body) } Err(err) => { let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "GET", status_code]) - .observe(get_elapsed); + .observe(elapsed); Err(err.into()) } } @@ -201,40 +206,32 @@ impl Gcs { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let put_start = Instant::now(); + let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); - - if let Err(object_store::Error::NotFound { source, .. }) = &resp { - let source_str = source.to_string(); - if source_str.contains("NoSuchBucket") { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT", "404"]) - .observe(put_elapsed); - return Err(ObjectStorageError::Custom( - format!("Bucket '{}' does not exist in GCS.", self.bucket).to_string(), - )); - } - } - + let elapsed = time.elapsed().as_secs_f64(); + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "PUT"]) + .inc(); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "PUT", "200"]) - .observe(put_elapsed); + .observe(elapsed); Ok(()) } Err(err) => { let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "PUT", status_code]) - .observe(put_elapsed); + .observe(elapsed); Err(err.into()) } } } async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { + let files_scanned = Arc::new(AtomicU64::new(0)); + let files_deleted = Arc::new(AtomicU64::new(0)); // Track LIST operation let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); @@ -245,19 +242,20 @@ impl Gcs { object_stream .for_each_concurrent(None, |x| async { + files_scanned.fetch_add(1, Ordering::Relaxed); match x { Ok(obj) => { - // Track individual DELETE operation + files_deleted.fetch_add(1, Ordering::Relaxed); let delete_start = Instant::now(); - match self.client.delete(&obj.location).await { + let delete_resp = self.client.delete(&obj.location).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + match delete_resp { Ok(_) => { - let delete_elapsed = delete_start.elapsed().as_secs_f64(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "DELETE", "200"]) .observe(delete_elapsed); } Err(err) => { - let delete_elapsed = delete_start.elapsed().as_secs_f64(); let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME 
.with_label_values(&["gcs", "DELETE", status_code]) @@ -273,13 +271,18 @@ impl Gcs { }) .await; + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(files_scanned.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "DELETE"]) + .inc_by(files_deleted.load(Ordering::Relaxed) as f64); Ok(()) } async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { - // Track LIST operation let list_start = Instant::now(); - let resp = self + let resp: Result = self .client .list_with_delimiter(Some(&(stream.into()))) .await; @@ -303,6 +306,10 @@ impl Gcs { let common_prefixes = resp.common_prefixes; + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(common_prefixes.len() as f64); + // return prefixes at the root level let dates: Vec<_> = common_prefixes .iter() @@ -319,7 +326,9 @@ impl Gcs { let put_start = Instant::now(); let result = self.client.put(&key.into(), bytes.into()).await; let put_elapsed = put_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "PUT"]) + .inc(); match result { Ok(result) => { STORAGE_REQUEST_RESPONSE_TIME @@ -474,7 +483,9 @@ impl ObjectStorage for Gcs { let head_start = Instant::now(); let meta = self.client.head(path).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "HEAD"]) + .inc(); let meta = match meta { Ok(meta) => { STORAGE_REQUEST_RESPONSE_TIME @@ -495,6 +506,7 @@ impl ObjectStorage for Gcs { let buf = object_store::buffered::BufReader::new(store, &meta); Ok(buf) } + async fn upload_multipart( &self, key: &RelativePath, @@ -502,11 +514,14 @@ impl ObjectStorage for Gcs { ) -> Result<(), ObjectStorageError> { self._upload_multipart(key, path).await } + async fn head(&self, path: &RelativePath) -> Result { let head_start = Instant::now(); let result = self.client.head(&to_object_store_path(path)).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "HEAD"]) + .inc(); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -529,14 +544,7 @@ impl ObjectStorage for Gcs { } async fn get_object(&self, path: &RelativePath) -> Result { - let result = self._get_object(path).await?; - - // Record single file accessed - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "GET"]) - .inc(); - - Ok(result) + Ok(self._get_object(path).await?) 
} async fn get_objects( @@ -559,20 +567,9 @@ impl ObjectStorage for Gcs { // Note: We track each streaming list item retrieval while let Some(meta_result) = list_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -590,13 +587,23 @@ impl ObjectStorage for Gcs { .map_err(ObjectStorageError::PathError)?, ) .await?; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "GET", "200"]) + .observe(list_start.elapsed().as_secs_f64()); + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "GET"]) + .inc(); res.push(byts); } + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); // Record total files scanned STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "GET"]) + .with_label_values(&["gcs", "LIST"]) .inc_by(files_scanned as f64); Ok(res) @@ -613,20 +620,9 @@ impl ObjectStorage for Gcs { let mut object_stream = self.client.list(Some(&self.root)); while let Some(meta_result) = object_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -638,7 +634,10 @@ impl ObjectStorage for Gcs { path_arr.push(RelativePathBuf::from(meta.location.as_ref())); } } - + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); // Record total files scanned STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) @@ -656,11 +655,6 @@ impl ObjectStorage for Gcs { .await .map_err(|err| ObjectStorageError::ConnectionError(Box::new(err)))?; - // Record single file written - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "PUT"]) - .inc(); - Ok(()) } @@ -671,22 +665,57 @@ impl ObjectStorage for Gcs { } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { - let result = self.client.delete(&to_object_store_path(path)).await?; + let delete_start = Instant::now(); + let result = self.client.delete(&to_object_store_path(path)).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); - // Record single file deleted - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "DELETE"]) - .inc(); + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "DELETE", "200"]) + .observe(delete_elapsed); + // Record single file deleted + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "DELETE"]) + .inc(); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "DELETE", status_code]) + .observe(delete_elapsed); + } + } - Ok(result) + Ok(result?) 
} async fn check(&self) -> Result<(), ObjectStorageError> { - Ok(self + let head_start = Instant::now(); + let result = self .client .head(&to_object_store_path(&parseable_json_path())) - .await - .map(|_| ())?) + .await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", "200"]) + .observe(head_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", status_code]) + .observe(head_elapsed); + } + } + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "HEAD"]) + .inc(); + + Ok(result.map(|_| ())?) } async fn delete_stream(&self, stream_name: &str) -> Result<(), ObjectStorageError> { @@ -701,7 +730,9 @@ impl ObjectStorage for Gcs { let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(&file)).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "DELETE"]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -715,15 +746,7 @@ impl ObjectStorage for Gcs { .with_label_values(&["gcs", "DELETE", status_code]) .observe(delete_elapsed); - // if the object is not found, it is not an error - // the given url path was incorrect - if matches!(err, object_store::Error::NotFound { .. }) { - error!("Node does not exist"); - Err(err.into()) - } else { - error!("Error deleting node meta file: {:?}", err); - Err(err.into()) - } + Err(err.into()) } } } @@ -799,9 +822,14 @@ impl ObjectStorage for Gcs { date: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/", stream_name, date)); + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); - let hours = resp + let hours: Vec = resp .common_prefixes .iter() .filter_map(|path| { @@ -828,9 +856,14 @@ impl ObjectStorage for Gcs { hour: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/{}/", stream_name, date, hour)); + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); - let minutes = resp + let minutes: Vec = resp .common_prefixes .iter() .filter_map(|path| { @@ -852,9 +885,7 @@ impl ObjectStorage for Gcs { } async fn upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - self._upload_file(key, path).await?; - - Ok(()) + Ok(self._upload_file(key, path).await?) 
} fn absolute_url(&self, prefix: &RelativePath) -> object_store::path::Path { diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index 41639d025..ea2c9d50c 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -50,6 +50,7 @@ use crate::handlers::http::fetch_schema; use crate::handlers::http::modal::ingest_server::INGESTOR_EXPECT; use crate::handlers::http::modal::ingest_server::INGESTOR_META; use crate::handlers::http::users::{FILTER_DIR, USERS_ROOT_DIR}; +use crate::metrics::TOTAL_EVENTS_STORAGE_SIZE_DATE; use crate::metrics::storage::StorageMetrics; use crate::metrics::{EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_STORAGE_SIZE, STORAGE_SIZE}; use crate::option::Mode; @@ -169,17 +170,22 @@ fn update_storage_metrics( ) -> Result<(), ObjectStorageError> { let mut file_date_part = filename.split('.').collect::>()[0]; file_date_part = file_date_part.split('=').collect::>()[1]; - let compressed_size = path.metadata().map_or(0, |meta| meta.len()); - + let compressed_size = path + .metadata() + .map(|m| m.len()) + .map_err(|e| ObjectStorageError::Custom(format!("metadata failed for {filename}: {e}")))?; STORAGE_SIZE .with_label_values(&["data", stream_name, "parquet"]) .add(compressed_size as i64); EVENTS_STORAGE_SIZE_DATE .with_label_values(&["data", stream_name, "parquet", file_date_part]) - .add(compressed_size as i64); + .inc_by(compressed_size); LIFETIME_EVENTS_STORAGE_SIZE .with_label_values(&["data", stream_name, "parquet"]) .add(compressed_size as i64); + TOTAL_EVENTS_STORAGE_SIZE_DATE + .with_label_values(&["parquet", file_date_part]) + .inc_by(compressed_size); Ok(()) } diff --git a/src/storage/s3.rs b/src/storage/s3.rs index 3aeb076f5..a98e63124 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -341,9 +341,10 @@ impl S3 { async fn _get_object(&self, path: &RelativePath) -> Result { let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; - let elapsed = time.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "GET"]) + .inc(); match resp { Ok(resp) => { let body = resp.bytes().await?; @@ -369,9 +370,10 @@ impl S3 { ) -> Result<(), ObjectStorageError> { let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let elapsed = time.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "PUT"]) + .inc(); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -380,23 +382,7 @@ impl S3 { Ok(()) } Err(err) => { - let status_code = match &err { - object_store::Error::NotFound { .. 
} => { - // Check for specific S3 bucket not found error - let source_str = err.to_string(); - if source_str.contains("NoSuchBucket") { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT", "404"]) - .observe(elapsed); - return Err(ObjectStorageError::Custom( - format!("Bucket '{}' does not exist in S3.", self.bucket) - .to_string(), - )); - } - "404" - } - _ => error_to_status_code(&err), - }; + let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "PUT", status_code]) .observe(elapsed); @@ -407,7 +393,7 @@ impl S3 { async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { let files_scanned = Arc::new(AtomicU64::new(0)); - + let files_deleted = Arc::new(AtomicU64::new(0)); // Track LIST operation let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); @@ -416,23 +402,22 @@ impl S3 { .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); - let files_scanned_clone = files_scanned.clone(); object_stream .for_each_concurrent(None, |x| async { + files_scanned.fetch_add(1, Ordering::Relaxed); match x { Ok(obj) => { - files_scanned_clone.fetch_add(1, Ordering::Relaxed); - // Track individual DELETE operation + files_deleted.fetch_add(1, Ordering::Relaxed); let delete_start = Instant::now(); - match self.client.delete(&obj.location).await { + let delete_resp = self.client.delete(&obj.location).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + match delete_resp { Ok(_) => { - let delete_elapsed = delete_start.elapsed().as_secs_f64(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "DELETE", "200"]) .observe(delete_elapsed); } Err(err) => { - let delete_elapsed = delete_start.elapsed().as_secs_f64(); let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "DELETE", status_code]) @@ -448,18 +433,18 @@ impl S3 { }) .await; - // Record total files scanned STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(files_scanned.load(Ordering::Relaxed) as f64); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "DELETE"]) + .inc_by(files_deleted.load(Ordering::Relaxed) as f64); Ok(()) } async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { - // Track LIST operation let list_start = Instant::now(); - let resp = self + let resp: Result = self .client .list_with_delimiter(Some(&(stream.into()))) .await; @@ -483,7 +468,6 @@ impl S3 { let common_prefixes = resp.common_prefixes; - // Record files scanned (prefixes/directories count as files scanned) STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(common_prefixes.len() as f64); @@ -499,37 +483,28 @@ impl S3 { } async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - // // TODO: Uncomment this when multipart is fixed - // let should_multipart = std::fs::metadata(path)?.len() > MULTIPART_UPLOAD_SIZE as u64; - - let should_multipart = false; + let bytes = tokio::fs::read(path).await?; - if should_multipart { - // self._upload_multipart(key, path).await - // this branch will never get executed - Ok(()) - } else { - let bytes = tokio::fs::read(path).await?; - - let put_start = Instant::now(); - let result = self.client.put(&key.into(), bytes.into()).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); - - match result { - Ok(result) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT", "200"]) - .observe(put_elapsed); - info!("Uploaded file to S3: 
{:?}", result); - Ok(()) - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT", status_code]) - .observe(put_elapsed); - Err(err.into()) - } + let put_start = Instant::now(); + let result = self.client.put(&key.into(), bytes.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "PUT"]) + .inc(); + match result { + Ok(result) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", "200"]) + .observe(put_elapsed); + info!("Uploaded file to S3: {:?}", result); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", status_code]) + .observe(put_elapsed); + Err(err.into()) } } } @@ -550,14 +525,14 @@ impl S3 { let mut async_writer = match async_writer { Ok(writer) => { STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT_MULTIPART_INIT", "200"]) + .with_label_values(&["s3", "PUT_MULTIPART", "200"]) .observe(multipart_elapsed); writer } Err(err) => { let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT_MULTIPART_INIT", status_code]) + .with_label_values(&["s3", "PUT_MULTIPART", status_code]) .observe(multipart_elapsed); return Err(err.into()); } @@ -573,7 +548,9 @@ impl S3 { let put_start = Instant::now(); let result = self.client.put(location, data.into()).await; let put_elapsed = put_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "PUT"]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -673,7 +650,9 @@ impl ObjectStorage for S3 { let head_start = Instant::now(); let meta = self.client.head(path).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "HEAD"]) + .inc(); let meta = match meta { Ok(meta) => { STORAGE_REQUEST_RESPONSE_TIME @@ -694,6 +673,7 @@ impl ObjectStorage for S3 { let buf = object_store::buffered::BufReader::new(store, &meta); Ok(buf) } + async fn upload_multipart( &self, key: &RelativePath, @@ -701,11 +681,14 @@ impl ObjectStorage for S3 { ) -> Result<(), ObjectStorageError> { self._upload_multipart(key, path).await } + async fn head(&self, path: &RelativePath) -> Result { let head_start = Instant::now(); let result = self.client.head(&to_object_store_path(path)).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "HEAD"]) + .inc(); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -728,14 +711,7 @@ impl ObjectStorage for S3 { } async fn get_object(&self, path: &RelativePath) -> Result { - let result = self._get_object(path).await?; - - // Record single file accessed - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "GET"]) - .inc(); - - Ok(result) + Ok(self._get_object(path).await?) 
} async fn get_objects( @@ -758,20 +734,9 @@ impl ObjectStorage for S3 { // Note: We track each streaming list item retrieval while let Some(meta_result) = list_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -789,13 +754,23 @@ impl ObjectStorage for S3 { .map_err(ObjectStorageError::PathError)?, ) .await?; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "GET", "200"]) + .observe(list_start.elapsed().as_secs_f64()); + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "GET"]) + .inc(); res.push(byts); } + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); // Record total files scanned STORAGE_FILES_SCANNED - .with_label_values(&["s3", "GET"]) + .with_label_values(&["s3", "LIST"]) .inc_by(files_scanned as f64); Ok(res) @@ -812,20 +787,9 @@ impl ObjectStorage for S3 { let mut object_stream = self.client.list(Some(&self.root)); while let Some(meta_result) = object_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -837,7 +801,10 @@ impl ObjectStorage for S3 { path_arr.push(RelativePathBuf::from(meta.location.as_ref())); } } - + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); // Record total files scanned STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) @@ -855,11 +822,6 @@ impl ObjectStorage for S3 { .await .map_err(|err| ObjectStorageError::ConnectionError(Box::new(err)))?; - // Record single file written - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "PUT"]) - .inc(); - Ok(()) } @@ -916,6 +878,9 @@ impl ObjectStorage for S3 { .observe(head_elapsed); } } + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "HEAD"]) + .inc(); Ok(result.map(|_| ())?) } @@ -932,7 +897,9 @@ impl ObjectStorage for S3 { let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(&file)).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "DELETE"]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -946,15 +913,7 @@ impl ObjectStorage for S3 { .with_label_values(&["s3", "DELETE", status_code]) .observe(delete_elapsed); - // if the object is not found, it is not an error - // the given url path was incorrect - if matches!(err, object_store::Error::NotFound { .. 
}) { - error!("Node does not exist"); - Err(err.into()) - } else { - error!("Error deleting node meta file: {:?}", err); - Err(err.into()) - } + Err(err.into()) } } } @@ -1030,7 +989,12 @@ impl ObjectStorage for S3 { date: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/", stream_name, date)); + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); let hours: Vec = resp .common_prefixes @@ -1059,7 +1023,12 @@ impl ObjectStorage for S3 { hour: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/{}/", stream_name, date, hour)); + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); let minutes: Vec = resp .common_prefixes @@ -1092,9 +1061,7 @@ impl ObjectStorage for S3 { // } async fn upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - self._upload_file(key, path).await?; - - Ok(()) + Ok(self._upload_file(key, path).await?) } fn absolute_url(&self, prefix: &RelativePath) -> object_store::path::Path { From 8588596e7b65e0965c67e4dceb9ec289fbeb3784 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Mon, 25 Aug 2025 08:10:53 -0700 Subject: [PATCH 03/14] add scanned metrics by date --- src/metadata.rs | 4 +- src/metrics/mod.rs | 12 +-- src/metrics/storage.rs | 36 ++++++- src/query/stream_schema_provider.rs | 12 ++- src/stats.rs | 14 ++- src/storage/azure_blob.rs | 141 +++++++++++++++++++--------- src/storage/gcs.rs | 92 ++++++++++++++++-- src/storage/localfs.rs | 35 +++++-- src/storage/object_storage.rs | 2 +- src/storage/s3.rs | 92 ++++++++++++++++-- 10 files changed, 354 insertions(+), 86 deletions(-) diff --git a/src/metadata.rs b/src/metadata.rs index 34b5880b4..2d3bcae22 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -62,10 +62,10 @@ pub fn update_stats( .add(size as i64); TOTAL_EVENTS_INGESTED_DATE .with_label_values(&[origin, &parsed_date]) - .inc_by(num_rows as u64); + .add(num_rows as i64); TOTAL_EVENTS_INGESTED_SIZE_DATE .with_label_values(&[origin, &parsed_date]) - .inc_by(size); + .add(size as i64); } /// In order to support backward compatability with streams created before v1.6.4, diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index 81809eefd..27970c22d 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -160,8 +160,8 @@ pub static EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { .expect("metric can be created") }); -pub static TOTAL_EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| { - IntCounterVec::new( +pub static TOTAL_EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( Opts::new( "total_events_ingested_date", "total events ingested on a particular date", @@ -172,8 +172,8 @@ pub static TOTAL_EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| { .expect("metric can be created") }); -pub static TOTAL_EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| { - IntCounterVec::new( +pub static TOTAL_EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( Opts::new( "total_events_ingested_size_date", "Total events ingested size in bytes on a particular date", @@ -184,8 +184,8 @@ pub static TOTAL_EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| { 
.expect("metric can be created") }); -pub static TOTAL_EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { - IntCounterVec::new( +pub static TOTAL_EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( Opts::new( "total_events_storage_size_date", "Total events storage size in bytes on a particular date", diff --git a/src/metrics/storage.rs b/src/metrics/storage.rs index d483d0ee8..3386c451a 100644 --- a/src/metrics/storage.rs +++ b/src/metrics/storage.rs @@ -44,12 +44,24 @@ pub static STORAGE_FILES_SCANNED: Lazy = Lazy::new(|| { .expect("metric can be created") }); +pub static STORAGE_FILES_SCANNED_DATE: Lazy = Lazy::new(|| { + CounterVec::new( + Opts::new( + "storage_files_scanned_date_total", + "Total number of files scanned in storage operations by date", + ) + .namespace(METRICS_NAMESPACE), + &["provider", "operation", "date"], + ) + .expect("metric can be created") +}); + pub trait StorageMetrics { fn register_metrics(&self, handler: &PrometheusMetrics); } pub mod localfs { - use crate::storage::FSConfig; + use crate::{metrics::storage::STORAGE_FILES_SCANNED_DATE, storage::FSConfig}; use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}; @@ -63,12 +75,16 @@ pub mod localfs { .registry .register(Box::new(STORAGE_FILES_SCANNED.clone())) .expect("metric can be registered"); + handler + .registry + .register(Box::new(STORAGE_FILES_SCANNED_DATE.clone())) + .expect("metric can be registered"); } } } pub mod s3 { - use crate::storage::S3Config; + use crate::{metrics::storage::STORAGE_FILES_SCANNED_DATE, storage::S3Config}; use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}; @@ -82,12 +98,16 @@ pub mod s3 { .registry .register(Box::new(STORAGE_FILES_SCANNED.clone())) .expect("metric can be registered"); + handler + .registry + .register(Box::new(STORAGE_FILES_SCANNED_DATE.clone())) + .expect("metric can be registered"); } } } pub mod azureblob { - use crate::storage::AzureBlobConfig; + use crate::{metrics::storage::STORAGE_FILES_SCANNED_DATE, storage::AzureBlobConfig}; use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}; @@ -101,12 +121,16 @@ pub mod azureblob { .registry .register(Box::new(STORAGE_FILES_SCANNED.clone())) .expect("metric can be registered"); + handler + .registry + .register(Box::new(STORAGE_FILES_SCANNED_DATE.clone())) + .expect("metric can be registered"); } } } pub mod gcs { - use crate::storage::GcsConfig; + use crate::{metrics::storage::STORAGE_FILES_SCANNED_DATE, storage::GcsConfig}; use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}; @@ -120,6 +144,10 @@ pub mod gcs { .registry .register(Box::new(STORAGE_FILES_SCANNED.clone())) .expect("metric can be registered"); + handler + .registry + .register(Box::new(STORAGE_FILES_SCANNED_DATE.clone())) + .expect("metric can be registered"); } } } diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index 491e7c2fa..60a23e9aa 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -57,7 +57,10 @@ use crate::{ }, event::DEFAULT_TIMESTAMP_KEY, hottier::HotTierManager, - metrics::{QUERY_CACHE_HIT, storage::STORAGE_FILES_SCANNED}, + metrics::{ + QUERY_CACHE_HIT, + storage::{STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE}, + }, option::Mode, parseable::{PARSEABLE, STREAM_EXISTS}, storage::{ObjectStorage, ObjectStorageError, ObjectStoreFormat}, @@ -586,6 +589,13 @@ impl TableProvider for StandardTableProvider { STORAGE_FILES_SCANNED 
.with_label_values(&[PARSEABLE.storage().name(), "GET"]) .inc_by(parquet_files_to_scan as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&[ + PARSEABLE.storage().name(), + "GET", + &Utc::now().date_naive().to_string(), + ]) + .inc_by(parquet_files_to_scan as f64); let (partitioned_files, statistics) = self.partitioned_files(manifest_files); self.create_parquet_physical_plan( diff --git a/src/stats.rs b/src/stats.rs index 0c2214043..464a22b0a 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -137,13 +137,19 @@ pub async fn update_deleted_stats( "parquet", &manifest_date, ]); - let _ = TOTAL_EVENTS_INGESTED_DATE.remove_label_values(&["json", &manifest_date]); - let _ = TOTAL_EVENTS_INGESTED_SIZE_DATE.remove_label_values(&["json", &manifest_date]); - let _ = - TOTAL_EVENTS_STORAGE_SIZE_DATE.remove_label_values(&["parquet", &manifest_date]); + num_row += manifest.events_ingested as i64; ingestion_size += manifest.ingestion_size as i64; storage_size += manifest.storage_size as i64; + TOTAL_EVENTS_INGESTED_DATE + .with_label_values(&["json", &manifest_date]) + .sub(manifest.events_ingested as i64); + TOTAL_EVENTS_INGESTED_SIZE_DATE + .with_label_values(&["json", &manifest_date]) + .sub(manifest.ingestion_size as i64); + TOTAL_EVENTS_STORAGE_SIZE_DATE + .with_label_values(&["parquet", &manifest_date]) + .sub(manifest.storage_size as i64); } } EVENTS_DELETED diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 1cea29865..e8a73440a 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -28,6 +28,7 @@ use std::{ use async_trait::async_trait; use bytes::Bytes; +use chrono::Utc; use datafusion::{ datasource::listing::ListingTableUrl, execution::{ @@ -49,7 +50,10 @@ use tracing::{error, info}; use url::Url; use crate::{ - metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, + metrics::storage::{ + STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE, STORAGE_REQUEST_RESPONSE_TIME, + StorageMetrics, + }, parseable::LogStream, }; @@ -218,6 +222,9 @@ impl BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "GET", &Utc::now().date_naive().to_string()]) + .inc(); match resp { Ok(resp) => { let body: Bytes = resp.bytes().await.unwrap(); @@ -247,6 +254,9 @@ impl BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -309,9 +319,15 @@ impl BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) .inc_by(files_scanned.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned.load(Ordering::Relaxed) as f64); STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "DELETE"]) .inc_by(files_deleted.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "DELETE", &Utc::now().date_naive().to_string()]) + .inc_by(files_deleted.load(Ordering::Relaxed) as f64); Ok(()) } @@ -344,6 +360,9 @@ impl BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) .inc_by(common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(common_prefixes.len() as f64); // 
return prefixes at the root level let dates: Vec<_> = common_prefixes @@ -364,6 +383,9 @@ impl BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(result) => { STORAGE_REQUEST_RESPONSE_TIME @@ -507,49 +529,6 @@ impl BlobStore { } Ok(()) } - - // TODO: introduce parallel, multipart-uploads if required - // async fn _upload_multipart(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - // let mut buf = vec![0u8; MULTIPART_UPLOAD_SIZE / 2]; - // let mut file = OpenOptions::new().read(true).open(path).await?; - - // // let (multipart_id, mut async_writer) = self.client.put_multipart(&key.into()).await?; - // let mut async_writer = self.client.put_multipart(&key.into()).await?; - - // /* `abort_multipart()` has been removed */ - // // let close_multipart = |err| async move { - // // error!("multipart upload failed. {:?}", err); - // // self.client - // // .abort_multipart(&key.into(), &multipart_id) - // // .await - // // }; - - // loop { - // match file.read(&mut buf).await { - // Ok(len) => { - // if len == 0 { - // break; - // } - // if let Err(err) = async_writer.write_all(&buf[0..len]).await { - // // close_multipart(err).await?; - // break; - // } - // if let Err(err) = async_writer.flush().await { - // // close_multipart(err).await?; - // break; - // } - // } - // Err(err) => { - // // close_multipart(err).await?; - // break; - // } - // } - // } - - // async_writer.shutdown().await?; - - // Ok(()) - // } } #[async_trait] @@ -587,6 +566,13 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&[ + "azure_blob", + "HEAD", + &Utc::now().date_naive().to_string(), + ]) + .inc(); } Err(err) => { let status_code = error_to_status_code(err); @@ -649,6 +635,9 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "GET", &Utc::now().date_naive().to_string()]) + .inc(); res.push(byts); } @@ -661,6 +650,9 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(res) } @@ -698,6 +690,9 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -734,6 +729,13 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&[ + "azure_blob", + "DELETE", + &Utc::now().date_naive().to_string(), + ]) + .inc(); } Err(err) => { let status_code = error_to_status_code(err); @@ -770,6 +772,9 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); Ok(result.map(|_| ())?) 
} @@ -789,6 +794,9 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "DELETE", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -824,7 +832,12 @@ impl ObjectStorage for BlobStore { .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs - + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(common_prefixes.len() as f64); // return prefixes at the root level let dirs: HashSet<_> = common_prefixes .iter() @@ -860,6 +873,12 @@ impl ObjectStorage for BlobStore { }; stream_json_check.push(task); } + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "HEAD"]) + .inc_by(dirs.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "HEAD", &Utc::now().date_naive().to_string()]) + .inc_by(dirs.len() as f64); stream_json_check.try_collect::<()>().await?; @@ -884,6 +903,12 @@ impl ObjectStorage for BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); let hours: Vec = resp .common_prefixes @@ -918,6 +943,12 @@ impl ObjectStorage for BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); let minutes: Vec = resp .common_prefixes @@ -987,6 +1018,16 @@ impl ObjectStorage for BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&[ + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ]) + .inc_by(resp.common_prefixes.len() as f64); resp } Err(err) => { @@ -1021,6 +1062,16 @@ impl ObjectStorage for BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&[ + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ]) + .inc_by(resp.common_prefixes.len() as f64); resp } Err(err) => { diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index 3392bfbdb..f3c8c85f2 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -27,11 +27,15 @@ use std::{ }; use crate::{ - metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, + metrics::storage::{ + STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE, STORAGE_REQUEST_RESPONSE_TIME, + StorageMetrics, + }, parseable::LogStream, }; use async_trait::async_trait; use bytes::Bytes; 
+use chrono::Utc; use datafusion::{ datasource::listing::ListingTableUrl, execution::{ @@ -183,6 +187,9 @@ impl Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "GET", &Utc::now().date_naive().to_string()]) + .inc(); match resp { Ok(resp) => { let body: Bytes = resp.bytes().await.unwrap(); @@ -212,6 +219,9 @@ impl Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -274,9 +284,15 @@ impl Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) .inc_by(files_scanned.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned.load(Ordering::Relaxed) as f64); STORAGE_FILES_SCANNED .with_label_values(&["gcs", "DELETE"]) .inc_by(files_deleted.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "DELETE", &Utc::now().date_naive().to_string()]) + .inc_by(files_deleted.load(Ordering::Relaxed) as f64); Ok(()) } @@ -309,6 +325,9 @@ impl Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) .inc_by(common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(common_prefixes.len() as f64); // return prefixes at the root level let dates: Vec<_> = common_prefixes @@ -329,6 +348,9 @@ impl Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(result) => { STORAGE_REQUEST_RESPONSE_TIME @@ -486,6 +508,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); let meta = match meta { Ok(meta) => { STORAGE_REQUEST_RESPONSE_TIME @@ -522,15 +547,14 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "HEAD", "200"]) .observe(head_elapsed); - // Record single file accessed - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "HEAD"]) - .inc(); } Err(err) => { let status_code = error_to_status_code(err); @@ -593,6 +617,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "GET", &Utc::now().date_naive().to_string()]) + .inc(); res.push(byts); } @@ -605,6 +632,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(res) } @@ -642,7 +672,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) .inc_by(files_scanned as f64); - + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -678,6 +710,9 @@ impl 
ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "DELETE", &Utc::now().date_naive().to_string()]) + .inc(); } Err(err) => { let status_code = error_to_status_code(err); @@ -714,6 +749,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); Ok(result.map(|_| ())?) } @@ -733,6 +771,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "DELETE", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -768,7 +809,12 @@ impl ObjectStorage for Gcs { .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(common_prefixes.len() as f64); // return prefixes at the root level let dirs: HashSet<_> = common_prefixes .iter() @@ -804,6 +850,12 @@ impl ObjectStorage for Gcs { }; stream_json_check.push(task); } + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "HEAD"]) + .inc_by(dirs.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "HEAD", &Utc::now().date_naive().to_string()]) + .inc_by(dirs.len() as f64); stream_json_check.try_collect::<()>().await?; @@ -828,7 +880,12 @@ impl ObjectStorage for Gcs { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); let hours: Vec = resp .common_prefixes .iter() @@ -862,7 +919,12 @@ impl ObjectStorage for Gcs { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); let minutes: Vec = resp .common_prefixes .iter() @@ -918,6 +980,12 @@ impl ObjectStorage for Gcs { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); resp } Err(err) => { @@ -952,6 +1020,12 @@ impl ObjectStorage for Gcs { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); resp } Err(err) => { diff --git a/src/storage/localfs.rs b/src/storage/localfs.rs index 5eeb90930..28b1aeb74 100644 --- 
a/src/storage/localfs.rs +++ b/src/storage/localfs.rs @@ -25,6 +25,7 @@ use std::{ use async_trait::async_trait; use bytes::Bytes; +use chrono::Utc; use datafusion::{datasource::listing::ListingTableUrl, execution::runtime_env::RuntimeEnvBuilder}; use fs_extra::file::CopyOptions; use futures::{TryStreamExt, stream::FuturesUnordered}; @@ -38,7 +39,10 @@ use tokio_stream::wrappers::ReadDirStream; use crate::{ handlers::http::users::USERS_ROOT_DIR, - metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, + metrics::storage::{ + STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE, STORAGE_REQUEST_RESPONSE_TIME, + StorageMetrics, + }, option::validation, parseable::LogStream, storage::SETTINGS_ROOT_DIRECTORY, @@ -134,7 +138,9 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "HEAD"]) .inc(); - + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["localfs", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); Err(ObjectStorageError::UnhandledError(Box::new( std::io::Error::new( std::io::ErrorKind::Unsupported, @@ -179,22 +185,24 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["localfs", "GET", &Utc::now().date_naive().to_string()]) + .inc(); Ok(x.into()) } - Err(e) => match e.kind() { - std::io::ErrorKind::NotFound => { + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["localfs", "GET", "404"]) .observe(get_elapsed); Err(ObjectStorageError::NoSuchKey(path.to_string())) - } - _ => { + } else { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["localfs", "GET", "500"]) .observe(get_elapsed); Err(ObjectStorageError::UnhandledError(Box::new(e))) } - }, + } }; res @@ -248,7 +256,9 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "LIST"]) .inc_by(files_scanned as f64); - + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["localfs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -327,6 +337,9 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "GET"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["localfs", "GET", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); // maybe change the return code @@ -356,6 +369,9 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["localfs", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); } Err(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -412,6 +428,9 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["localfs", "DELETE", &Utc::now().date_naive().to_string()]) + .inc(); } Err(err) => { let status_code = match err.kind() { diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index ea2c9d50c..5a3f1b1eb 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -185,7 +185,7 @@ fn update_storage_metrics( .add(compressed_size as i64); TOTAL_EVENTS_STORAGE_SIZE_DATE .with_label_values(&["parquet", file_date_part]) - .inc_by(compressed_size); + .add(compressed_size as i64); Ok(()) } diff --git a/src/storage/s3.rs b/src/storage/s3.rs index a98e63124..fc4ab5638 100644 --- 
a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -30,6 +30,7 @@ use std::{ use async_trait::async_trait; use bytes::Bytes; +use chrono::Utc; use datafusion::{ datasource::listing::ListingTableUrl, execution::{ @@ -50,7 +51,10 @@ use tokio::{fs::OpenOptions, io::AsyncReadExt}; use tracing::{error, info}; use crate::{ - metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, + metrics::storage::{ + STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE, STORAGE_REQUEST_RESPONSE_TIME, + StorageMetrics, + }, parseable::LogStream, }; @@ -345,6 +349,9 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "GET", &Utc::now().date_naive().to_string()]) + .inc(); match resp { Ok(resp) => { let body = resp.bytes().await?; @@ -374,6 +381,9 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -436,9 +446,15 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(files_scanned.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned.load(Ordering::Relaxed) as f64); STORAGE_FILES_SCANNED .with_label_values(&["s3", "DELETE"]) .inc_by(files_deleted.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "DELETE", &Utc::now().date_naive().to_string()]) + .inc_by(files_deleted.load(Ordering::Relaxed) as f64); Ok(()) } @@ -471,6 +487,9 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(common_prefixes.len() as f64); // return prefixes at the root level let dates: Vec<_> = common_prefixes @@ -491,6 +510,9 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(result) => { STORAGE_REQUEST_RESPONSE_TIME @@ -551,6 +573,9 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -653,6 +678,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); let meta = match meta { Ok(meta) => { STORAGE_REQUEST_RESPONSE_TIME @@ -689,15 +717,14 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "HEAD", "200"]) .observe(head_elapsed); - // Record single file accessed - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "HEAD"]) - .inc(); } Err(err) => { let status_code = error_to_status_code(err); @@ -760,6 +787,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + 
.with_label_values(&["s3", "GET", &Utc::now().date_naive().to_string()]) + .inc(); res.push(byts); } @@ -772,6 +802,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(res) } @@ -809,6 +842,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -845,6 +881,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "DELETE", &Utc::now().date_naive().to_string()]) + .inc(); } Err(err) => { let status_code = error_to_status_code(err); @@ -881,6 +920,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); Ok(result.map(|_| ())?) } @@ -900,6 +942,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "DELETE", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -935,7 +980,12 @@ impl ObjectStorage for S3 { .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(common_prefixes.len() as f64); // return prefixes at the root level let dirs: HashSet<_> = common_prefixes .iter() @@ -971,6 +1021,12 @@ impl ObjectStorage for S3 { }; stream_json_check.push(task); } + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "HEAD"]) + .inc_by(dirs.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "HEAD", &Utc::now().date_naive().to_string()]) + .inc_by(dirs.len() as f64); stream_json_check.try_collect::<()>().await?; @@ -995,6 +1051,12 @@ impl ObjectStorage for S3 { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); let hours: Vec = resp .common_prefixes @@ -1029,6 +1091,12 @@ impl ObjectStorage for S3 { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); let minutes: Vec = resp .common_prefixes @@ -1094,6 +1162,12 @@ impl ObjectStorage for S3 { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", 
&Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); resp } Err(err) => { @@ -1128,6 +1202,12 @@ impl ObjectStorage for S3 { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); resp } Err(err) => { From 126c798e6cdc98536a10a83ee8600c15e7c1bd63 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Tue, 9 Sep 2025 01:40:17 -0700 Subject: [PATCH 04/14] add metrics to collect date level stats of ingestion, query and other object store calls --- src/event/mod.rs | 6 + src/handlers/http/modal/ingest_server.rs | 2 - src/handlers/http/modal/query_server.rs | 1 - src/handlers/http/modal/server.rs | 2 - src/handlers/http/query.rs | 10 +- src/metrics/mod.rs | 212 ++++++++- src/metrics/storage.rs | 153 ------ src/query/stream_schema_provider.rs | 33 +- src/storage/azure_blob.rs | 577 +++++++++++++++++------ src/storage/gcs.rs | 338 +++++++------ src/storage/localfs.rs | 139 ++++-- src/storage/metrics_layer.rs | 6 +- src/storage/object_storage.rs | 11 +- src/storage/s3.rs | 340 +++++++------ 14 files changed, 1153 insertions(+), 677 deletions(-) delete mode 100644 src/metrics/storage.rs diff --git a/src/event/mod.rs b/src/event/mod.rs index 4da88de1a..d5a0ef25d 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -28,6 +28,7 @@ use self::error::EventError; use crate::{ LOCK_EXPECT, metadata::update_stats, + metrics::{increment_events_ingested_by_date, increment_events_ingested_size_by_date}, parseable::{PARSEABLE, StagingError}, storage::StreamType, }; @@ -88,6 +89,11 @@ impl Event { self.parsed_timestamp.date(), ); + // Track billing metrics for event ingestion + let date_string = self.parsed_timestamp.date().to_string(); + increment_events_ingested_by_date(self.rb.num_rows() as u64, &date_string); + increment_events_ingested_size_by_date(self.origin_size, &date_string); + crate::livetail::LIVETAIL.process(&self.stream_name, &self.rb); Ok(()) diff --git a/src/handlers/http/modal/ingest_server.rs b/src/handlers/http/modal/ingest_server.rs index 96553b06c..628bd9f0f 100644 --- a/src/handlers/http/modal/ingest_server.rs +++ b/src/handlers/http/modal/ingest_server.rs @@ -116,8 +116,6 @@ impl ParseableServer for IngestServer { }) .await; - PARSEABLE.storage.register_store_metrics(prometheus); - migration::run_migration(&PARSEABLE).await?; // local sync on init diff --git a/src/handlers/http/modal/query_server.rs b/src/handlers/http/modal/query_server.rs index 5e6fa0860..c551884d4 100644 --- a/src/handlers/http/modal/query_server.rs +++ b/src/handlers/http/modal/query_server.rs @@ -106,7 +106,6 @@ impl ParseableServer for QueryServer { prometheus: &PrometheusMetrics, shutdown_rx: oneshot::Receiver<()>, ) -> anyhow::Result<()> { - PARSEABLE.storage.register_store_metrics(prometheus); // write the ingestor metadata to storage QUERIER_META .get_or_init(|| async { diff --git a/src/handlers/http/modal/server.rs b/src/handlers/http/modal/server.rs index 7d1d46ee9..a522697aa 100644 --- a/src/handlers/http/modal/server.rs +++ b/src/handlers/http/modal/server.rs @@ -128,8 +128,6 @@ impl ParseableServer for Server { prometheus: &PrometheusMetrics, shutdown_rx: oneshot::Receiver<()>, ) -> anyhow::Result<()> { - PARSEABLE.storage.register_store_metrics(prometheus); - 
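The billing counters this patch introduces are thin wrappers around date-labelled prometheus IntCounterVec instances, exposed through free helper functions so each call site stays a single line; on the ingest path the label is the event's parsed date rather than the wall-clock date. A compact sketch of that pattern, assuming the prometheus and chrono crates: the metric options and helper mirror the definitions added to src/metrics/mod.rs later in this patch (minus the `.namespace(METRICS_NAMESPACE)` call), and `on_event_processed` is a hypothetical stand-in for the real call site in Event::process.

```rust
use once_cell::sync::Lazy;
use prometheus::{IntCounterVec, Opts};

// One time series per calendar day; the single `date` label keeps cardinality
// bounded to the number of days the deployment has been ingesting.
pub static TOTAL_EVENTS_INGESTED_BY_DATE: Lazy<IntCounterVec> = Lazy::new(|| {
    IntCounterVec::new(
        Opts::new(
            "total_events_ingested_by_date",
            "Total events ingested by date (Counter for billing)",
        ),
        &["date"],
    )
    .expect("metric can be created")
});

pub fn increment_events_ingested_by_date(count: u64, date: &str) {
    TOTAL_EVENTS_INGESTED_BY_DATE
        .with_label_values(&[date])
        .inc_by(count);
}

// Hypothetical stand-in for the ingest-path call site: the label comes from
// the event's parsed timestamp, not from Utc::now().
fn on_event_processed(num_rows: u64, parsed_date: chrono::NaiveDate) {
    let date_string = parsed_date.to_string();
    increment_events_ingested_by_date(num_rows, &date_string);
}

fn main() {
    on_event_processed(128, chrono::Utc::now().date_naive());
}
```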
migration::run_migration(&PARSEABLE).await?; // load on init diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index 014bed163..63d908a8e 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -46,7 +46,7 @@ use tokio::task::JoinSet; use tracing::{error, warn}; use crate::event::{DEFAULT_TIMESTAMP_KEY, commit_schema}; -use crate::metrics::QUERY_EXECUTE_TIME; +use crate::metrics::{QUERY_EXECUTE_TIME, increment_query_calls_by_date}; use crate::parseable::{PARSEABLE, StreamNotFound}; use crate::query::error::ExecuteError; use crate::query::{CountsRequest, Query as LogicalQuery, execute}; @@ -123,6 +123,10 @@ pub async fn query(req: HttpRequest, query_request: Query) -> Result` // we use the `get_bin_density` method to get the count of records in the dataset // instead of executing the query using datafusion @@ -341,6 +345,10 @@ pub async fn get_counts( req: HttpRequest, counts_request: Json, ) -> Result { + // Track billing metrics for query calls + let current_date = chrono::Utc::now().date_naive().to_string(); + increment_query_calls_by_date(¤t_date); + let creds = extract_session_key_from_req(&req)?; let permissions = Users.get_permissions(&creds); diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index 27970c22d..ee25e4a5f 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -17,8 +17,6 @@ */ pub mod prom_utils; -pub mod storage; - use crate::{handlers::http::metrics_path, stats::FullStats}; use actix_web::Responder; use actix_web_prometheus::{PrometheusMetrics, PrometheusMetricsBuilder}; @@ -228,6 +226,125 @@ pub static ALERTS_STATES: Lazy = Lazy::new(|| { .expect("metric can be created") }); +// Billing Metrics - Counter type metrics for billing/usage tracking +pub static TOTAL_EVENTS_INGESTED_BY_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_events_ingested_by_date", + "Total events ingested by date (Counter for billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_EVENTS_INGESTED_SIZE_BY_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_events_ingested_size_by_date", + "Total events ingested size in bytes by date (Counter for billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_PARQUETS_STORED_BY_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_parquets_stored_by_date", + "Total parquet files stored by date (Counter for billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_PARQUETS_STORED_SIZE_BY_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_parquets_stored_size_by_date", + "Total parquet files stored size in bytes by date (Counter for billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_QUERY_CALLS_BY_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_query_calls_by_date", + "Total query calls by date (Counter for billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_FILES_SCANNED_IN_QUERY_BY_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_files_scanned_in_query_by_date", + "Total files scanned in queries by date (Counter for billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static 
TOTAL_BYTES_SCANNED_IN_QUERY_BY_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_bytes_scanned_in_query_by_date", + "Total bytes scanned in queries by date (Counter for billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_OBJECT_STORE_CALLS_BY_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_object_store_calls_by_date", + "Total object store calls by date (Counter for billing)", + ) + .namespace(METRICS_NAMESPACE), + &["provider", "method", "date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE: Lazy = + Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_files_scanned_in_object_store_calls_by_date", + "Total files scanned in object store calls by date (Counter for billing)", + ) + .namespace(METRICS_NAMESPACE), + &["provider", "method", "date"], + ) + .expect("metric can be created") + }); + +pub static STORAGE_REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { + HistogramVec::new( + HistogramOpts::new("storage_request_response_time", "Storage Request Latency") + .namespace(METRICS_NAMESPACE), + &["provider", "method", "status"], + ) + .expect("metric can be created") +}); + fn custom_metrics(registry: &Registry) { registry .register(Box::new(EVENTS_INGESTED.clone())) @@ -286,6 +403,39 @@ fn custom_metrics(registry: &Registry) { registry .register(Box::new(ALERTS_STATES.clone())) .expect("metric can be registered"); + // Register billing metrics + registry + .register(Box::new(TOTAL_EVENTS_INGESTED_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_EVENTS_INGESTED_SIZE_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_PARQUETS_STORED_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_PARQUETS_STORED_SIZE_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_QUERY_CALLS_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_FILES_SCANNED_IN_QUERY_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_BYTES_SCANNED_IN_QUERY_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_OBJECT_STORE_CALLS_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new( + TOTAL_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE.clone(), + )) + .expect("metric can be registered"); + registry + .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) + .expect("metric can be registered"); } pub fn build_metrics_handler() -> PrometheusMetrics { @@ -345,6 +495,64 @@ pub async fn fetch_stats_from_storage(stream_name: &str, stats: FullStats) { .set(stats.lifetime_stats.storage as i64); } +// Helper functions for tracking billing metrics +pub fn increment_events_ingested_by_date(count: u64, date: &str) { + TOTAL_EVENTS_INGESTED_BY_DATE + .with_label_values(&[date]) + .inc_by(count); +} + +pub fn increment_events_ingested_size_by_date(size: u64, date: &str) { + TOTAL_EVENTS_INGESTED_SIZE_BY_DATE + .with_label_values(&[date]) + .inc_by(size); +} + +pub fn increment_parquets_stored_by_date(date: &str) { + TOTAL_PARQUETS_STORED_BY_DATE + .with_label_values(&[date]) + .inc(); +} + +pub fn increment_parquets_stored_size_by_date(size: u64, date: &str) { + TOTAL_PARQUETS_STORED_SIZE_BY_DATE + .with_label_values(&[date]) + 
.inc_by(size); +} + +pub fn increment_query_calls_by_date(date: &str) { + TOTAL_QUERY_CALLS_BY_DATE.with_label_values(&[date]).inc(); +} + +pub fn increment_files_scanned_in_query_by_date(count: u64, date: &str) { + TOTAL_FILES_SCANNED_IN_QUERY_BY_DATE + .with_label_values(&[date]) + .inc_by(count); +} + +pub fn increment_bytes_scanned_in_query_by_date(bytes: u64, date: &str) { + TOTAL_BYTES_SCANNED_IN_QUERY_BY_DATE + .with_label_values(&[date]) + .inc_by(bytes); +} + +pub fn increment_object_store_calls_by_date(provider: &str, method: &str, date: &str) { + TOTAL_OBJECT_STORE_CALLS_BY_DATE + .with_label_values(&[provider, method, date]) + .inc(); +} + +pub fn increment_files_scanned_in_object_store_calls_by_date( + provider: &str, + method: &str, + count: u64, + date: &str, +) { + TOTAL_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE + .with_label_values(&[provider, method, date]) + .inc_by(count); +} + use actix_web::HttpResponse; pub async fn get() -> Result { diff --git a/src/metrics/storage.rs b/src/metrics/storage.rs deleted file mode 100644 index 3386c451a..000000000 --- a/src/metrics/storage.rs +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Parseable Server (C) 2022 - 2024 Parseable, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
- * - */ - -use crate::metrics::METRICS_NAMESPACE; -use actix_web_prometheus::PrometheusMetrics; -use once_cell::sync::Lazy; -use prometheus::{CounterVec, HistogramOpts, HistogramVec, Opts}; - -// Global storage metric used by all storage providers -pub static STORAGE_REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { - HistogramVec::new( - HistogramOpts::new("storage_request_response_time", "Storage Request Latency") - .namespace(METRICS_NAMESPACE), - &["provider", "method", "status"], - ) - .expect("metric can be created") -}); - -// Global storage metric for tracking number of files scanned -pub static STORAGE_FILES_SCANNED: Lazy = Lazy::new(|| { - CounterVec::new( - Opts::new( - "storage_files_scanned_total", - "Total number of files scanned in storage operations", - ) - .namespace(METRICS_NAMESPACE), - &["provider", "operation"], - ) - .expect("metric can be created") -}); - -pub static STORAGE_FILES_SCANNED_DATE: Lazy = Lazy::new(|| { - CounterVec::new( - Opts::new( - "storage_files_scanned_date_total", - "Total number of files scanned in storage operations by date", - ) - .namespace(METRICS_NAMESPACE), - &["provider", "operation", "date"], - ) - .expect("metric can be created") -}); - -pub trait StorageMetrics { - fn register_metrics(&self, handler: &PrometheusMetrics); -} - -pub mod localfs { - use crate::{metrics::storage::STORAGE_FILES_SCANNED_DATE, storage::FSConfig}; - - use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}; - - impl StorageMetrics for FSConfig { - fn register_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) { - handler - .registry - .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) - .expect("metric can be registered"); - handler - .registry - .register(Box::new(STORAGE_FILES_SCANNED.clone())) - .expect("metric can be registered"); - handler - .registry - .register(Box::new(STORAGE_FILES_SCANNED_DATE.clone())) - .expect("metric can be registered"); - } - } -} - -pub mod s3 { - use crate::{metrics::storage::STORAGE_FILES_SCANNED_DATE, storage::S3Config}; - - use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}; - - impl StorageMetrics for S3Config { - fn register_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) { - handler - .registry - .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) - .expect("metric can be registered"); - handler - .registry - .register(Box::new(STORAGE_FILES_SCANNED.clone())) - .expect("metric can be registered"); - handler - .registry - .register(Box::new(STORAGE_FILES_SCANNED_DATE.clone())) - .expect("metric can be registered"); - } - } -} - -pub mod azureblob { - use crate::{metrics::storage::STORAGE_FILES_SCANNED_DATE, storage::AzureBlobConfig}; - - use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}; - - impl StorageMetrics for AzureBlobConfig { - fn register_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) { - handler - .registry - .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) - .expect("metric can be registered"); - handler - .registry - .register(Box::new(STORAGE_FILES_SCANNED.clone())) - .expect("metric can be registered"); - handler - .registry - .register(Box::new(STORAGE_FILES_SCANNED_DATE.clone())) - .expect("metric can be registered"); - } - } -} - -pub mod gcs { - use crate::{metrics::storage::STORAGE_FILES_SCANNED_DATE, storage::GcsConfig}; - - use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}; - - impl StorageMetrics for GcsConfig 
{ - fn register_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) { - handler - .registry - .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) - .expect("metric can be registered"); - handler - .registry - .register(Box::new(STORAGE_FILES_SCANNED.clone())) - .expect("metric can be registered"); - handler - .registry - .register(Box::new(STORAGE_FILES_SCANNED_DATE.clone())) - .expect("metric can be registered"); - } - } -} diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index 60a23e9aa..286800018 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -58,8 +58,9 @@ use crate::{ event::DEFAULT_TIMESTAMP_KEY, hottier::HotTierManager, metrics::{ - QUERY_CACHE_HIT, - storage::{STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE}, + QUERY_CACHE_HIT, increment_bytes_scanned_in_query_by_date, + increment_files_scanned_in_object_store_calls_by_date, + increment_files_scanned_in_query_by_date, }, option::Mode, parseable::{PARSEABLE, STREAM_EXISTS}, @@ -328,6 +329,8 @@ impl StandardTableProvider { let mut partitioned_files = Vec::from_iter((0..target_partition).map(|_| Vec::new())); let mut column_statistics = HashMap::>::new(); let mut count = 0; + let mut total_file_size = 0u64; + let mut file_count = 0u64; for (index, file) in manifest_files .into_iter() .enumerate() @@ -338,9 +341,14 @@ impl StandardTableProvider { mut file_path, num_rows, columns, + file_size, .. } = file; + // Track billing metrics for files scanned in query + file_count += 1; + total_file_size += file_size; + // object_store::path::Path doesn't automatically deal with Windows path separators // to do that, we are using from_absolute_path() which takes into consideration the underlying filesystem // before sending the file path to PartitionedFile @@ -397,6 +405,11 @@ impl StandardTableProvider { column_statistics: statistics, }; + // Track billing metrics for query scan + let current_date = chrono::Utc::now().date_naive().to_string(); + increment_files_scanned_in_query_by_date(file_count, ¤t_date); + increment_bytes_scanned_in_query_by_date(total_file_size, ¤t_date); + (partitioned_files, statistics) } } @@ -586,16 +599,12 @@ impl TableProvider for StandardTableProvider { } let parquet_files_to_scan = manifest_files.len(); - STORAGE_FILES_SCANNED - .with_label_values(&[PARSEABLE.storage().name(), "GET"]) - .inc_by(parquet_files_to_scan as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&[ - PARSEABLE.storage().name(), - "GET", - &Utc::now().date_naive().to_string(), - ]) - .inc_by(parquet_files_to_scan as f64); + increment_files_scanned_in_object_store_calls_by_date( + PARSEABLE.storage().name(), + "GET", + parquet_files_to_scan as u64, + &Utc::now().date_naive().to_string(), + ); let (partitioned_files, statistics) = self.partitioned_files(manifest_files); self.create_parquet_physical_plan( diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index e8a73440a..4e3d56dcb 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -17,7 +17,7 @@ */ use std::{ - collections::HashSet, + collections::{BTreeMap, HashSet}, path::Path, sync::{ Arc, @@ -46,15 +46,17 @@ use object_store::{ }; use relative_path::{RelativePath, RelativePathBuf}; use tokio::{fs::OpenOptions, io::AsyncReadExt}; -use tracing::{error, info}; +use tracing::error; use url::Url; use crate::{ - metrics::storage::{ - STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE, STORAGE_REQUEST_RESPONSE_TIME, - StorageMetrics, + 
handlers::http::users::USERS_ROOT_DIR, + metrics::{ + STORAGE_REQUEST_RESPONSE_TIME, increment_files_scanned_in_object_store_calls_by_date, + increment_object_store_calls_by_date, }, parseable::LogStream, + storage::STREAM_ROOT_DIRECTORY, }; use super::{ @@ -198,10 +200,6 @@ impl ObjectStorageProvider for AzureBlobConfig { fn get_endpoint(&self) -> String { self.endpoint_url.clone() } - - fn register_store_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) { - self.register_metrics(handler) - } } // ObjStoreClient is generic client to enable interactions with different cloudprovider's @@ -219,18 +217,25 @@ impl BlobStore { let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; let elapsed = time.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "GET"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "GET", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date( + "azure_blob", + "GET", + &Utc::now().date_naive().to_string(), + ); + match resp { Ok(resp) => { let body: Bytes = resp.bytes().await.unwrap(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "GET", "200"]) .observe(elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "GET", + 1, + &Utc::now().date_naive().to_string(), + ); Ok(body) } Err(err) => { @@ -251,17 +256,23 @@ impl BlobStore { let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; let elapsed = time.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "PUT"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "PUT", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date( + "azure_blob", + "PUT", + &Utc::now().date_naive().to_string(), + ); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "PUT", "200"]) .observe(elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "PUT", + 1, + &Utc::now().date_naive().to_string(), + ); Ok(()) } Err(err) => { @@ -284,16 +295,27 @@ impl BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); object_stream .for_each_concurrent(None, |x| async { files_scanned.fetch_add(1, Ordering::Relaxed); + match x { Ok(obj) => { files_deleted.fetch_add(1, Ordering::Relaxed); let delete_start = Instant::now(); let delete_resp = self.client.delete(&obj.location).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date( + "azure_blob", + "DELETE", + &Utc::now().date_naive().to_string(), + ); match delete_resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -316,21 +338,98 @@ impl BlobStore { }) .await; - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "LIST"]) - .inc_by(files_scanned.load(Ordering::Relaxed) as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(files_scanned.load(Ordering::Relaxed) as f64); - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "DELETE"]) - .inc_by(files_deleted.load(Ordering::Relaxed) as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "DELETE", 
&Utc::now().date_naive().to_string()]) - .inc_by(files_deleted.load(Ordering::Relaxed) as f64); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "LIST", + files_scanned.load(Ordering::Relaxed), + &Utc::now().date_naive().to_string(), + ); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "DELETE", + files_deleted.load(Ordering::Relaxed), + &Utc::now().date_naive().to_string(), + ); + // Note: Individual DELETE calls are tracked inside the concurrent loop Ok(()) } + async fn _list_streams(&self) -> Result, ObjectStorageError> { + let mut result_file_list = HashSet::new(); + let mut total_files_scanned = 0u64; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(None).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + total_files_scanned += resp.objects.len() as u64; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); + + let streams = resp + .common_prefixes + .iter() + .flat_map(|path| path.parts()) + .map(|name| name.as_ref().to_string()) + .filter(|name| name != PARSEABLE_ROOT_DIRECTORY && name != USERS_ROOT_DIR) + .collect::>(); + + for stream in streams { + let stream_path = + object_store::path::Path::from(format!("{}/{}", &stream, STREAM_ROOT_DIRECTORY)); + + // Track individual LIST operations for each stream + let stream_list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&stream_path)).await; + let stream_list_elapsed = stream_list_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); + match &resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(stream_list_elapsed); + + total_files_scanned += resp.objects.len() as u64; + if resp + .objects + .iter() + .any(|name| name.location.filename().unwrap().ends_with("stream.json")) + { + result_file_list.insert(stream); + } + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(stream_list_elapsed); + + return Err(ObjectStorageError::UnhandledError(Box::new( + std::io::Error::other(format!("List operation failed: {}", err)), + ))); + } + } + } + + // Record total files scanned across all operations + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "LIST", + total_files_scanned, + &Utc::now().date_naive().to_string(), + ); + Ok(result_file_list) + } + async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { let list_start = Instant::now(); let resp: Result = self @@ -338,6 +437,11 @@ impl BlobStore { .list_with_delimiter(Some(&(stream.into()))) .await; let list_elapsed = list_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); let resp = match resp { Ok(resp) => { @@ -357,12 +461,12 @@ impl BlobStore { let common_prefixes = resp.common_prefixes; - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "LIST"]) - .inc_by(common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(common_prefixes.len() as f64); + 
increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "LIST", + common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); // return prefixes at the root level let dates: Vec<_> = common_prefixes @@ -374,24 +478,121 @@ impl BlobStore { Ok(dates) } + async fn _list_manifest_files( + &self, + stream: &str, + ) -> Result>, ObjectStorageError> { + let mut result_file_list: BTreeMap> = BTreeMap::new(); + let mut total_files_scanned = 0u64; + + // Track initial LIST operation + let list_start = Instant::now(); + let resp = self + .client + .list_with_delimiter(Some(&(stream.into()))) + .await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); + let resp = match resp { + Ok(resp) => { + total_files_scanned += resp.objects.len() as u64; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + let dates = resp + .common_prefixes + .iter() + .flat_map(|path| path.parts()) + .filter(|name| name.as_ref() != stream && name.as_ref() != STREAM_ROOT_DIRECTORY) + .map(|name| name.as_ref().to_string()) + .collect::>(); + + for date in dates { + let date_path = object_store::path::Path::from(format!("{}/{}", stream, &date)); + + // Track individual LIST operation for each date + let date_list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&date_path)).await; + let date_list_elapsed = date_list_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); + match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(date_list_elapsed); + + total_files_scanned += resp.objects.len() as u64; + let manifests: Vec = resp + .objects + .iter() + .filter(|name| name.location.filename().unwrap().ends_with("manifest.json")) + .map(|name| name.location.to_string()) + .collect(); + result_file_list.entry(date).or_default().extend(manifests); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(date_list_elapsed); + return Err(err.into()); + } + } + } + + // Record total files scanned across all date operations + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "LIST", + total_files_scanned, + &Utc::now().date_naive().to_string(), + ); + Ok(result_file_list) + } + async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { let bytes = tokio::fs::read(path).await?; let put_start = Instant::now(); let result = self.client.put(&key.into(), bytes.into()).await; let put_elapsed = put_start.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "PUT"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "PUT", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date( + "azure_blob", + "PUT", + &Utc::now().date_naive().to_string(), + ); match result { - Ok(result) => { + Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "PUT", "200"]) 
.observe(put_elapsed); - info!("Uploaded file to Azure Blob Storage: {:?}", result); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "PUT", + 1, + &Utc::now().date_naive().to_string(), + ); Ok(()) } Err(err) => { @@ -416,18 +617,22 @@ impl BlobStore { let multipart_start = Instant::now(); let async_writer = self.client.put_multipart(location).await; let multipart_elapsed = multipart_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "azure_blob", + "PUT_MULTIPART", + &Utc::now().date_naive().to_string(), + ); let mut async_writer = match async_writer { Ok(writer) => { STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT_MULTIPART_INIT", "200"]) + .with_label_values(&["azure_blob", "PUT_MULTIPART", "200"]) .observe(multipart_elapsed); writer } Err(err) => { let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT_MULTIPART_INIT", status_code]) + .with_label_values(&["azure_blob", "PUT_MULTIPART", status_code]) .observe(multipart_elapsed); return Err(err.into()); } @@ -443,12 +648,23 @@ impl BlobStore { let put_start = Instant::now(); let result = self.client.put(location, data.into()).await; let put_elapsed = put_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date( + "azure_blob", + "PUT", + &Utc::now().date_naive().to_string(), + ); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "PUT", "200"]) .observe(put_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "PUT", + 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(&err); @@ -489,17 +705,21 @@ impl BlobStore { let part_start = Instant::now(); let result = async_writer.put_part(part_data.into()).await; let part_elapsed = part_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "azure_blob", + "PUT_MULTIPART", + &Utc::now().date_naive().to_string(), + ); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT_MULTIPART_PART", "200"]) + .with_label_values(&["azure_blob", "PUT_MULTIPART", "200"]) .observe(part_elapsed); } Err(err) => { let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT_MULTIPART_PART", status_code]) + .with_label_values(&["azure_blob", "PUT_MULTIPART", status_code]) .observe(part_elapsed); return Err(err.into()); } @@ -557,22 +777,23 @@ impl ObjectStorage for BlobStore { let head_start = Instant::now(); let result = self.client.head(&to_object_store_path(path)).await; let head_elapsed = head_start.elapsed().as_secs_f64(); + + increment_object_store_calls_by_date( + "azure_blob", + "HEAD", + &Utc::now().date_naive().to_string(), + ); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "HEAD", "200"]) .observe(head_elapsed); - // Record single file accessed - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "HEAD"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&[ - "azure_blob", - "HEAD", - &Utc::now().date_naive().to_string(), - ]) - .inc(); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(err); @@ -632,13 +853,17 @@ impl ObjectStorage for BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "GET", 
"200"]) .observe(list_start.elapsed().as_secs_f64()); - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "GET"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "GET", &Utc::now().date_naive().to_string()]) - .inc(); - + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "GET", + 1, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "azure_blob", + "GET", + &Utc::now().date_naive().to_string(), + ); res.push(byts); } let list_elapsed = list_start.elapsed().as_secs_f64(); @@ -647,13 +872,17 @@ impl ObjectStorage for BlobStore { .observe(list_elapsed); // Record total files scanned - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "LIST"]) - .inc_by(files_scanned as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(files_scanned as f64); - + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "LIST", + files_scanned as u64, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); Ok(res) } @@ -666,6 +895,11 @@ impl ObjectStorage for BlobStore { // Track list operation let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&self.root)); + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); while let Some(meta_result) = object_stream.next().await { let meta = match meta_result { @@ -687,13 +921,12 @@ impl ObjectStorage for BlobStore { .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); // Record total files scanned - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "LIST"]) - .inc_by(files_scanned as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(files_scanned as f64); - + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "LIST", + files_scanned as u64, + &Utc::now().date_naive().to_string(), + ); Ok(path_arr) } @@ -719,23 +952,23 @@ impl ObjectStorage for BlobStore { let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(path)).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "azure_blob", + "DELETE", + &Utc::now().date_naive().to_string(), + ); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "DELETE", "200"]) .observe(delete_elapsed); // Record single file deleted - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "DELETE"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&[ - "azure_blob", - "DELETE", - &Utc::now().date_naive().to_string(), - ]) - .inc(); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "DELETE", + 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(err); @@ -755,12 +988,23 @@ impl ObjectStorage for BlobStore { .head(&to_object_store_path(&parseable_json_path())) .await; let head_elapsed = head_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date( + "azure_blob", + "HEAD", + &Utc::now().date_naive().to_string(), + ); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "HEAD", "200"]) .observe(head_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + 
"azure_blob", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(err); @@ -769,12 +1013,6 @@ impl ObjectStorage for BlobStore { .observe(head_elapsed); } } - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "HEAD"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "HEAD", &Utc::now().date_naive().to_string()]) - .inc(); Ok(result.map(|_| ())?) } @@ -791,17 +1029,23 @@ impl ObjectStorage for BlobStore { let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(&file)).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "DELETE"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "DELETE", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date( + "azure_blob", + "DELETE", + &Utc::now().date_naive().to_string(), + ); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "DELETE", "200"]) .observe(delete_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "DELETE", + 1, + &Utc::now().date_naive().to_string(), + ); Ok(()) } Err(err) => { @@ -832,12 +1076,17 @@ impl ObjectStorage for BlobStore { .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "LIST"]) - .inc_by(common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "LIST", + common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); // return prefixes at the root level let dirs: HashSet<_> = common_prefixes .iter() @@ -854,7 +1103,11 @@ impl ObjectStorage for BlobStore { let head_start = Instant::now(); let result = self.client.head(&StorePath::from(key)).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "azure_blob", + "HEAD", + &Utc::now().date_naive().to_string(), + ); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -873,13 +1126,12 @@ impl ObjectStorage for BlobStore { }; stream_json_check.push(task); } - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "HEAD"]) - .inc_by(dirs.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "HEAD", &Utc::now().date_naive().to_string()]) - .inc_by(dirs.len() as f64); - + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "HEAD", + dirs.len() as u64, + &Utc::now().date_naive().to_string(), + ); stream_json_check.try_collect::<()>().await?; Ok(dirs) @@ -903,12 +1155,17 @@ impl ObjectStorage for BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "LIST"]) - .inc_by(resp.common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(resp.common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "LIST", + resp.common_prefixes.len() as u64, + 
&Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); let hours: Vec = resp .common_prefixes @@ -943,13 +1200,17 @@ impl ObjectStorage for BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "LIST"]) - .inc_by(resp.common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(resp.common_prefixes.len() as f64); - + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "LIST", + resp.common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); let minutes: Vec = resp .common_prefixes .iter() @@ -1012,22 +1273,23 @@ impl ObjectStorage for BlobStore { let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await; let list_elapsed = list_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); let resp = match resp { Ok(resp) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "LIST"]) - .inc_by(resp.common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&[ - "azure_blob", - "LIST", - &Utc::now().date_naive().to_string(), - ]) - .inc_by(resp.common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "LIST", + resp.common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + resp } Err(err) => { @@ -1056,22 +1318,23 @@ impl ObjectStorage for BlobStore { let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&prefix)).await; let list_elapsed = list_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ); let resp = match resp { Ok(resp) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "LIST"]) - .inc_by(resp.common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&[ - "azure_blob", - "LIST", - &Utc::now().date_naive().to_string(), - ]) - .inc_by(resp.common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "LIST", + resp.common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + resp } Err(err) => { diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index f3c8c85f2..135f13267 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -27,9 +27,9 @@ use std::{ }; use crate::{ - metrics::storage::{ - STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE, STORAGE_REQUEST_RESPONSE_TIME, - StorageMetrics, + metrics::{ + STORAGE_REQUEST_RESPONSE_TIME, increment_files_scanned_in_object_store_calls_by_date, + increment_object_store_calls_by_date, }, parseable::LogStream, }; @@ -53,7 +53,7 @@ use object_store::{ }; use relative_path::{RelativePath, RelativePathBuf}; use tokio::{fs::OpenOptions, io::AsyncReadExt}; -use tracing::{error, info}; +use tracing::error; use super::{ CONNECT_TIMEOUT_SECS, 
MIN_MULTIPART_UPLOAD_SIZE, ObjectStorage, ObjectStorageError, @@ -160,10 +160,6 @@ impl ObjectStorageProvider for GcsConfig { format!("{}/{}", self.endpoint_url, self.bucket_name) } - fn register_store_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) { - self.register_metrics(handler); - } - fn get_object_store(&self) -> Arc { static STORE: once_cell::sync::OnceCell> = once_cell::sync::OnceCell::new(); @@ -184,18 +180,21 @@ impl Gcs { let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; let elapsed = time.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "GET"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "GET", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("gcs", "GET", &Utc::now().date_naive().to_string()); + match resp { Ok(resp) => { let body: Bytes = resp.bytes().await.unwrap(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "GET", "200"]) .observe(elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "GET", + 1, + &Utc::now().date_naive().to_string(), + ); Ok(body) } Err(err) => { @@ -216,17 +215,19 @@ impl Gcs { let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; let elapsed = time.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "PUT"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "PUT", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("gcs", "PUT", &Utc::now().date_naive().to_string()); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "PUT", "200"]) .observe(elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "PUT", + 1, + &Utc::now().date_naive().to_string(), + ); Ok(()) } Err(err) => { @@ -249,16 +250,23 @@ impl Gcs { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); + increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); object_stream .for_each_concurrent(None, |x| async { files_scanned.fetch_add(1, Ordering::Relaxed); + match x { Ok(obj) => { files_deleted.fetch_add(1, Ordering::Relaxed); let delete_start = Instant::now(); let delete_resp = self.client.delete(&obj.location).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date( + "gcs", + "DELETE", + &Utc::now().date_naive().to_string(), + ); match delete_resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -281,18 +289,19 @@ impl Gcs { }) .await; - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "LIST"]) - .inc_by(files_scanned.load(Ordering::Relaxed) as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(files_scanned.load(Ordering::Relaxed) as f64); - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "DELETE"]) - .inc_by(files_deleted.load(Ordering::Relaxed) as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "DELETE", &Utc::now().date_naive().to_string()]) - .inc_by(files_deleted.load(Ordering::Relaxed) as f64); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "LIST", + files_scanned.load(Ordering::Relaxed), + &Utc::now().date_naive().to_string(), + ); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "DELETE", + files_deleted.load(Ordering::Relaxed), + 
&Utc::now().date_naive().to_string(), + ); + // Note: Individual DELETE calls are tracked inside the concurrent loop Ok(()) } @@ -303,6 +312,7 @@ impl Gcs { .list_with_delimiter(Some(&(stream.into()))) .await; let list_elapsed = list_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); let resp = match resp { Ok(resp) => { @@ -322,12 +332,12 @@ impl Gcs { let common_prefixes = resp.common_prefixes; - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "LIST"]) - .inc_by(common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "LIST", + common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); // return prefixes at the root level let dates: Vec<_> = common_prefixes @@ -345,18 +355,19 @@ impl Gcs { let put_start = Instant::now(); let result = self.client.put(&key.into(), bytes.into()).await; let put_elapsed = put_start.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "PUT"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "PUT", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("gcs", "PUT", &Utc::now().date_naive().to_string()); match result { - Ok(result) => { + Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "PUT", "200"]) .observe(put_elapsed); - info!("Uploaded file to GCS: {:?}", result); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "PUT", + 1, + &Utc::now().date_naive().to_string(), + ); Ok(()) } Err(err) => { @@ -381,18 +392,22 @@ impl Gcs { let multipart_start = Instant::now(); let async_writer = self.client.put_multipart(location).await; let multipart_elapsed = multipart_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "gcs", + "PUT_MULTIPART", + &Utc::now().date_naive().to_string(), + ); let mut async_writer = match async_writer { Ok(writer) => { STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT_MULTIPART_INIT", "200"]) + .with_label_values(&["gcs", "PUT_MULTIPART", "200"]) .observe(multipart_elapsed); writer } Err(err) => { let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT_MULTIPART_INIT", status_code]) + .with_label_values(&["gcs", "PUT_MULTIPART", status_code]) .observe(multipart_elapsed); return Err(err.into()); } @@ -408,12 +423,22 @@ impl Gcs { let put_start = Instant::now(); let result = self.client.put(location, data.into()).await; let put_elapsed = put_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "gcs", + "PUT", + &Utc::now().date_naive().to_string(), + ); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "PUT", "200"]) .observe(put_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "PUT", + 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(&err); @@ -450,17 +475,21 @@ impl Gcs { let part_start = Instant::now(); let result = async_writer.put_part(part_data.into()).await; let part_elapsed = part_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "gcs", + "PUT_MULTIPART", + &Utc::now().date_naive().to_string(), + ); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", 
"PUT_MULTIPART_PART", "200"]) + .with_label_values(&["gcs", "PUT_MULTIPART", "200"]) .observe(part_elapsed); } Err(err) => { let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT_MULTIPART_PART", status_code]) + .with_label_values(&["gcs", "PUT_MULTIPART", status_code]) .observe(part_elapsed); return Err(err.into()); } @@ -505,17 +534,19 @@ impl ObjectStorage for Gcs { let head_start = Instant::now(); let meta = self.client.head(path).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "HEAD"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "HEAD", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("gcs", "HEAD", &Utc::now().date_naive().to_string()); let meta = match meta { Ok(meta) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "HEAD", "200"]) .observe(head_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); meta } Err(err) => { @@ -544,17 +575,19 @@ impl ObjectStorage for Gcs { let head_start = Instant::now(); let result = self.client.head(&to_object_store_path(path)).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "HEAD"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "HEAD", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("gcs", "HEAD", &Utc::now().date_naive().to_string()); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "HEAD", "200"]) .observe(head_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(err); @@ -614,13 +647,17 @@ impl ObjectStorage for Gcs { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "GET", "200"]) .observe(list_start.elapsed().as_secs_f64()); - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "GET"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "GET", &Utc::now().date_naive().to_string()]) - .inc(); - + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "GET", + 1, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "gcs", + "GET", + &Utc::now().date_naive().to_string(), + ); res.push(byts); } let list_elapsed = list_start.elapsed().as_secs_f64(); @@ -629,13 +666,13 @@ impl ObjectStorage for Gcs { .observe(list_elapsed); // Record total files scanned - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "LIST"]) - .inc_by(files_scanned as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(files_scanned as f64); - + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "LIST", + files_scanned as u64, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); Ok(res) } @@ -648,6 +685,7 @@ impl ObjectStorage for Gcs { // Track list operation let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&self.root)); + increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); while let Some(meta_result) = object_stream.next().await { let meta = match meta_result { @@ -669,12 +707,12 @@ impl 
ObjectStorage for Gcs { .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); // Record total files scanned - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "LIST"]) - .inc_by(files_scanned as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(files_scanned as f64); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "LIST", + files_scanned as u64, + &Utc::now().date_naive().to_string(), + ); Ok(path_arr) } @@ -700,19 +738,19 @@ impl ObjectStorage for Gcs { let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(path)).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date("gcs", "DELETE", &Utc::now().date_naive().to_string()); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "DELETE", "200"]) .observe(delete_elapsed); // Record single file deleted - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "DELETE"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "DELETE", &Utc::now().date_naive().to_string()]) - .inc(); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "DELETE", + 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(err); @@ -732,12 +770,19 @@ impl ObjectStorage for Gcs { .head(&to_object_store_path(&parseable_json_path())) .await; let head_elapsed = head_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date("gcs", "HEAD", &Utc::now().date_naive().to_string()); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "HEAD", "200"]) .observe(head_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(err); @@ -746,12 +791,6 @@ impl ObjectStorage for Gcs { .observe(head_elapsed); } } - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "HEAD"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "HEAD", &Utc::now().date_naive().to_string()]) - .inc(); Ok(result.map(|_| ())?) 
} @@ -768,17 +807,19 @@ impl ObjectStorage for Gcs { let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(&file)).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "DELETE"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "DELETE", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("gcs", "DELETE", &Utc::now().date_naive().to_string()); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "DELETE", "200"]) .observe(delete_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "DELETE", + 1, + &Utc::now().date_naive().to_string(), + ); Ok(()) } Err(err) => { @@ -809,12 +850,13 @@ impl ObjectStorage for Gcs { .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "LIST"]) - .inc_by(common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "LIST", + common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); // return prefixes at the root level let dirs: HashSet<_> = common_prefixes .iter() @@ -831,7 +873,11 @@ impl ObjectStorage for Gcs { let head_start = Instant::now(); let result = self.client.head(&StorePath::from(key)).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "gcs", + "HEAD", + &Utc::now().date_naive().to_string(), + ); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -850,13 +896,12 @@ impl ObjectStorage for Gcs { }; stream_json_check.push(task); } - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "HEAD"]) - .inc_by(dirs.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "HEAD", &Utc::now().date_naive().to_string()]) - .inc_by(dirs.len() as f64); - + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "HEAD", + dirs.len() as u64, + &Utc::now().date_naive().to_string(), + ); stream_json_check.try_collect::<()>().await?; Ok(dirs) @@ -880,12 +925,14 @@ impl ObjectStorage for Gcs { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "LIST"]) - .inc_by(resp.common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(resp.common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "LIST", + resp.common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); + let hours: Vec = resp .common_prefixes .iter() @@ -919,12 +966,13 @@ impl ObjectStorage for Gcs { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "LIST"]) - .inc_by(resp.common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(resp.common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + 
"gcs", + "LIST", + resp.common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); let minutes: Vec = resp .common_prefixes .iter() @@ -974,18 +1022,19 @@ impl ObjectStorage for Gcs { let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await; let list_elapsed = list_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); let resp = match resp { Ok(resp) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "LIST"]) - .inc_by(resp.common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(resp.common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "LIST", + resp.common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + resp } Err(err) => { @@ -1014,18 +1063,19 @@ impl ObjectStorage for Gcs { let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&prefix)).await; let list_elapsed = list_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); let resp = match resp { Ok(resp) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "LIST"]) - .inc_by(resp.common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(resp.common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "LIST", + resp.common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + resp } Err(err) => { diff --git a/src/storage/localfs.rs b/src/storage/localfs.rs index 28b1aeb74..01611306c 100644 --- a/src/storage/localfs.rs +++ b/src/storage/localfs.rs @@ -39,9 +39,9 @@ use tokio_stream::wrappers::ReadDirStream; use crate::{ handlers::http::users::USERS_ROOT_DIR, - metrics::storage::{ - STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE, STORAGE_REQUEST_RESPONSE_TIME, - StorageMetrics, + metrics::{ + STORAGE_REQUEST_RESPONSE_TIME, increment_files_scanned_in_object_store_calls_by_date, + increment_object_store_calls_by_date, }, option::validation, parseable::LogStream, @@ -88,10 +88,6 @@ impl ObjectStorageProvider for FSConfig { fn get_endpoint(&self) -> String { self.root.to_str().unwrap().to_string() } - - fn register_store_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) { - self.register_metrics(handler); - } } #[derive(Debug)] @@ -135,12 +131,17 @@ impl ObjectStorage for LocalFS { } async fn head(&self, _path: &RelativePath) -> Result { // Record attempt to access file (even though operation not implemented) - STORAGE_FILES_SCANNED - .with_label_values(&["localfs", "HEAD"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["localfs", "HEAD", &Utc::now().date_naive().to_string()]) - .inc(); + increment_files_scanned_in_object_store_calls_by_date( + "localfs", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "localfs", + "HEAD", + &Utc::now().date_naive().to_string(), + ); Err(ObjectStorageError::UnhandledError(Box::new( std::io::Error::new( 
std::io::ErrorKind::Unsupported, @@ -182,12 +183,17 @@ impl ObjectStorage for LocalFS { .with_label_values(&["localfs", "GET", "200"]) .observe(get_elapsed); // Record single file accessed successfully - STORAGE_FILES_SCANNED - .with_label_values(&["localfs", "GET"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["localfs", "GET", &Utc::now().date_naive().to_string()]) - .inc(); + increment_files_scanned_in_object_store_calls_by_date( + "localfs", + "GET", + 1, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "localfs", + "GET", + &Utc::now().date_naive().to_string(), + ); Ok(x.into()) } Err(e) => { @@ -253,12 +259,17 @@ impl ObjectStorage for LocalFS { } // Record total files scanned - STORAGE_FILES_SCANNED - .with_label_values(&["localfs", "LIST"]) - .inc_by(files_scanned as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["localfs", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(files_scanned as f64); + increment_files_scanned_in_object_store_calls_by_date( + "localfs", + "LIST", + files_scanned, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "localfs", + "LIST", + &Utc::now().date_naive().to_string(), + ); Ok(path_arr) } @@ -334,14 +345,17 @@ impl ObjectStorage for LocalFS { } // Record total files scanned - STORAGE_FILES_SCANNED - .with_label_values(&["localfs", "GET"]) - .inc_by(files_scanned as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["localfs", "GET", &Utc::now().date_naive().to_string()]) - .inc_by(files_scanned as f64); - - // maybe change the return code + increment_files_scanned_in_object_store_calls_by_date( + "localfs", + "GET", + files_scanned as u64, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "localfs", + "GET", + &Utc::now().date_naive().to_string(), + ); Ok(res) } @@ -366,12 +380,17 @@ impl ObjectStorage for LocalFS { .with_label_values(&["localfs", "PUT", "200"]) .observe(put_elapsed); // Record single file written successfully - STORAGE_FILES_SCANNED - .with_label_values(&["localfs", "PUT"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["localfs", "PUT", &Utc::now().date_naive().to_string()]) - .inc(); + increment_files_scanned_in_object_store_calls_by_date( + "localfs", + "PUT", + 1, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "localfs", + "PUT", + &Utc::now().date_naive().to_string(), + ); } Err(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -425,12 +444,17 @@ impl ObjectStorage for LocalFS { .with_label_values(&["localfs", "DELETE", "200"]) .observe(delete_elapsed); // Record single file deleted successfully - STORAGE_FILES_SCANNED - .with_label_values(&["localfs", "DELETE"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["localfs", "DELETE", &Utc::now().date_naive().to_string()]) - .inc(); + increment_files_scanned_in_object_store_calls_by_date( + "localfs", + "DELETE", + 1, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "localfs", + "DELETE", + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = match err.kind() { @@ -458,6 +482,11 @@ impl ObjectStorage for LocalFS { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["localfs", "HEAD", "200"]) .observe(check_elapsed); + increment_object_store_calls_by_date( + "localfs", + "HEAD", + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = match err.kind() { @@ -549,6 +578,11 @@ impl ObjectStorage 
for LocalFS { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["localfs", "LIST", "200"]) .observe(list_elapsed); + increment_object_store_calls_by_date( + "localfs", + "LIST", + &Utc::now().date_naive().to_string(), + ); ReadDirStream::new(read_dir) } Err(err) => { @@ -595,6 +629,11 @@ impl ObjectStorage for LocalFS { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["localfs", "LIST", "200"]) .observe(list_elapsed); + increment_object_store_calls_by_date( + "localfs", + "LIST", + &Utc::now().date_naive().to_string(), + ); ReadDirStream::new(read_dir) } Err(err) => { @@ -722,6 +761,11 @@ impl ObjectStorage for LocalFS { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["localfs", "LIST", "200"]) .observe(list_elapsed); + increment_object_store_calls_by_date( + "localfs", + "LIST", + &Utc::now().date_naive().to_string(), + ); read_dir } Err(err) => { @@ -802,6 +846,11 @@ impl ObjectStorage for LocalFS { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["localfs", "PUT", "200"]) .observe(upload_elapsed); + increment_object_store_calls_by_date( + "localfs", + "PUT", + &Utc::now().date_naive().to_string(), + ); Ok(()) } Err(err) => { diff --git a/src/storage/metrics_layer.rs b/src/storage/metrics_layer.rs index 6de1d9e64..945ee6ba8 100644 --- a/src/storage/metrics_layer.rs +++ b/src/storage/metrics_layer.rs @@ -30,11 +30,7 @@ use object_store::{ PutOptions, PutPayload, PutResult, Result as ObjectStoreResult, path::Path, }; -/* NOTE: Keeping these imports as they would make migration to object_store 0.10.0 easier -use object_store::{MultipartUpload, PutMultipartOpts, PutPayload} -*/ - -use crate::metrics::storage::STORAGE_REQUEST_RESPONSE_TIME; +use crate::metrics::STORAGE_REQUEST_RESPONSE_TIME; // Public helper function to map object_store errors to HTTP status codes pub fn error_to_status_code(err: &object_store::Error) -> &'static str { diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index 5a3f1b1eb..858e8740e 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -16,7 +16,6 @@ * */ -use actix_web_prometheus::PrometheusMetrics; use arrow_schema::Schema; use async_trait::async_trait; use bytes::Bytes; @@ -51,7 +50,8 @@ use crate::handlers::http::modal::ingest_server::INGESTOR_EXPECT; use crate::handlers::http::modal::ingest_server::INGESTOR_META; use crate::handlers::http::users::{FILTER_DIR, USERS_ROOT_DIR}; use crate::metrics::TOTAL_EVENTS_STORAGE_SIZE_DATE; -use crate::metrics::storage::StorageMetrics; +use crate::metrics::increment_parquets_stored_by_date; +use crate::metrics::increment_parquets_stored_size_by_date; use crate::metrics::{EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_STORAGE_SIZE, STORAGE_SIZE}; use crate::option::Mode; use crate::parseable::{LogStream, PARSEABLE, Stream}; @@ -187,6 +187,10 @@ fn update_storage_metrics( .with_label_values(&["parquet", file_date_part]) .add(compressed_size as i64); + // billing metrics for parquet storage + increment_parquets_stored_by_date(file_date_part); + increment_parquets_stored_size_by_date(compressed_size, file_date_part); + Ok(()) } @@ -248,7 +252,7 @@ async fn validate_uploaded_parquet_file( } } -pub trait ObjectStorageProvider: StorageMetrics + std::fmt::Debug + Send + Sync { +pub trait ObjectStorageProvider: std::fmt::Debug + Send + Sync { fn get_datafusion_runtime(&self) -> RuntimeEnvBuilder; fn construct_client(&self) -> Arc; fn get_object_store(&self) -> Arc { @@ -257,7 +261,6 @@ pub trait ObjectStorageProvider: StorageMetrics + std::fmt::Debug + Send + Sync 
STORE.get_or_init(|| self.construct_client()).clone() } fn get_endpoint(&self) -> String; - fn register_store_metrics(&self, handler: &PrometheusMetrics); fn name(&self) -> &'static str; } diff --git a/src/storage/s3.rs b/src/storage/s3.rs index fc4ab5638..203e0af34 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -48,12 +48,12 @@ use object_store::{ }; use relative_path::{RelativePath, RelativePathBuf}; use tokio::{fs::OpenOptions, io::AsyncReadExt}; -use tracing::{error, info}; +use tracing::error; use crate::{ - metrics::storage::{ - STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE, STORAGE_REQUEST_RESPONSE_TIME, - StorageMetrics, + metrics::{ + STORAGE_REQUEST_RESPONSE_TIME, increment_files_scanned_in_object_store_calls_by_date, + increment_object_store_calls_by_date, }, parseable::LogStream, }; @@ -328,10 +328,6 @@ impl ObjectStorageProvider for S3Config { fn get_endpoint(&self) -> String { format!("{}/{}", self.endpoint_url, self.bucket_name) } - - fn register_store_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) { - self.register_metrics(handler) - } } #[derive(Debug)] @@ -346,18 +342,21 @@ impl S3 { let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; let elapsed = time.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "GET"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "GET", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("s3", "GET", &Utc::now().date_naive().to_string()); + match resp { Ok(resp) => { let body = resp.bytes().await?; STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "GET", "200"]) .observe(elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "GET", + 1, + &Utc::now().date_naive().to_string(), + ); Ok(body) } Err(err) => { @@ -378,17 +377,19 @@ impl S3 { let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; let elapsed = time.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "PUT"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "PUT", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("s3", "PUT", &Utc::now().date_naive().to_string()); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "PUT", "200"]) .observe(elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "PUT", + 1, + &Utc::now().date_naive().to_string(), + ); Ok(()) } Err(err) => { @@ -411,16 +412,23 @@ impl S3 { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); + increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); object_stream .for_each_concurrent(None, |x| async { files_scanned.fetch_add(1, Ordering::Relaxed); + match x { Ok(obj) => { files_deleted.fetch_add(1, Ordering::Relaxed); let delete_start = Instant::now(); let delete_resp = self.client.delete(&obj.location).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date( + "s3", + "DELETE", + &Utc::now().date_naive().to_string(), + ); match delete_resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -443,18 +451,19 @@ impl S3 { }) .await; - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "LIST"]) - .inc_by(files_scanned.load(Ordering::Relaxed) as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "LIST", 
&Utc::now().date_naive().to_string()]) - .inc_by(files_scanned.load(Ordering::Relaxed) as f64); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "DELETE"]) - .inc_by(files_deleted.load(Ordering::Relaxed) as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "DELETE", &Utc::now().date_naive().to_string()]) - .inc_by(files_deleted.load(Ordering::Relaxed) as f64); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "LIST", + files_scanned.load(Ordering::Relaxed), + &Utc::now().date_naive().to_string(), + ); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "DELETE", + files_deleted.load(Ordering::Relaxed), + &Utc::now().date_naive().to_string(), + ); + // Note: Individual DELETE calls are tracked inside the concurrent loop Ok(()) } @@ -465,6 +474,7 @@ impl S3 { .list_with_delimiter(Some(&(stream.into()))) .await; let list_elapsed = list_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); let resp = match resp { Ok(resp) => { @@ -484,12 +494,12 @@ impl S3 { let common_prefixes = resp.common_prefixes; - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "LIST"]) - .inc_by(common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "LIST", + common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); // return prefixes at the root level let dates: Vec<_> = common_prefixes @@ -507,18 +517,19 @@ impl S3 { let put_start = Instant::now(); let result = self.client.put(&key.into(), bytes.into()).await; let put_elapsed = put_start.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "PUT"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "PUT", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("s3", "PUT", &Utc::now().date_naive().to_string()); match result { - Ok(result) => { + Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "PUT", "200"]) .observe(put_elapsed); - info!("Uploaded file to S3: {:?}", result); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "PUT", + 1, + &Utc::now().date_naive().to_string(), + ); Ok(()) } Err(err) => { @@ -543,12 +554,17 @@ impl S3 { let multipart_start = Instant::now(); let async_writer = self.client.put_multipart(location).await; let multipart_elapsed = multipart_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "s3", + "PUT_MULTIPART", + &Utc::now().date_naive().to_string(), + ); let mut async_writer = match async_writer { Ok(writer) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "PUT_MULTIPART", "200"]) .observe(multipart_elapsed); + writer } Err(err) => { @@ -570,17 +586,19 @@ impl S3 { let put_start = Instant::now(); let result = self.client.put(location, data.into()).await; let put_elapsed = put_start.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "PUT"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "PUT", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("s3", "PUT", &Utc::now().date_naive().to_string()); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "PUT", "200"]) .observe(put_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "PUT", 
+ 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(&err); @@ -622,17 +640,27 @@ impl S3 { let part_start = Instant::now(); let result = async_writer.put_part(part_data.into()).await; let part_elapsed = part_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "s3", + "PUT_MULTIPART", + &Utc::now().date_naive().to_string(), + ); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT_MULTIPART_PART", "200"]) + .with_label_values(&["s3", "PUT_MULTIPART", "200"]) .observe(part_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "PUT_MULTIPART", + 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT_MULTIPART_PART", status_code]) + .with_label_values(&["s3", "PUT_MULTIPART", status_code]) .observe(part_elapsed); return Err(err.into()); } @@ -675,17 +703,19 @@ impl ObjectStorage for S3 { let head_start = Instant::now(); let meta = self.client.head(path).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "HEAD"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "HEAD", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("s3", "HEAD", &Utc::now().date_naive().to_string()); let meta = match meta { Ok(meta) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "HEAD", "200"]) .observe(head_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); meta } Err(err) => { @@ -714,17 +744,19 @@ impl ObjectStorage for S3 { let head_start = Instant::now(); let result = self.client.head(&to_object_store_path(path)).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "HEAD"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "HEAD", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("s3", "HEAD", &Utc::now().date_naive().to_string()); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "HEAD", "200"]) .observe(head_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(err); @@ -784,13 +816,13 @@ impl ObjectStorage for S3 { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "GET", "200"]) .observe(list_start.elapsed().as_secs_f64()); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "GET"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "GET", &Utc::now().date_naive().to_string()]) - .inc(); - + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "GET", + 1, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date("s3", "GET", &Utc::now().date_naive().to_string()); res.push(byts); } let list_elapsed = list_start.elapsed().as_secs_f64(); @@ -799,13 +831,13 @@ impl ObjectStorage for S3 { .observe(list_elapsed); // Record total files scanned - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "LIST"]) - .inc_by(files_scanned as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(files_scanned as f64); - + 
increment_files_scanned_in_object_store_calls_by_date( + "s3", + "LIST", + files_scanned as u64, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); Ok(res) } @@ -818,6 +850,7 @@ impl ObjectStorage for S3 { // Track list operation let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&self.root)); + increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); while let Some(meta_result) = object_stream.next().await { let meta = match meta_result { @@ -839,13 +872,12 @@ impl ObjectStorage for S3 { .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); // Record total files scanned - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "LIST"]) - .inc_by(files_scanned as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(files_scanned as f64); - + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "LIST", + files_scanned as u64, + &Utc::now().date_naive().to_string(), + ); Ok(path_arr) } @@ -871,19 +903,19 @@ impl ObjectStorage for S3 { let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(path)).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date("s3", "DELETE", &Utc::now().date_naive().to_string()); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "DELETE", "200"]) .observe(delete_elapsed); // Record single file deleted - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "DELETE"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "DELETE", &Utc::now().date_naive().to_string()]) - .inc(); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "DELETE", + 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(err); @@ -903,12 +935,19 @@ impl ObjectStorage for S3 { .head(&to_object_store_path(&parseable_json_path())) .await; let head_elapsed = head_start.elapsed().as_secs_f64(); + increment_object_store_calls_by_date("s3", "HEAD", &Utc::now().date_naive().to_string()); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "HEAD", "200"]) .observe(head_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = error_to_status_code(err); @@ -917,12 +956,6 @@ impl ObjectStorage for S3 { .observe(head_elapsed); } } - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "HEAD"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "HEAD", &Utc::now().date_naive().to_string()]) - .inc(); Ok(result.map(|_| ())?) 
} @@ -939,17 +972,19 @@ impl ObjectStorage for S3 { let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(&file)).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "DELETE"]) - .inc(); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "DELETE", &Utc::now().date_naive().to_string()]) - .inc(); + + increment_object_store_calls_by_date("s3", "DELETE", &Utc::now().date_naive().to_string()); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "DELETE", "200"]) .observe(delete_elapsed); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "DELETE", + 1, + &Utc::now().date_naive().to_string(), + ); Ok(()) } Err(err) => { @@ -980,12 +1015,13 @@ impl ObjectStorage for S3 { .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "LIST"]) - .inc_by(common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "LIST", + common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); // return prefixes at the root level let dirs: HashSet<_> = common_prefixes .iter() @@ -1002,7 +1038,11 @@ impl ObjectStorage for S3 { let head_start = Instant::now(); let result = self.client.head(&StorePath::from(key)).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date( + "s3", + "HEAD", + &Utc::now().date_naive().to_string(), + ); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -1021,13 +1061,12 @@ impl ObjectStorage for S3 { }; stream_json_check.push(task); } - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "HEAD"]) - .inc_by(dirs.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "HEAD", &Utc::now().date_naive().to_string()]) - .inc_by(dirs.len() as f64); - + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "HEAD", + dirs.len() as u64, + &Utc::now().date_naive().to_string(), + ); stream_json_check.try_collect::<()>().await?; Ok(dirs) @@ -1051,12 +1090,13 @@ impl ObjectStorage for S3 { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "LIST"]) - .inc_by(resp.common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(resp.common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "LIST", + resp.common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); let hours: Vec = resp .common_prefixes @@ -1091,13 +1131,13 @@ impl ObjectStorage for S3 { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "LIST"]) - .inc_by(resp.common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(resp.common_prefixes.len() as f64); - + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "LIST", + 
resp.common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); let minutes: Vec = resp .common_prefixes .iter() @@ -1156,18 +1196,19 @@ impl ObjectStorage for S3 { let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await; let list_elapsed = list_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); let resp = match resp { Ok(resp) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "LIST"]) - .inc_by(resp.common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(resp.common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "LIST", + resp.common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + resp } Err(err) => { @@ -1196,18 +1237,19 @@ impl ObjectStorage for S3 { let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&prefix)).await; let list_elapsed = list_start.elapsed().as_secs_f64(); - + increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); let resp = match resp { Ok(resp) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "LIST"]) - .inc_by(resp.common_prefixes.len() as f64); - STORAGE_FILES_SCANNED_DATE - .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) - .inc_by(resp.common_prefixes.len() as f64); + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "LIST", + resp.common_prefixes.len() as u64, + &Utc::now().date_naive().to_string(), + ); + resp } Err(err) => { From 67a75b5809b5831bcd8794df8418b5b096d699fc Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Thu, 11 Sep 2025 19:11:38 -0700 Subject: [PATCH 05/14] add cluster metrics, update regex for parseable server logs --- resources/formats.json | 16 ++++- src/metrics/mod.rs | 145 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+), 1 deletion(-) diff --git a/resources/formats.json b/resources/formats.json index 18dad8ae6..ad5efae39 100644 --- a/resources/formats.json +++ b/resources/formats.json @@ -1466,8 +1466,22 @@ ] }, { - "name": "rust_server_logs", + "name": "parseable_server_logs", "regex": [ + { + "pattern": "^(?P\\S+)\\s+(?P\\S+)\\s+(?P\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z?)\\s+(?P\\w+)\\s+(?P\\S+)\\s+(?PThreadId\\(\\d+\\))\\s+(?P.*?):(?P\\d+):\\s+(?P.*)", + "fields": [ + "customer_id", + "deployment_id", + "timestamp", + "level", + "logger_context", + "thread_id", + "module", + "line_number", + "body" + ] + }, { "pattern": "^(?P\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z)\\s+(?P\\w+)\\s+(?P\\S+)\\s+(?PThreadId\\(\\d+\\))\\s+(?P.*?):(?P\\d+):\\s+(?P.*)", "fields": [ diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index ee25e4a5f..e2d297627 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -336,6 +336,117 @@ pub static TOTAL_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( + Opts::new( + "total_cluster_events_ingested_by_date", + "Total cluster events ingested by date (Gauge for cluster billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric 
can be created") +}); + +pub static TOTAL_CLUSTER_EVENTS_INGESTED_SIZE_BY_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( + Opts::new( + "total_cluster_events_ingested_size_by_date", + "Total cluster events ingested size in bytes by date (Gauge for cluster billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_CLUSTER_PARQUETS_STORED_BY_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( + Opts::new( + "total_cluster_parquets_stored_by_date", + "Total cluster parquet files stored by date (Gauge for cluster billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_CLUSTER_PARQUETS_STORED_SIZE_BY_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( + Opts::new( + "total_cluster_parquets_stored_size_by_date", + "Total cluster parquet files stored size in bytes by date (Gauge for cluster billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_CLUSTER_QUERY_CALLS_BY_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( + Opts::new( + "total_cluster_query_calls_by_date", + "Total cluster query calls by date (Gauge for cluster billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_CLUSTER_FILES_SCANNED_IN_QUERY_BY_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( + Opts::new( + "total_cluster_files_scanned_in_query_by_date", + "Total cluster files scanned in queries by date (Gauge for cluster billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_CLUSTER_BYTES_SCANNED_IN_QUERY_BY_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( + Opts::new( + "total_cluster_bytes_scanned_in_query_by_date", + "Total cluster bytes scanned in queries by date (Gauge for cluster billing)", + ) + .namespace(METRICS_NAMESPACE), + &["date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_CLUSTER_OBJECT_STORE_CALLS_BY_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( + Opts::new( + "total_cluster_object_store_calls_by_date", + "Total cluster object store calls by date (Gauge for cluster billing)", + ) + .namespace(METRICS_NAMESPACE), + &["provider", "method", "date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_CLUSTER_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE: Lazy = Lazy::new( + || { + IntGaugeVec::new( + Opts::new( + "total_cluster_files_scanned_in_object_store_calls_by_date", + "Total cluster files scanned in object store calls by date (Gauge for cluster billing)", + ) + .namespace(METRICS_NAMESPACE), + &["provider", "method", "date"], + ) + .expect("metric can be created") + }, +); + pub static STORAGE_REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { HistogramVec::new( HistogramOpts::new("storage_request_response_time", "Storage Request Latency") @@ -433,6 +544,40 @@ fn custom_metrics(registry: &Registry) { TOTAL_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE.clone(), )) .expect("metric can be registered"); + // Register cluster billing metrics + registry + .register(Box::new(TOTAL_CLUSTER_EVENTS_INGESTED_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_CLUSTER_EVENTS_INGESTED_SIZE_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_CLUSTER_PARQUETS_STORED_BY_DATE.clone())) + .expect("metric can be registered"); + registry + 
.register(Box::new(TOTAL_CLUSTER_PARQUETS_STORED_SIZE_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_CLUSTER_QUERY_CALLS_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new( + TOTAL_CLUSTER_FILES_SCANNED_IN_QUERY_BY_DATE.clone(), + )) + .expect("metric can be registered"); + registry + .register(Box::new( + TOTAL_CLUSTER_BYTES_SCANNED_IN_QUERY_BY_DATE.clone(), + )) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_CLUSTER_OBJECT_STORE_CALLS_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new( + TOTAL_CLUSTER_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE.clone(), + )) + .expect("metric can be registered"); registry .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) .expect("metric can be registered"); From 4358caa3c33f3293e8c6bbc42475aafee56868f7 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Thu, 11 Sep 2025 19:59:00 -0700 Subject: [PATCH 06/14] remove unused --- src/storage/azure_blob.rs | 172 +------------------------------------- 1 file changed, 1 insertion(+), 171 deletions(-) diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 4e3d56dcb..38a5431f4 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -17,7 +17,7 @@ */ use std::{ - collections::{BTreeMap, HashSet}, + collections::HashSet, path::Path, sync::{ Arc, @@ -50,13 +50,11 @@ use tracing::error; use url::Url; use crate::{ - handlers::http::users::USERS_ROOT_DIR, metrics::{ STORAGE_REQUEST_RESPONSE_TIME, increment_files_scanned_in_object_store_calls_by_date, increment_object_store_calls_by_date, }, parseable::LogStream, - storage::STREAM_ROOT_DIRECTORY, }; use super::{ @@ -354,82 +352,6 @@ impl BlobStore { Ok(()) } - async fn _list_streams(&self) -> Result, ObjectStorageError> { - let mut result_file_list = HashSet::new(); - let mut total_files_scanned = 0u64; - - let list_start = Instant::now(); - let resp = self.client.list_with_delimiter(None).await?; - let list_elapsed = list_start.elapsed().as_secs_f64(); - total_files_scanned += resp.objects.len() as u64; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); - increment_object_store_calls_by_date( - "azure_blob", - "LIST", - &Utc::now().date_naive().to_string(), - ); - - let streams = resp - .common_prefixes - .iter() - .flat_map(|path| path.parts()) - .map(|name| name.as_ref().to_string()) - .filter(|name| name != PARSEABLE_ROOT_DIRECTORY && name != USERS_ROOT_DIR) - .collect::>(); - - for stream in streams { - let stream_path = - object_store::path::Path::from(format!("{}/{}", &stream, STREAM_ROOT_DIRECTORY)); - - // Track individual LIST operations for each stream - let stream_list_start = Instant::now(); - let resp = self.client.list_with_delimiter(Some(&stream_path)).await; - let stream_list_elapsed = stream_list_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date( - "azure_blob", - "LIST", - &Utc::now().date_naive().to_string(), - ); - match &resp { - Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(stream_list_elapsed); - - total_files_scanned += resp.objects.len() as u64; - if resp - .objects - .iter() - .any(|name| name.location.filename().unwrap().ends_with("stream.json")) - { - result_file_list.insert(stream); - } - } - Err(err) => { - let status_code = error_to_status_code(err); - STORAGE_REQUEST_RESPONSE_TIME - 
.with_label_values(&["azure_blob", "LIST", status_code]) - .observe(stream_list_elapsed); - - return Err(ObjectStorageError::UnhandledError(Box::new( - std::io::Error::other(format!("List operation failed: {}", err)), - ))); - } - } - } - - // Record total files scanned across all operations - increment_files_scanned_in_object_store_calls_by_date( - "azure_blob", - "LIST", - total_files_scanned, - &Utc::now().date_naive().to_string(), - ); - Ok(result_file_list) - } - async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { let list_start = Instant::now(); let resp: Result = self @@ -478,98 +400,6 @@ impl BlobStore { Ok(dates) } - async fn _list_manifest_files( - &self, - stream: &str, - ) -> Result>, ObjectStorageError> { - let mut result_file_list: BTreeMap> = BTreeMap::new(); - let mut total_files_scanned = 0u64; - - // Track initial LIST operation - let list_start = Instant::now(); - let resp = self - .client - .list_with_delimiter(Some(&(stream.into()))) - .await; - let list_elapsed = list_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date( - "azure_blob", - "LIST", - &Utc::now().date_naive().to_string(), - ); - let resp = match resp { - Ok(resp) => { - total_files_scanned += resp.objects.len() as u64; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); - - resp - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", status_code]) - .observe(list_elapsed); - return Err(err.into()); - } - }; - - let dates = resp - .common_prefixes - .iter() - .flat_map(|path| path.parts()) - .filter(|name| name.as_ref() != stream && name.as_ref() != STREAM_ROOT_DIRECTORY) - .map(|name| name.as_ref().to_string()) - .collect::>(); - - for date in dates { - let date_path = object_store::path::Path::from(format!("{}/{}", stream, &date)); - - // Track individual LIST operation for each date - let date_list_start = Instant::now(); - let resp = self.client.list_with_delimiter(Some(&date_path)).await; - let date_list_elapsed = date_list_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date( - "azure_blob", - "LIST", - &Utc::now().date_naive().to_string(), - ); - match resp { - Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(date_list_elapsed); - - total_files_scanned += resp.objects.len() as u64; - let manifests: Vec = resp - .objects - .iter() - .filter(|name| name.location.filename().unwrap().ends_with("manifest.json")) - .map(|name| name.location.to_string()) - .collect(); - result_file_list.entry(date).or_default().extend(manifests); - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", status_code]) - .observe(date_list_elapsed); - return Err(err.into()); - } - } - } - - // Record total files scanned across all date operations - increment_files_scanned_in_object_store_calls_by_date( - "azure_blob", - "LIST", - total_files_scanned, - &Utc::now().date_naive().to_string(), - ); - Ok(result_file_list) - } - async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { let bytes = tokio::fs::read(path).await?; From dec0492a531d74213f9a2f08c1cb1edc34850dcd Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Thu, 11 Sep 2025 23:18:02 -0700 Subject: [PATCH 07/14] remove unused, update parseable server log test --- 
src/event/format/known_schema.rs | 18 +++++---------- src/query/stream_schema_provider.rs | 34 ++++------------------------- src/storage/azure_blob.rs | 11 +--------- src/storage/gcs.rs | 2 +- src/storage/s3.rs | 9 -------- 5 files changed, 12 insertions(+), 62 deletions(-) diff --git a/src/event/format/known_schema.rs b/src/event/format/known_schema.rs index d231dc2b2..b5a155972 100644 --- a/src/event/format/known_schema.rs +++ b/src/event/format/known_schema.rs @@ -515,25 +515,19 @@ mod tests { } #[test] - fn test_rust_server_logs() { + fn test_parseable_server_logs() { let processor = EventProcessor::new(FORMATS_JSON); let schema = processor .schema_definitions - .get("rust_server_logs") + .get("parseable_server_logs") .unwrap(); let test_logs = vec![ // Current parseable format with ThreadId - "2025-09-06T10:43:01.628980875Z WARN main ThreadId(01) parseable::handlers::http::cluster:919: node http://0.0.0.0:8010/ is not live", - "2025-09-06T10:44:12.62276265Z ERROR actix-rt|system:0|arbiter:17 ThreadId(163) parseable_enterprise::http::handlers::query:43: JsonParse(\"Datafusion Error: Schema error: No field named a. Valid fields are serverlogs.log\")", - "2025-09-06T05:16:46.092071318Z ERROR actix-rt|system:0|arbiter:21 ThreadId(167) parseable_enterprise::http::handlers::query:43: JsonParse(\"Datafusion Error: Schema error: No field named ansible.host.ip\")", - "2025-09-06T11:22:07.500864363Z WARN main ThreadId(01) parseable_enterprise:70: Received shutdown signal, notifying server to shut down...", - // env_logger format - "[2025-09-06T10:43:01.628980875Z INFO parseable::storage] Initializing storage backend", - "[2025-09-06T10:43:01.628980875Z ERROR parseable::http::ingest] Failed to parse JSON", - // Simple tracing format (no ThreadId) - "2025-09-06T10:43:01.628980875Z INFO parseable::storage::s3: Storage configured successfully", - "2025-09-06T10:43:01.628980875Z DEBUG parseable::query::engine: Query executed in 45ms", + "01K4SHM6VQASBJ7G8V0STZN6N1 01K4SHM6VQASBJ7G8V0STZN6N1 2025-09-06T10:43:01.628980875Z WARN main ThreadId(01) parseable::handlers::http::cluster:919: node http://0.0.0.0:8010/ is not live", + "01K4SHM6VQASBJ7G8V0STZN6N1 01K4SHM6VQASBJ7G8V0STZN6N1 2025-09-06T10:44:12.62276265Z ERROR actix-rt|system:0|arbiter:17 ThreadId(163) parseable_enterprise::http::handlers::query:43: JsonParse(\"Datafusion Error: Schema error: No field named a. 
Valid fields are serverlogs.log\")", + "01K4SHM6VQASBJ7G8V0STZN6N1 01K4SHM6VQASBJ7G8V0STZN6N1 2025-09-06T05:16:46.092071318Z ERROR actix-rt|system:0|arbiter:21 ThreadId(167) parseable_enterprise::http::handlers::query:43: JsonParse(\"Datafusion Error: Schema error: No field named ansible.host.ip\")", + "01K4SHM6VQASBJ7G8V0STZN6N1 01K4SHM6VQASBJ7G8V0STZN6N1 2025-09-06T11:22:07.500864363Z WARN main ThreadId(01) parseable_enterprise:70: Received shutdown signal, notifying server to shut down...", ]; for (i, log_text) in test_logs.iter().enumerate() { diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index 286800018..5d4745127 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -16,7 +16,7 @@ * */ -use std::{any::Any, collections::HashMap, ops::Bound, path::PathBuf, sync::Arc}; +use std::{any::Any, collections::HashMap, ops::Bound, sync::Arc}; use arrow_array::RecordBatch; use arrow_schema::{Schema, SchemaRef, SortOptions}; @@ -46,13 +46,12 @@ use datafusion::{ }; use futures_util::TryFutureExt; use itertools::Itertools; -use relative_path::RelativePathBuf; use crate::{ catalog::{ ManifestFile, Snapshot as CatalogSnapshot, column::{Column, TypedStatistics}, - manifest::{File, Manifest}, + manifest::File, snapshot::{ManifestItem, Snapshot}, }, event::DEFAULT_TIMESTAMP_KEY, @@ -64,7 +63,7 @@ use crate::{ }, option::Mode, parseable::{PARSEABLE, STREAM_EXISTS}, - storage::{ObjectStorage, ObjectStorageError, ObjectStoreFormat}, + storage::{ObjectStorage, ObjectStoreFormat}, }; use super::listing_table_builder::ListingTableBuilder; @@ -869,37 +868,12 @@ fn extract_timestamp_bound( DateTime::from_timestamp_nanos(*value).naive_utc(), )), ScalarValue::Utf8(Some(str_value)) if is_time_partition => { - Some((binexpr.op, str_value.parse::().unwrap())) + Some((binexpr.op, str_value.parse().unwrap())) } _ => None, } } -pub async fn collect_manifest_files( - storage: Arc, - manifest_urls: Vec, -) -> Result, ObjectStorageError> { - let mut tasks = Vec::new(); - manifest_urls.into_iter().for_each(|path| { - let path = RelativePathBuf::from_path(PathBuf::from(path)).expect("Invalid path"); - let storage = Arc::clone(&storage); - tasks.push(tokio::task::spawn(async move { - storage.get_object(&path).await - })); - }); - - let mut op = Vec::new(); - for task in tasks { - let file = task.await??; - op.push(file); - } - - Ok(op - .into_iter() - .map(|res| serde_json::from_slice(&res).expect("Data is invalid for Manifest")) - .collect()) -} - // Extract start time and end time from filter predicate pub fn extract_primary_filter( filters: &[Expr], diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 38a5431f4..c1fb2bb21 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -224,7 +224,7 @@ impl BlobStore { match resp { Ok(resp) => { - let body: Bytes = resp.bytes().await.unwrap(); + let body: Bytes = resp.bytes().await?; STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "GET", "200"]) .observe(elapsed); @@ -1062,15 +1062,6 @@ impl ObjectStorage for BlobStore { Ok(minutes) } - // async fn list_manifest_files( - // &self, - // stream_name: &str, - // ) -> Result>, ObjectStorageError> { - // let files = self._list_manifest_files(stream_name).await?; - - // Ok(files) - // } - async fn upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { Ok(self._upload_file(key, path).await?) 
} diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index 135f13267..a11d786ef 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -185,7 +185,7 @@ impl Gcs { match resp { Ok(resp) => { - let body: Bytes = resp.bytes().await.unwrap(); + let body: Bytes = resp.bytes().await?; STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "GET", "200"]) .observe(elapsed); diff --git a/src/storage/s3.rs b/src/storage/s3.rs index 203e0af34..57fe60992 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -1159,15 +1159,6 @@ impl ObjectStorage for S3 { Ok(minutes) } - // async fn list_manifest_files( - // &self, - // stream_name: &str, - // ) -> Result>, ObjectStorageError> { - // let files = self._list_manifest_files(stream_name).await?; - - // Ok(files) - // } - async fn upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { Ok(self._upload_file(key, path).await?) } From 30edb0561901ae66c311b632d40e1f430b3951cd Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Mon, 15 Sep 2025 00:29:35 -0700 Subject: [PATCH 08/14] add llm usage metrics --- src/metrics/mod.rs | 65 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 13 deletions(-) diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index e2d297627..3ab96a6cc 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -231,7 +231,7 @@ pub static TOTAL_EVENTS_INGESTED_BY_DATE: Lazy = Lazy::new(|| { IntCounterVec::new( Opts::new( "total_events_ingested_by_date", - "Total events ingested by date (Counter for billing)", + "Total events ingested by date", ) .namespace(METRICS_NAMESPACE), &["date"], @@ -243,7 +243,7 @@ pub static TOTAL_EVENTS_INGESTED_SIZE_BY_DATE: Lazy = Lazy::new(| IntCounterVec::new( Opts::new( "total_events_ingested_size_by_date", - "Total events ingested size in bytes by date (Counter for billing)", + "Total events ingested size in bytes by date", ) .namespace(METRICS_NAMESPACE), &["date"], @@ -255,7 +255,7 @@ pub static TOTAL_PARQUETS_STORED_BY_DATE: Lazy = Lazy::new(|| { IntCounterVec::new( Opts::new( "total_parquets_stored_by_date", - "Total parquet files stored by date (Counter for billing)", + "Total parquet files stored by date", ) .namespace(METRICS_NAMESPACE), &["date"], @@ -267,7 +267,7 @@ pub static TOTAL_PARQUETS_STORED_SIZE_BY_DATE: Lazy = Lazy::new(| IntCounterVec::new( Opts::new( "total_parquets_stored_size_by_date", - "Total parquet files stored size in bytes by date (Counter for billing)", + "Total parquet files stored size in bytes by date", ) .namespace(METRICS_NAMESPACE), &["date"], @@ -277,11 +277,8 @@ pub static TOTAL_PARQUETS_STORED_SIZE_BY_DATE: Lazy = Lazy::new(| pub static TOTAL_QUERY_CALLS_BY_DATE: Lazy = Lazy::new(|| { IntCounterVec::new( - Opts::new( - "total_query_calls_by_date", - "Total query calls by date (Counter for billing)", - ) - .namespace(METRICS_NAMESPACE), + Opts::new("total_query_calls_by_date", "Total query calls by date") + .namespace(METRICS_NAMESPACE), &["date"], ) .expect("metric can be created") @@ -291,7 +288,7 @@ pub static TOTAL_FILES_SCANNED_IN_QUERY_BY_DATE: Lazy = Lazy::new IntCounterVec::new( Opts::new( "total_files_scanned_in_query_by_date", - "Total files scanned in queries by date (Counter for billing)", + "Total files scanned in queries by date", ) .namespace(METRICS_NAMESPACE), &["date"], @@ -303,7 +300,7 @@ pub static TOTAL_BYTES_SCANNED_IN_QUERY_BY_DATE: Lazy = Lazy::new IntCounterVec::new( Opts::new( "total_bytes_scanned_in_query_by_date", - "Total bytes scanned in queries by date (Counter for billing)", 
+ "Total bytes scanned in queries by date", ) .namespace(METRICS_NAMESPACE), &["date"], @@ -315,7 +312,7 @@ pub static TOTAL_OBJECT_STORE_CALLS_BY_DATE: Lazy = Lazy::new(|| IntCounterVec::new( Opts::new( "total_object_store_calls_by_date", - "Total object store calls by date (Counter for billing)", + "Total object store calls by date", ) .namespace(METRICS_NAMESPACE), &["provider", "method", "date"], @@ -328,7 +325,7 @@ pub static TOTAL_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_input_llm_tokens_by_date", + "Total input LLM tokens used by date", + ) + .namespace(METRICS_NAMESPACE), + &["provider", "model", "date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_OUTPUT_LLM_TOKENS_BY_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_output_llm_tokens_by_date", + "Total output LLM tokens used by date", + ) + .namespace(METRICS_NAMESPACE), + &["provider", "model", "date"], + ) + .expect("metric can be created") +}); + // Cluster Billing Metrics - Gauge type metrics for cluster-wide aggregated billing data pub static TOTAL_CLUSTER_EVENTS_INGESTED_BY_DATE: Lazy = Lazy::new(|| { IntGaugeVec::new( @@ -544,6 +565,12 @@ fn custom_metrics(registry: &Registry) { TOTAL_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE.clone(), )) .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_INPUT_LLM_TOKENS_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_OUTPUT_LLM_TOKENS_BY_DATE.clone())) + .expect("metric can be registered"); // Register cluster billing metrics registry .register(Box::new(TOTAL_CLUSTER_EVENTS_INGESTED_BY_DATE.clone())) @@ -698,6 +725,18 @@ pub fn increment_files_scanned_in_object_store_calls_by_date( .inc_by(count); } +pub fn increment_input_llm_tokens_by_date(provider: &str, model: &str, tokens: u64, date: &str) { + TOTAL_INPUT_LLM_TOKENS_BY_DATE + .with_label_values(&[provider, model, date]) + .inc_by(tokens); +} + +pub fn increment_output_llm_tokens_by_date(provider: &str, model: &str, tokens: u64, date: &str) { + TOTAL_OUTPUT_LLM_TOKENS_BY_DATE + .with_label_values(&[provider, model, date]) + .inc_by(tokens); +} + use actix_web::HttpResponse; pub async fn get() -> Result { From c6b3eca749075796c253cb913612b5e14ba24124 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Mon, 15 Sep 2025 02:08:52 -0700 Subject: [PATCH 09/14] add total cluster llm input and output token --- src/metrics/mod.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index 3ab96a6cc..749fa56ca 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -468,6 +468,30 @@ pub static TOTAL_CLUSTER_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( + Opts::new( + "total_cluster_input_llm_tokens_by_date", + "Total cluster input LLM tokens used by date (Gauge for cluster billing)", + ) + .namespace(METRICS_NAMESPACE), + &["provider", "model", "date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_CLUSTER_OUTPUT_LLM_TOKENS_BY_DATE: Lazy = Lazy::new(|| { + IntGaugeVec::new( + Opts::new( + "total_cluster_output_llm_tokens_by_date", + "Total cluster output LLM tokens used by date (Gauge for cluster billing)", + ) + .namespace(METRICS_NAMESPACE), + &["provider", "model", "date"], + ) + .expect("metric can be created") +}); + pub static STORAGE_REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { HistogramVec::new( 
HistogramOpts::new("storage_request_response_time", "Storage Request Latency") @@ -605,6 +629,12 @@ fn custom_metrics(registry: &Registry) { TOTAL_CLUSTER_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE.clone(), )) .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_CLUSTER_INPUT_LLM_TOKENS_BY_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_CLUSTER_OUTPUT_LLM_TOKENS_BY_DATE.clone())) + .expect("metric can be registered"); registry .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) .expect("metric can be registered"); From dfddc4f70e0cb16bc5a97b694ff19c20a9e326c8 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Thu, 18 Sep 2025 03:10:44 -0700 Subject: [PATCH 10/14] fix coderabbit suggestions --- src/handlers/http/query.rs | 8 +++----- src/query/stream_schema_provider.rs | 14 ++++---------- src/storage/localfs.rs | 10 ++++++++++ 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index 63d908a8e..a9dd88b43 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -345,10 +345,6 @@ pub async fn get_counts( req: HttpRequest, counts_request: Json, ) -> Result { - // Track billing metrics for query calls - let current_date = chrono::Utc::now().date_naive().to_string(); - increment_query_calls_by_date(¤t_date); - let creds = extract_session_key_from_req(&req)?; let permissions = Users.get_permissions(&creds); @@ -356,7 +352,9 @@ pub async fn get_counts( // does user have access to table? user_auth_for_datasets(&permissions, std::slice::from_ref(&body.stream)).await?; - + // Track billing metrics for query calls + let current_date = chrono::Utc::now().date_naive().to_string(); + increment_query_calls_by_date(¤t_date); // if the user has given a sql query (counts call with filters applied), then use this flow // this could include filters or group by if body.conditions.is_some() { diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index 5d4745127..b3907a9e9 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -58,7 +58,6 @@ use crate::{ hottier::HotTierManager, metrics::{ QUERY_CACHE_HIT, increment_bytes_scanned_in_query_by_date, - increment_files_scanned_in_object_store_calls_by_date, increment_files_scanned_in_query_by_date, }, option::Mode, @@ -597,14 +596,6 @@ impl TableProvider for StandardTableProvider { return self.final_plan(execution_plans, projection); } - let parquet_files_to_scan = manifest_files.len(); - increment_files_scanned_in_object_store_calls_by_date( - PARSEABLE.storage().name(), - "GET", - parquet_files_to_scan as u64, - &Utc::now().date_naive().to_string(), - ); - let (partitioned_files, statistics) = self.partitioned_files(manifest_files); self.create_parquet_physical_plan( &mut execution_plans, @@ -868,7 +859,10 @@ fn extract_timestamp_bound( DateTime::from_timestamp_nanos(*value).naive_utc(), )), ScalarValue::Utf8(Some(str_value)) if is_time_partition => { - Some((binexpr.op, str_value.parse().unwrap())) + match str_value.parse::() { + Ok(dt) => Some((binexpr.op, dt)), + Err(_) => None, + } } _ => None, } diff --git a/src/storage/localfs.rs b/src/storage/localfs.rs index 01611306c..765fc4101 100644 --- a/src/storage/localfs.rs +++ b/src/storage/localfs.rs @@ -515,6 +515,11 @@ impl ObjectStorage for LocalFS { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["localfs", "DELETE", "200"]) .observe(delete_elapsed); + 
increment_object_store_calls_by_date( + "localfs", + "DELETE", + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = match err.kind() { @@ -543,6 +548,11 @@ impl ObjectStorage for LocalFS { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["localfs", "DELETE", "200"]) .observe(delete_elapsed); + increment_object_store_calls_by_date( + "localfs", + "DELETE", + &Utc::now().date_naive().to_string(), + ); } Err(err) => { let status_code = match err.kind() { From 08f85441d4491ed5d2dfced03e8350a67bce761a Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Thu, 18 Sep 2025 23:47:10 -0700 Subject: [PATCH 11/14] improved STORAGE_REQUEST_RESPONSE_TIME metrics collection --- src/storage/azure_blob.rs | 309 ++++----------------------------- src/storage/gcs.rs | 316 ++++----------------------------- src/storage/localfs.rs | 357 +++++++------------------------------- src/storage/s3.rs | 325 ++++------------------------------ 4 files changed, 174 insertions(+), 1133 deletions(-) diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index c1fb2bb21..32b3cb1c5 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -23,7 +23,7 @@ use std::{ Arc, atomic::{AtomicU64, Ordering}, }, - time::{Duration, Instant}, + time::Duration, }; use async_trait::async_trait; @@ -51,8 +51,7 @@ use url::Url; use crate::{ metrics::{ - STORAGE_REQUEST_RESPONSE_TIME, increment_files_scanned_in_object_store_calls_by_date, - increment_object_store_calls_by_date, + increment_files_scanned_in_object_store_calls_by_date, increment_object_store_calls_by_date, }, parseable::LogStream, }; @@ -60,8 +59,8 @@ use crate::{ use super::{ CONNECT_TIMEOUT_SECS, MIN_MULTIPART_UPLOAD_SIZE, ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, REQUEST_TIMEOUT_SECS, - STREAM_METADATA_FILE_NAME, metrics_layer::MetricLayer, metrics_layer::error_to_status_code, - object_storage::parseable_json_path, to_object_store_path, + STREAM_METADATA_FILE_NAME, metrics_layer::MetricLayer, object_storage::parseable_json_path, + to_object_store_path, }; #[derive(Debug, Clone, clap::Args)] @@ -212,10 +211,7 @@ pub struct BlobStore { impl BlobStore { async fn _get_object(&self, path: &RelativePath) -> Result { - let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; - let elapsed = time.elapsed().as_secs_f64(); - increment_object_store_calls_by_date( "azure_blob", "GET", @@ -225,9 +221,6 @@ impl BlobStore { match resp { Ok(resp) => { let body: Bytes = resp.bytes().await?; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "GET", "200"]) - .observe(elapsed); increment_files_scanned_in_object_store_calls_by_date( "azure_blob", "GET", @@ -236,13 +229,7 @@ impl BlobStore { ); Ok(body) } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "GET", status_code]) - .observe(elapsed); - Err(err.into()) - } + Err(err) => Err(err.into()), } } @@ -251,10 +238,7 @@ impl BlobStore { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let elapsed = time.elapsed().as_secs_f64(); - increment_object_store_calls_by_date( "azure_blob", "PUT", @@ -262,9 +246,6 @@ impl BlobStore { ); match resp { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT", "200"]) - .observe(elapsed); 
increment_files_scanned_in_object_store_calls_by_date( "azure_blob", "PUT", @@ -273,26 +254,14 @@ impl BlobStore { ); Ok(()) } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT", status_code]) - .observe(elapsed); - Err(err.into()) - } + Err(err) => Err(err.into()), } } async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { let files_scanned = Arc::new(AtomicU64::new(0)); let files_deleted = Arc::new(AtomicU64::new(0)); - // Track LIST operation - let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); increment_object_store_calls_by_date( "azure_blob", "LIST", @@ -306,27 +275,17 @@ impl BlobStore { match x { Ok(obj) => { files_deleted.fetch_add(1, Ordering::Relaxed); - let delete_start = Instant::now(); let delete_resp = self.client.delete(&obj.location).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "azure_blob", "DELETE", &Utc::now().date_naive().to_string(), ); - match delete_resp { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "DELETE", "200"]) - .observe(delete_elapsed); - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "DELETE", status_code]) - .observe(delete_elapsed); - error!("Failed to delete object during delete stream: {:?}", err); - } + if delete_resp.is_err() { + error!( + "Failed to delete object during delete stream: {:?}", + delete_resp + ); } } Err(err) => { @@ -348,17 +307,14 @@ impl BlobStore { files_deleted.load(Ordering::Relaxed), &Utc::now().date_naive().to_string(), ); - // Note: Individual DELETE calls are tracked inside the concurrent loop Ok(()) } async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { - let list_start = Instant::now(); let resp: Result = self .client .list_with_delimiter(Some(&(stream.into()))) .await; - let list_elapsed = list_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "azure_blob", "LIST", @@ -366,17 +322,8 @@ impl BlobStore { ); let resp = match resp { - Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); - resp - } + Ok(resp) => resp, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -403,10 +350,7 @@ impl BlobStore { async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { let bytes = tokio::fs::read(path).await?; - let put_start = Instant::now(); let result = self.client.put(&key.into(), bytes.into()).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date( "azure_blob", "PUT", @@ -414,9 +358,6 @@ impl BlobStore { ); match result { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT", "200"]) - .observe(put_elapsed); increment_files_scanned_in_object_store_calls_by_date( "azure_blob", "PUT", @@ -425,13 +366,7 @@ impl BlobStore { ); Ok(()) } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - 
.with_label_values(&["azure_blob", "PUT", status_code]) - .observe(put_elapsed); - Err(err.into()) - } + Err(err) => Err(err.into()), } } @@ -444,26 +379,15 @@ impl BlobStore { let location = &to_object_store_path(key); // Track multipart initiation - let multipart_start = Instant::now(); let async_writer = self.client.put_multipart(location).await; - let multipart_elapsed = multipart_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "azure_blob", "PUT_MULTIPART", &Utc::now().date_naive().to_string(), ); let mut async_writer = match async_writer { - Ok(writer) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT_MULTIPART", "200"]) - .observe(multipart_elapsed); - writer - } + Ok(writer) => writer, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT_MULTIPART", status_code]) - .observe(multipart_elapsed); return Err(err.into()); } }; @@ -473,11 +397,7 @@ impl BlobStore { if total_size < MIN_MULTIPART_UPLOAD_SIZE { let mut data = Vec::new(); file.read_to_end(&mut data).await?; - - // Track single PUT operation for small files - let put_start = Instant::now(); let result = self.client.put(location, data.into()).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "azure_blob", "PUT", @@ -486,9 +406,6 @@ impl BlobStore { match result { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT", "200"]) - .observe(put_elapsed); increment_files_scanned_in_object_store_calls_by_date( "azure_blob", "PUT", @@ -497,10 +414,6 @@ impl BlobStore { ); } Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT", status_code]) - .observe(put_elapsed); return Err(err.into()); } } @@ -531,50 +444,25 @@ impl BlobStore { // Extract this part's data let part_data = data[start_pos..end_pos].to_vec(); - // Track individual part upload - let part_start = Instant::now(); let result = async_writer.put_part(part_data.into()).await; - let part_elapsed = part_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "azure_blob", "PUT_MULTIPART", &Utc::now().date_naive().to_string(), ); - match result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT_MULTIPART", "200"]) - .observe(part_elapsed); - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT_MULTIPART", status_code]) - .observe(part_elapsed); - return Err(err.into()); - } + if result.is_err() { + return Err(result.err().unwrap().into()); } // upload_parts.push(part_number as u64 + 1); } // Track multipart completion - let complete_start = Instant::now(); let complete_result = async_writer.complete().await; - let complete_elapsed = complete_start.elapsed().as_secs_f64(); - if let Err(err) = complete_result { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT_MULTIPART_COMPLETE", status_code]) - .observe(complete_elapsed); error!("Failed to complete multipart upload. 
{:?}", err); async_writer.abort().await?; return Err(err.into()); - } else { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT_MULTIPART_COMPLETE", "200"]) - .observe(complete_elapsed); } } Ok(()) @@ -604,33 +492,19 @@ impl ObjectStorage for BlobStore { } async fn head(&self, path: &RelativePath) -> Result { - let head_start = Instant::now(); let result = self.client.head(&to_object_store_path(path)).await; - let head_elapsed = head_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date( "azure_blob", "HEAD", &Utc::now().date_naive().to_string(), ); - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "HEAD", "200"]) - .observe(head_elapsed); - increment_files_scanned_in_object_store_calls_by_date( - "azure_blob", - "HEAD", - 1, - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = error_to_status_code(err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "HEAD", status_code]) - .observe(head_elapsed); - } + if result.is_ok() { + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); } Ok(result?) @@ -651,8 +525,6 @@ impl ObjectStorage for BlobStore { self.root.clone() }; - // Track list operation - let list_start = Instant::now(); let mut list_stream = self.client.list(Some(&prefix)); let mut res = vec![]; @@ -680,9 +552,6 @@ impl ObjectStorage for BlobStore { .map_err(ObjectStorageError::PathError)?, ) .await?; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "GET", "200"]) - .observe(list_start.elapsed().as_secs_f64()); increment_files_scanned_in_object_store_calls_by_date( "azure_blob", "GET", @@ -696,10 +565,6 @@ impl ObjectStorage for BlobStore { ); res.push(byts); } - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); // Record total files scanned increment_files_scanned_in_object_store_calls_by_date( @@ -722,8 +587,6 @@ impl ObjectStorage for BlobStore { let mut path_arr = vec![]; let mut files_scanned = 0; - // Track list operation - let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&self.root)); increment_object_store_calls_by_date( "azure_blob", @@ -746,10 +609,6 @@ impl ObjectStorage for BlobStore { path_arr.push(RelativePathBuf::from(meta.location.as_ref())); } } - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); // Record total files scanned increment_files_scanned_in_object_store_calls_by_date( "azure_blob", @@ -779,69 +638,41 @@ impl ObjectStorage for BlobStore { } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { - let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(path)).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "azure_blob", "DELETE", &Utc::now().date_naive().to_string(), ); - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "DELETE", "200"]) - .observe(delete_elapsed); - // Record single file deleted - increment_files_scanned_in_object_store_calls_by_date( - "azure_blob", - "DELETE", - 1, - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = error_to_status_code(err); - 
STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "DELETE", status_code]) - .observe(delete_elapsed); - } + if result.is_ok() { + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "DELETE", + 1, + &Utc::now().date_naive().to_string(), + ); } Ok(result?) } async fn check(&self) -> Result<(), ObjectStorageError> { - let head_start = Instant::now(); let result = self .client .head(&to_object_store_path(&parseable_json_path())) .await; - let head_elapsed = head_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "azure_blob", "HEAD", &Utc::now().date_naive().to_string(), ); - - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "HEAD", "200"]) - .observe(head_elapsed); - increment_files_scanned_in_object_store_calls_by_date( - "azure_blob", - "HEAD", - 1, - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = error_to_status_code(err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "HEAD", status_code]) - .observe(head_elapsed); - } + if result.is_ok() { + increment_files_scanned_in_object_store_calls_by_date( + "azure_blob", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); } Ok(result.map(|_| ())?) @@ -856,10 +687,7 @@ impl ObjectStorage for BlobStore { async fn try_delete_node_meta(&self, node_filename: String) -> Result<(), ObjectStorageError> { let file = RelativePathBuf::from(&node_filename); - let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(&file)).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date( "azure_blob", "DELETE", @@ -867,9 +695,6 @@ impl ObjectStorage for BlobStore { ); match result { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "DELETE", "200"]) - .observe(delete_elapsed); increment_files_scanned_in_object_store_calls_by_date( "azure_blob", "DELETE", @@ -878,14 +703,7 @@ impl ObjectStorage for BlobStore { ); Ok(()) } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "DELETE", status_code]) - .observe(delete_elapsed); - - Err(err.into()) - } + Err(err) => Err(err.into()), } } @@ -897,13 +715,7 @@ impl ObjectStorage for BlobStore { } async fn list_old_streams(&self) -> Result, ObjectStorageError> { - // Track LIST operation - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(None).await?; - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs increment_files_scanned_in_object_store_calls_by_date( @@ -930,28 +742,12 @@ impl ObjectStorage for BlobStore { for dir in &dirs { let key = format!("{dir}/{STREAM_METADATA_FILE_NAME}"); let task = async move { - let head_start = Instant::now(); let result = self.client.head(&StorePath::from(key)).await; - let head_elapsed = head_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "azure_blob", "HEAD", &Utc::now().date_naive().to_string(), ); - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "HEAD", "200"]) - .observe(head_elapsed); - } - Err(err) => { - let status_code = error_to_status_code(err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "HEAD", status_code]) - 
.observe(head_elapsed); - } - } - result.map(|_| ()) }; stream_json_check.push(task); @@ -979,12 +775,7 @@ impl ObjectStorage for BlobStore { date: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/", stream_name, date)); - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); increment_files_scanned_in_object_store_calls_by_date( "azure_blob", "LIST", @@ -1024,12 +815,7 @@ impl ObjectStorage for BlobStore { hour: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/{}/", stream_name, date, hour)); - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); increment_files_scanned_in_object_store_calls_by_date( "azure_blob", "LIST", @@ -1091,9 +877,7 @@ impl ObjectStorage for BlobStore { async fn list_dirs(&self) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from("/"); - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await; - let list_elapsed = list_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "azure_blob", "LIST", @@ -1101,9 +885,6 @@ impl ObjectStorage for BlobStore { ); let resp = match resp { Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); increment_files_scanned_in_object_store_calls_by_date( "azure_blob", "LIST", @@ -1114,10 +895,6 @@ impl ObjectStorage for BlobStore { resp } Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -1135,10 +912,7 @@ impl ObjectStorage for BlobStore { relative_path: &RelativePath, ) -> Result, ObjectStorageError> { let prefix = object_store::path::Path::from(relative_path.as_str()); - - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&prefix)).await; - let list_elapsed = list_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "azure_blob", "LIST", @@ -1146,9 +920,6 @@ impl ObjectStorage for BlobStore { ); let resp = match resp { Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); increment_files_scanned_in_object_store_calls_by_date( "azure_blob", "LIST", @@ -1159,10 +930,6 @@ impl ObjectStorage for BlobStore { resp } Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index a11d786ef..93c7a954f 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -23,13 +23,12 @@ use std::{ Arc, atomic::{AtomicU64, Ordering}, }, - time::{Duration, Instant}, + time::Duration, }; use crate::{ metrics::{ - STORAGE_REQUEST_RESPONSE_TIME, increment_files_scanned_in_object_store_calls_by_date, - increment_object_store_calls_by_date, + increment_files_scanned_in_object_store_calls_by_date, increment_object_store_calls_by_date, }, 
parseable::LogStream, }; @@ -58,8 +57,8 @@ use tracing::error; use super::{ CONNECT_TIMEOUT_SECS, MIN_MULTIPART_UPLOAD_SIZE, ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, REQUEST_TIMEOUT_SECS, - STREAM_METADATA_FILE_NAME, metrics_layer::MetricLayer, metrics_layer::error_to_status_code, - object_storage::parseable_json_path, to_object_store_path, + STREAM_METADATA_FILE_NAME, metrics_layer::MetricLayer, object_storage::parseable_json_path, + to_object_store_path, }; #[derive(Debug, Clone, clap::Args)] @@ -177,18 +176,11 @@ pub struct Gcs { impl Gcs { async fn _get_object(&self, path: &RelativePath) -> Result { - let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; - let elapsed = time.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("gcs", "GET", &Utc::now().date_naive().to_string()); - match resp { Ok(resp) => { let body: Bytes = resp.bytes().await?; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "GET", "200"]) - .observe(elapsed); increment_files_scanned_in_object_store_calls_by_date( "gcs", "GET", @@ -197,13 +189,7 @@ impl Gcs { ); Ok(body) } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "GET", status_code]) - .observe(elapsed); - Err(err.into()) - } + Err(err) => Err(err.into()), } } @@ -212,16 +198,10 @@ impl Gcs { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let elapsed = time.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("gcs", "PUT", &Utc::now().date_naive().to_string()); match resp { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT", "200"]) - .observe(elapsed); increment_files_scanned_in_object_store_calls_by_date( "gcs", "PUT", @@ -230,13 +210,7 @@ impl Gcs { ); Ok(()) } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT", status_code]) - .observe(elapsed); - Err(err.into()) - } + Err(err) => Err(err.into()), } } @@ -244,14 +218,8 @@ impl Gcs { let files_scanned = Arc::new(AtomicU64::new(0)); let files_deleted = Arc::new(AtomicU64::new(0)); // Track LIST operation - let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); - object_stream .for_each_concurrent(None, |x| async { files_scanned.fetch_add(1, Ordering::Relaxed); @@ -259,27 +227,17 @@ impl Gcs { match x { Ok(obj) => { files_deleted.fetch_add(1, Ordering::Relaxed); - let delete_start = Instant::now(); let delete_resp = self.client.delete(&obj.location).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "gcs", "DELETE", &Utc::now().date_naive().to_string(), ); - match delete_resp { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "DELETE", "200"]) - .observe(delete_elapsed); - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "DELETE", status_code]) - .observe(delete_elapsed); - error!("Failed to delete object during 
delete stream: {:?}", err); - } + if delete_resp.is_err() { + error!( + "Failed to delete object during delete stream: {:?}", + delete_resp + ); } } Err(err) => { @@ -301,31 +259,19 @@ impl Gcs { files_deleted.load(Ordering::Relaxed), &Utc::now().date_naive().to_string(), ); - // Note: Individual DELETE calls are tracked inside the concurrent loop Ok(()) } async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { - let list_start = Instant::now(); let resp: Result = self .client .list_with_delimiter(Some(&(stream.into()))) .await; - let list_elapsed = list_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); let resp = match resp { - Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); - resp - } + Ok(resp) => resp, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -352,16 +298,10 @@ impl Gcs { async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { let bytes = tokio::fs::read(path).await?; - let put_start = Instant::now(); let result = self.client.put(&key.into(), bytes.into()).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("gcs", "PUT", &Utc::now().date_naive().to_string()); match result { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT", "200"]) - .observe(put_elapsed); increment_files_scanned_in_object_store_calls_by_date( "gcs", "PUT", @@ -370,13 +310,7 @@ impl Gcs { ); Ok(()) } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT", status_code]) - .observe(put_elapsed); - Err(err.into()) - } + Err(err) => Err(err.into()), } } @@ -389,26 +323,15 @@ impl Gcs { let location = &to_object_store_path(key); // Track multipart initiation - let multipart_start = Instant::now(); let async_writer = self.client.put_multipart(location).await; - let multipart_elapsed = multipart_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "gcs", "PUT_MULTIPART", &Utc::now().date_naive().to_string(), ); let mut async_writer = match async_writer { - Ok(writer) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT_MULTIPART", "200"]) - .observe(multipart_elapsed); - writer - } + Ok(writer) => writer, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT_MULTIPART", status_code]) - .observe(multipart_elapsed); return Err(err.into()); } }; @@ -420,9 +343,7 @@ impl Gcs { file.read_to_end(&mut data).await?; // Track single PUT operation for small files - let put_start = Instant::now(); let result = self.client.put(location, data.into()).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "gcs", "PUT", @@ -430,9 +351,6 @@ impl Gcs { ); match result { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT", "200"]) - .observe(put_elapsed); increment_files_scanned_in_object_store_calls_by_date( "gcs", "PUT", @@ -441,10 +359,6 @@ impl Gcs { ); } Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT", status_code]) - .observe(put_elapsed); return Err(err.into()); } } @@ 
-472,40 +386,20 @@ impl Gcs { let part_data = data[start_pos..end_pos].to_vec(); // Track individual part upload - let part_start = Instant::now(); let result = async_writer.put_part(part_data.into()).await; - let part_elapsed = part_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "gcs", "PUT_MULTIPART", &Utc::now().date_naive().to_string(), ); - match result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT_MULTIPART", "200"]) - .observe(part_elapsed); - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT_MULTIPART", status_code]) - .observe(part_elapsed); - return Err(err.into()); - } + if result.is_err() { + return Err(result.err().unwrap().into()); } } // Track multipart completion - let complete_start = Instant::now(); let complete_result = async_writer.complete().await; - let complete_elapsed = complete_start.elapsed().as_secs_f64(); - if let Err(err) = complete_result { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT_MULTIPART_COMPLETE", status_code]) - .observe(complete_elapsed); if let Err(abort_err) = async_writer.abort().await { error!( "Failed to abort multipart upload after completion failure: {:?}", @@ -513,10 +407,6 @@ impl Gcs { ); } return Err(err.into()); - } else { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT_MULTIPART_COMPLETE", "200"]) - .observe(complete_elapsed); } } Ok(()) @@ -531,16 +421,10 @@ impl ObjectStorage for Gcs { ) -> Result { let path = &to_object_store_path(path); - let head_start = Instant::now(); let meta = self.client.head(path).await; - let head_elapsed = head_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("gcs", "HEAD", &Utc::now().date_naive().to_string()); let meta = match meta { Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "HEAD", "200"]) - .observe(head_elapsed); increment_files_scanned_in_object_store_calls_by_date( "gcs", "HEAD", @@ -550,10 +434,6 @@ impl ObjectStorage for Gcs { meta } Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "HEAD", status_code]) - .observe(head_elapsed); return Err(err.into()); } }; @@ -572,29 +452,15 @@ impl ObjectStorage for Gcs { } async fn head(&self, path: &RelativePath) -> Result { - let head_start = Instant::now(); let result = self.client.head(&to_object_store_path(path)).await; - let head_elapsed = head_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("gcs", "HEAD", &Utc::now().date_naive().to_string()); - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "HEAD", "200"]) - .observe(head_elapsed); - increment_files_scanned_in_object_store_calls_by_date( - "gcs", - "HEAD", - 1, - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = error_to_status_code(err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "HEAD", status_code]) - .observe(head_elapsed); - } + if result.is_ok() { + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); } Ok(result?) 
@@ -615,8 +481,6 @@ impl ObjectStorage for Gcs { self.root.clone() }; - // Track list operation - let list_start = Instant::now(); let mut list_stream = self.client.list(Some(&prefix)); let mut res = vec![]; @@ -644,9 +508,6 @@ impl ObjectStorage for Gcs { .map_err(ObjectStorageError::PathError)?, ) .await?; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "GET", "200"]) - .observe(list_start.elapsed().as_secs_f64()); increment_files_scanned_in_object_store_calls_by_date( "gcs", "GET", @@ -660,10 +521,6 @@ impl ObjectStorage for Gcs { ); res.push(byts); } - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); // Record total files scanned increment_files_scanned_in_object_store_calls_by_date( @@ -682,8 +539,6 @@ impl ObjectStorage for Gcs { let mut path_arr = vec![]; let mut files_scanned = 0; - // Track list operation - let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&self.root)); increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); @@ -702,10 +557,6 @@ impl ObjectStorage for Gcs { path_arr.push(RelativePathBuf::from(meta.location.as_ref())); } } - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); // Record total files scanned increment_files_scanned_in_object_store_calls_by_date( "gcs", @@ -735,61 +586,34 @@ impl ObjectStorage for Gcs { } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { - let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(path)).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date("gcs", "DELETE", &Utc::now().date_naive().to_string()); - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "DELETE", "200"]) - .observe(delete_elapsed); - // Record single file deleted - increment_files_scanned_in_object_store_calls_by_date( - "gcs", - "DELETE", - 1, - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = error_to_status_code(err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "DELETE", status_code]) - .observe(delete_elapsed); - } + if result.is_ok() { + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "DELETE", + 1, + &Utc::now().date_naive().to_string(), + ); } Ok(result?) } async fn check(&self) -> Result<(), ObjectStorageError> { - let head_start = Instant::now(); let result = self .client .head(&to_object_store_path(&parseable_json_path())) .await; - let head_elapsed = head_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date("gcs", "HEAD", &Utc::now().date_naive().to_string()); - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "HEAD", "200"]) - .observe(head_elapsed); - increment_files_scanned_in_object_store_calls_by_date( - "gcs", - "HEAD", - 1, - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = error_to_status_code(err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "HEAD", status_code]) - .observe(head_elapsed); - } + if result.is_ok() { + increment_files_scanned_in_object_store_calls_by_date( + "gcs", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); } Ok(result.map(|_| ())?) 
@@ -804,16 +628,10 @@ impl ObjectStorage for Gcs { async fn try_delete_node_meta(&self, node_filename: String) -> Result<(), ObjectStorageError> { let file = RelativePathBuf::from(&node_filename); - let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(&file)).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("gcs", "DELETE", &Utc::now().date_naive().to_string()); match result { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "DELETE", "200"]) - .observe(delete_elapsed); increment_files_scanned_in_object_store_calls_by_date( "gcs", "DELETE", @@ -822,14 +640,7 @@ impl ObjectStorage for Gcs { ); Ok(()) } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "DELETE", status_code]) - .observe(delete_elapsed); - - Err(err.into()) - } + Err(err) => Err(err.into()), } } @@ -841,14 +652,7 @@ impl ObjectStorage for Gcs { } async fn list_old_streams(&self) -> Result, ObjectStorageError> { - // Track LIST operation - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(None).await?; - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); - let common_prefixes = resp.common_prefixes; // get all dirs increment_files_scanned_in_object_store_calls_by_date( "gcs", @@ -870,28 +674,12 @@ impl ObjectStorage for Gcs { for dir in &dirs { let key = format!("{dir}/{STREAM_METADATA_FILE_NAME}"); let task = async move { - let head_start = Instant::now(); let result = self.client.head(&StorePath::from(key)).await; - let head_elapsed = head_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "gcs", "HEAD", &Utc::now().date_naive().to_string(), ); - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "HEAD", "200"]) - .observe(head_elapsed); - } - Err(err) => { - let status_code = error_to_status_code(err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "HEAD", status_code]) - .observe(head_elapsed); - } - } - result.map(|_| ()) }; stream_json_check.push(task); @@ -919,12 +707,7 @@ impl ObjectStorage for Gcs { date: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/", stream_name, date)); - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); increment_files_scanned_in_object_store_calls_by_date( "gcs", "LIST", @@ -960,12 +743,7 @@ impl ObjectStorage for Gcs { hour: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/{}/", stream_name, date, hour)); - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); increment_files_scanned_in_object_store_calls_by_date( "gcs", "LIST", @@ -1019,15 +797,10 @@ impl ObjectStorage for Gcs { async fn list_dirs(&self) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from("/"); - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await; - let list_elapsed = 
list_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); let resp = match resp { Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); increment_files_scanned_in_object_store_calls_by_date( "gcs", "LIST", @@ -1038,10 +811,6 @@ impl ObjectStorage for Gcs { resp } Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -1060,15 +829,10 @@ impl ObjectStorage for Gcs { ) -> Result, ObjectStorageError> { let prefix = object_store::path::Path::from(relative_path.as_str()); - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&prefix)).await; - let list_elapsed = list_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date("gcs", "LIST", &Utc::now().date_naive().to_string()); let resp = match resp { Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); increment_files_scanned_in_object_store_calls_by_date( "gcs", "LIST", @@ -1079,10 +843,6 @@ impl ObjectStorage for Gcs { resp } Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; diff --git a/src/storage/localfs.rs b/src/storage/localfs.rs index 765fc4101..6bbeafcc2 100644 --- a/src/storage/localfs.rs +++ b/src/storage/localfs.rs @@ -20,7 +20,6 @@ use std::{ collections::HashSet, path::{Path, PathBuf}, sync::Arc, - time::Instant, }; use async_trait::async_trait; @@ -40,8 +39,7 @@ use tokio_stream::wrappers::ReadDirStream; use crate::{ handlers::http::users::USERS_ROOT_DIR, metrics::{ - STORAGE_REQUEST_RESPONSE_TIME, increment_files_scanned_in_object_store_calls_by_date, - increment_object_store_calls_by_date, + increment_files_scanned_in_object_store_calls_by_date, increment_object_store_calls_by_date, }, option::validation, parseable::LogStream, @@ -130,18 +128,6 @@ impl ObjectStorage for LocalFS { ))) } async fn head(&self, _path: &RelativePath) -> Result { - // Record attempt to access file (even though operation not implemented) - increment_files_scanned_in_object_store_calls_by_date( - "localfs", - "HEAD", - 1, - &Utc::now().date_naive().to_string(), - ); - increment_object_store_calls_by_date( - "localfs", - "HEAD", - &Utc::now().date_naive().to_string(), - ); Err(ObjectStorageError::UnhandledError(Box::new( std::io::Error::new( std::io::ErrorKind::Unsupported, @@ -173,15 +159,9 @@ impl ObjectStorage for LocalFS { }; } - let get_start = Instant::now(); let file_result = fs::read(file_path).await; - let get_elapsed = get_start.elapsed().as_secs_f64(); - let res: Result = match file_result { Ok(x) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "GET", "200"]) - .observe(get_elapsed); // Record single file accessed successfully increment_files_scanned_in_object_store_calls_by_date( "localfs", @@ -198,14 +178,8 @@ impl ObjectStorage for LocalFS { } Err(e) => { if e.kind() == std::io::ErrorKind::NotFound { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "GET", "404"]) - .observe(get_elapsed); Err(ObjectStorageError::NoSuchKey(path.to_string())) } else { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "GET", "500"]) - .observe(get_elapsed); 
Err(ObjectStorageError::UnhandledError(Box::new(e))) } } @@ -220,22 +194,10 @@ impl ObjectStorage for LocalFS { let mut path_arr = vec![]; let mut files_scanned = 0u64; - // Track list operation - let list_start = Instant::now(); let entries_result = fs::read_dir(&self.root).await; - let list_elapsed = list_start.elapsed().as_secs_f64(); - let mut entries = match entries_result { - Ok(entries) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", "200"]) - .observe(list_elapsed); - entries - } + Ok(entries) => entries, Err(err) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", "404"]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -279,7 +241,6 @@ impl ObjectStorage for LocalFS { base_path: Option<&RelativePath>, filter_func: Box bool + std::marker::Send + 'static>, ) -> Result, ObjectStorageError> { - let list_start = Instant::now(); let prefix = if let Some(path) = base_path { path.to_path(&self.root) } else { @@ -287,19 +248,9 @@ impl ObjectStorage for LocalFS { }; let entries_result = fs::read_dir(&prefix).await; - let list_elapsed = list_start.elapsed().as_secs_f64(); - let mut entries = match entries_result { - Ok(entries) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", "200"]) - .observe(list_elapsed); - entries - } + Ok(entries) => entries, Err(err) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", "404"]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -324,21 +275,12 @@ impl ObjectStorage for LocalFS { continue; } - let file_read_start = Instant::now(); let file_result = fs::read(entry.path()).await; - let file_read_elapsed = file_read_start.elapsed().as_secs_f64(); - match file_result { Ok(file) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "GET", "200"]) - .observe(file_read_elapsed); res.push(file.into()); } Err(err) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "GET", "404"]) - .observe(file_read_elapsed); return Err(err.into()); } } @@ -370,33 +312,20 @@ impl ObjectStorage for LocalFS { fs::create_dir_all(parent).await?; } - let put_start = Instant::now(); let res = fs::write(path, resource).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); - - match &res { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "PUT", "200"]) - .observe(put_elapsed); - // Record single file written successfully - increment_files_scanned_in_object_store_calls_by_date( - "localfs", - "PUT", - 1, - &Utc::now().date_naive().to_string(), - ); - increment_object_store_calls_by_date( - "localfs", - "PUT", - &Utc::now().date_naive().to_string(), - ); - } - Err(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "PUT", "500"]) - .observe(put_elapsed); - } + if res.is_ok() { + // Record single file written successfully + increment_files_scanned_in_object_store_calls_by_date( + "localfs", + "PUT", + 1, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "localfs", + "PUT", + &Utc::now().date_naive().to_string(), + ); } res.map_err(Into::into) @@ -405,28 +334,14 @@ impl ObjectStorage for LocalFS { async fn delete_prefix(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { let path = self.path_in_root(path); - let delete_start = Instant::now(); let result = tokio::fs::remove_dir_all(path).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); - - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - 
.with_label_values(&["localfs", "DELETE", "200"]) - .observe(delete_elapsed); - } - Err(err) => { - let status_code = match err.kind() { - std::io::ErrorKind::NotFound => "404", - std::io::ErrorKind::PermissionDenied => "403", - _ => "500", - }; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "DELETE", status_code]) - .observe(delete_elapsed); - } + if result.is_ok() { + increment_object_store_calls_by_date( + "localfs", + "DELETE", + &Utc::now().date_naive().to_string(), + ); } - result?; Ok(()) } @@ -434,38 +349,20 @@ impl ObjectStorage for LocalFS { async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { let path = self.path_in_root(path); - let delete_start = Instant::now(); let result = tokio::fs::remove_file(path).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); - - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "DELETE", "200"]) - .observe(delete_elapsed); - // Record single file deleted successfully - increment_files_scanned_in_object_store_calls_by_date( - "localfs", - "DELETE", - 1, - &Utc::now().date_naive().to_string(), - ); - increment_object_store_calls_by_date( - "localfs", - "DELETE", - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = match err.kind() { - std::io::ErrorKind::NotFound => "404", - std::io::ErrorKind::PermissionDenied => "403", - _ => "500", - }; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "DELETE", status_code]) - .observe(delete_elapsed); - } + if result.is_ok() { + // Record single file deleted successfully + increment_files_scanned_in_object_store_calls_by_date( + "localfs", + "DELETE", + 1, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "localfs", + "DELETE", + &Utc::now().date_naive().to_string(), + ); } result?; @@ -473,31 +370,13 @@ impl ObjectStorage for LocalFS { } async fn check(&self) -> Result<(), ObjectStorageError> { - let check_start = Instant::now(); let result = fs::create_dir_all(&self.root).await; - let check_elapsed = check_start.elapsed().as_secs_f64(); - - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "HEAD", "200"]) - .observe(check_elapsed); - increment_object_store_calls_by_date( - "localfs", - "HEAD", - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = match err.kind() { - std::io::ErrorKind::PermissionDenied => "403", - std::io::ErrorKind::NotFound => "404", - _ => "500", - }; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "HEAD", status_code]) - .observe(check_elapsed); - } + if result.is_ok() { + increment_object_store_calls_by_date( + "localfs", + "HEAD", + &Utc::now().date_naive().to_string(), + ); } result.map_err(|e| ObjectStorageError::UnhandledError(e.into())) @@ -506,31 +385,13 @@ impl ObjectStorage for LocalFS { async fn delete_stream(&self, stream_name: &str) -> Result<(), ObjectStorageError> { let path = self.root.join(stream_name); - let delete_start = Instant::now(); let result = fs::remove_dir_all(path).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); - - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "DELETE", "200"]) - .observe(delete_elapsed); - increment_object_store_calls_by_date( - "localfs", - "DELETE", - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = match err.kind() { - std::io::ErrorKind::NotFound => 
"404", - std::io::ErrorKind::PermissionDenied => "403", - _ => "500", - }; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "DELETE", status_code]) - .observe(delete_elapsed); - } + if result.is_ok() { + increment_object_store_calls_by_date( + "localfs", + "DELETE", + &Utc::now().date_naive().to_string(), + ); } Ok(result?) @@ -539,39 +400,19 @@ impl ObjectStorage for LocalFS { async fn try_delete_node_meta(&self, node_filename: String) -> Result<(), ObjectStorageError> { let path = self.root.join(node_filename); - let delete_start = Instant::now(); let result = fs::remove_file(path).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); - - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "DELETE", "200"]) - .observe(delete_elapsed); - increment_object_store_calls_by_date( - "localfs", - "DELETE", - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = match err.kind() { - std::io::ErrorKind::NotFound => "404", - std::io::ErrorKind::PermissionDenied => "403", - _ => "500", - }; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "DELETE", status_code]) - .observe(delete_elapsed); - } + if result.is_ok() { + increment_object_store_calls_by_date( + "localfs", + "DELETE", + &Utc::now().date_naive().to_string(), + ); } Ok(result?) } async fn list_streams(&self) -> Result, ObjectStorageError> { - let list_start = Instant::now(); - let ignore_dir = &[ "lost+found", PARSEABLE_ROOT_DIRECTORY, @@ -581,13 +422,8 @@ impl ObjectStorage for LocalFS { ]; let result = fs::read_dir(&self.root).await; - let list_elapsed = list_start.elapsed().as_secs_f64(); - let directories = match result { Ok(read_dir) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", "200"]) - .observe(list_elapsed); increment_object_store_calls_by_date( "localfs", "LIST", @@ -596,14 +432,6 @@ impl ObjectStorage for LocalFS { ReadDirStream::new(read_dir) } Err(err) => { - let status_code = match err.kind() { - std::io::ErrorKind::NotFound => "404", - std::io::ErrorKind::PermissionDenied => "403", - _ => "500", - }; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -622,8 +450,6 @@ impl ObjectStorage for LocalFS { } async fn list_old_streams(&self) -> Result, ObjectStorageError> { - let list_start = Instant::now(); - let ignore_dir = &[ "lost+found", PARSEABLE_ROOT_DIRECTORY, @@ -632,13 +458,8 @@ impl ObjectStorage for LocalFS { ]; let result = fs::read_dir(&self.root).await; - let list_elapsed = list_start.elapsed().as_secs_f64(); - let directories = match result { Ok(read_dir) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", "200"]) - .observe(list_elapsed); increment_object_store_calls_by_date( "localfs", "LIST", @@ -647,14 +468,6 @@ impl ObjectStorage for LocalFS { ReadDirStream::new(read_dir) } Err(err) => { - let status_code = match err.kind() { - std::io::ErrorKind::NotFound => "404", - std::io::ErrorKind::PermissionDenied => "403", - _ => "500", - }; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -673,26 +486,17 @@ impl ObjectStorage for LocalFS { } async fn list_dirs(&self) -> Result, ObjectStorageError> { - let list_start = Instant::now(); let result = fs::read_dir(&self.root).await; - let list_elapsed = list_start.elapsed().as_secs_f64(); - let read_dir = 
match result { Ok(read_dir) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", "200"]) - .observe(list_elapsed); + increment_object_store_calls_by_date( + "localfs", + "LIST", + &Utc::now().date_naive().to_string(), + ); read_dir } Err(err) => { - let status_code = match err.kind() { - std::io::ErrorKind::NotFound => "404", - std::io::ErrorKind::PermissionDenied => "403", - _ => "500", - }; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -719,26 +523,10 @@ impl ObjectStorage for LocalFS { ) -> Result, ObjectStorageError> { let root = self.root.join(relative_path.as_str()); - let list_start = Instant::now(); let result = fs::read_dir(root).await; - let list_elapsed = list_start.elapsed().as_secs_f64(); - let read_dir = match result { - Ok(read_dir) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", "200"]) - .observe(list_elapsed); - read_dir - } + Ok(read_dir) => read_dir, Err(err) => { - let status_code = match err.kind() { - std::io::ErrorKind::NotFound => "404", - std::io::ErrorKind::PermissionDenied => "403", - _ => "500", - }; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -762,15 +550,9 @@ impl ObjectStorage for LocalFS { async fn list_dates(&self, stream_name: &str) -> Result, ObjectStorageError> { let path = self.root.join(stream_name); - let list_start = Instant::now(); let result = fs::read_dir(&path).await; - let list_elapsed = list_start.elapsed().as_secs_f64(); - let read_dir = match result { Ok(read_dir) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", "200"]) - .observe(list_elapsed); increment_object_store_calls_by_date( "localfs", "LIST", @@ -779,14 +561,6 @@ impl ObjectStorage for LocalFS { read_dir } Err(err) => { - let status_code = match err.kind() { - std::io::ErrorKind::NotFound => "404", - std::io::ErrorKind::PermissionDenied => "403", - _ => "500", - }; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -837,7 +611,6 @@ impl ObjectStorage for LocalFS { } async fn upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - let upload_start = Instant::now(); let op = CopyOptions { overwrite: true, skip_exist: true, @@ -849,13 +622,8 @@ impl ObjectStorage for LocalFS { } let result = fs_extra::file::copy(path, to_path, &op); - let upload_elapsed = upload_start.elapsed().as_secs_f64(); - match result { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "PUT", "200"]) - .observe(upload_elapsed); increment_object_store_calls_by_date( "localfs", "PUT", @@ -863,12 +631,7 @@ impl ObjectStorage for LocalFS { ); Ok(()) } - Err(err) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["localfs", "PUT", "500"]) - .observe(upload_elapsed); - Err(err.into()) - } + Err(err) => Err(err.into()), } } diff --git a/src/storage/s3.rs b/src/storage/s3.rs index 57fe60992..853fef8b7 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -25,7 +25,7 @@ use std::{ Arc, atomic::{AtomicU64, Ordering}, }, - time::{Duration, Instant}, + time::Duration, }; use async_trait::async_trait; @@ -52,8 +52,7 @@ use tracing::error; use crate::{ metrics::{ - STORAGE_REQUEST_RESPONSE_TIME, increment_files_scanned_in_object_store_calls_by_date, - 
increment_object_store_calls_by_date, + increment_files_scanned_in_object_store_calls_by_date, increment_object_store_calls_by_date, }, parseable::LogStream, }; @@ -61,8 +60,8 @@ use crate::{ use super::{ CONNECT_TIMEOUT_SECS, MIN_MULTIPART_UPLOAD_SIZE, ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, REQUEST_TIMEOUT_SECS, - STREAM_METADATA_FILE_NAME, metrics_layer::MetricLayer, metrics_layer::error_to_status_code, - object_storage::parseable_json_path, to_object_store_path, + STREAM_METADATA_FILE_NAME, metrics_layer::MetricLayer, object_storage::parseable_json_path, + to_object_store_path, }; // in bytes @@ -339,18 +338,12 @@ pub struct S3 { impl S3 { async fn _get_object(&self, path: &RelativePath) -> Result { - let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; - let elapsed = time.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("s3", "GET", &Utc::now().date_naive().to_string()); match resp { Ok(resp) => { let body = resp.bytes().await?; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "GET", "200"]) - .observe(elapsed); increment_files_scanned_in_object_store_calls_by_date( "s3", "GET", @@ -359,13 +352,7 @@ impl S3 { ); Ok(body) } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "GET", status_code]) - .observe(elapsed); - Err(err.into()) - } + Err(err) => Err(err.into()), } } @@ -374,16 +361,10 @@ impl S3 { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let elapsed = time.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("s3", "PUT", &Utc::now().date_naive().to_string()); match resp { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT", "200"]) - .observe(elapsed); increment_files_scanned_in_object_store_calls_by_date( "s3", "PUT", @@ -392,13 +373,7 @@ impl S3 { ); Ok(()) } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT", status_code]) - .observe(elapsed); - Err(err.into()) - } + Err(err) => Err(err.into()), } } @@ -406,12 +381,7 @@ impl S3 { let files_scanned = Arc::new(AtomicU64::new(0)); let files_deleted = Arc::new(AtomicU64::new(0)); // Track LIST operation - let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); object_stream @@ -421,27 +391,17 @@ impl S3 { match x { Ok(obj) => { files_deleted.fetch_add(1, Ordering::Relaxed); - let delete_start = Instant::now(); let delete_resp = self.client.delete(&obj.location).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "s3", "DELETE", &Utc::now().date_naive().to_string(), ); - match delete_resp { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "DELETE", "200"]) - .observe(delete_elapsed); - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "DELETE", status_code]) - .observe(delete_elapsed); - error!("Failed to delete object during delete stream: 
{:?}", err); - } + if delete_resp.is_err() { + error!( + "Failed to delete object during delete stream: {:?}", + delete_resp + ); } } Err(err) => { @@ -463,31 +423,19 @@ impl S3 { files_deleted.load(Ordering::Relaxed), &Utc::now().date_naive().to_string(), ); - // Note: Individual DELETE calls are tracked inside the concurrent loop Ok(()) } async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { - let list_start = Instant::now(); let resp: Result = self .client .list_with_delimiter(Some(&(stream.into()))) .await; - let list_elapsed = list_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); let resp = match resp { - Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); - resp - } + Ok(resp) => resp, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -514,16 +462,10 @@ impl S3 { async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { let bytes = tokio::fs::read(path).await?; - let put_start = Instant::now(); let result = self.client.put(&key.into(), bytes.into()).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("s3", "PUT", &Utc::now().date_naive().to_string()); match result { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT", "200"]) - .observe(put_elapsed); increment_files_scanned_in_object_store_calls_by_date( "s3", "PUT", @@ -532,13 +474,7 @@ impl S3 { ); Ok(()) } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT", status_code]) - .observe(put_elapsed); - Err(err.into()) - } + Err(err) => Err(err.into()), } } @@ -551,27 +487,15 @@ impl S3 { let location = &to_object_store_path(key); // Track multipart initiation - let multipart_start = Instant::now(); let async_writer = self.client.put_multipart(location).await; - let multipart_elapsed = multipart_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "s3", "PUT_MULTIPART", &Utc::now().date_naive().to_string(), ); let mut async_writer = match async_writer { - Ok(writer) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT_MULTIPART", "200"]) - .observe(multipart_elapsed); - - writer - } + Ok(writer) => writer, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT_MULTIPART", status_code]) - .observe(multipart_elapsed); return Err(err.into()); } }; @@ -583,16 +507,10 @@ impl S3 { file.read_to_end(&mut data).await?; // Track single PUT operation for small files - let put_start = Instant::now(); let result = self.client.put(location, data.into()).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("s3", "PUT", &Utc::now().date_naive().to_string()); match result { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT", "200"]) - .observe(put_elapsed); increment_files_scanned_in_object_store_calls_by_date( "s3", "PUT", @@ -601,10 +519,6 @@ impl S3 { ); } Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT", status_code]) - .observe(put_elapsed); return Err(err.into()); } } @@ -637,55 +551,25 @@ impl S3 { 
let part_data = data[start_pos..end_pos].to_vec(); // Track individual part upload - let part_start = Instant::now(); let result = async_writer.put_part(part_data.into()).await; - let part_elapsed = part_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "s3", "PUT_MULTIPART", &Utc::now().date_naive().to_string(), ); - match result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT_MULTIPART", "200"]) - .observe(part_elapsed); - increment_files_scanned_in_object_store_calls_by_date( - "s3", - "PUT_MULTIPART", - 1, - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT_MULTIPART", status_code]) - .observe(part_elapsed); - return Err(err.into()); - } + if result.is_err() { + return Err(result.err().unwrap().into()); } // upload_parts.push(part_number as u64 + 1); } // Track multipart completion - let complete_start = Instant::now(); let complete_result = async_writer.complete().await; - let complete_elapsed = complete_start.elapsed().as_secs_f64(); - if let Err(err) = complete_result { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT_MULTIPART_COMPLETE", status_code]) - .observe(complete_elapsed); error!("Failed to complete multipart upload. {:?}", err); async_writer.abort().await?; return Err(err.into()); - } else { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT_MULTIPART_COMPLETE", "200"]) - .observe(complete_elapsed); } } Ok(()) @@ -699,17 +583,10 @@ impl ObjectStorage for S3 { path: &RelativePath, ) -> Result { let path = &to_object_store_path(path); - - let head_start = Instant::now(); let meta = self.client.head(path).await; - let head_elapsed = head_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("s3", "HEAD", &Utc::now().date_naive().to_string()); let meta = match meta { Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "HEAD", "200"]) - .observe(head_elapsed); increment_files_scanned_in_object_store_calls_by_date( "s3", "HEAD", @@ -719,10 +596,6 @@ impl ObjectStorage for S3 { meta } Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "HEAD", status_code]) - .observe(head_elapsed); return Err(err.into()); } }; @@ -741,29 +614,15 @@ impl ObjectStorage for S3 { } async fn head(&self, path: &RelativePath) -> Result { - let head_start = Instant::now(); let result = self.client.head(&to_object_store_path(path)).await; - let head_elapsed = head_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("s3", "HEAD", &Utc::now().date_naive().to_string()); - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "HEAD", "200"]) - .observe(head_elapsed); - increment_files_scanned_in_object_store_calls_by_date( - "s3", - "HEAD", - 1, - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = error_to_status_code(err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "HEAD", status_code]) - .observe(head_elapsed); - } + if result.is_ok() { + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); } Ok(result?) 
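With the hand-rolled timers removed from s3.rs, per-request latency is expected to be recorded by the MetricLayer that wraps the object_store client, feeding the shared storage_request_response_time histogram with provider/method/status labels. Below is only a minimal sketch of that pattern, assuming the histogram's module path and using a placeholder "error" status; the real layer implements the full ObjectStore trait and maps errors via error_to_status_code.

use std::future::Future;
use std::time::Instant;

use crate::metrics::storage::STORAGE_REQUEST_RESPONSE_TIME;

// Illustrative only: time one storage call and record it on the shared
// histogram. The actual MetricLayer in src/storage/metrics_layer.rs wraps
// every object_store method and derives a proper status code from the error.
async fn timed<T, E, F>(provider: &str, method: &str, fut: F) -> Result<T, E>
where
    F: Future<Output = Result<T, E>>,
{
    let start = Instant::now();
    let result = fut.await;
    let status = if result.is_ok() { "200" } else { "error" };
    STORAGE_REQUEST_RESPONSE_TIME
        .with_label_values(&[provider, method, status])
        .observe(start.elapsed().as_secs_f64());
    result
}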
@@ -784,8 +643,6 @@ impl ObjectStorage for S3 { self.root.clone() }; - // Track list operation - let list_start = Instant::now(); let mut list_stream = self.client.list(Some(&prefix)); let mut res = vec![]; @@ -813,9 +670,6 @@ impl ObjectStorage for S3 { .map_err(ObjectStorageError::PathError)?, ) .await?; - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "GET", "200"]) - .observe(list_start.elapsed().as_secs_f64()); increment_files_scanned_in_object_store_calls_by_date( "s3", "GET", @@ -825,11 +679,6 @@ impl ObjectStorage for S3 { increment_object_store_calls_by_date("s3", "GET", &Utc::now().date_naive().to_string()); res.push(byts); } - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); - // Record total files scanned increment_files_scanned_in_object_store_calls_by_date( "s3", @@ -847,8 +696,6 @@ impl ObjectStorage for S3 { let mut path_arr = vec![]; let mut files_scanned = 0; - // Track list operation - let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&self.root)); increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); @@ -867,10 +714,6 @@ impl ObjectStorage for S3 { path_arr.push(RelativePathBuf::from(meta.location.as_ref())); } } - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); // Record total files scanned increment_files_scanned_in_object_store_calls_by_date( "s3", @@ -900,61 +743,34 @@ impl ObjectStorage for S3 { } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { - let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(path)).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date("s3", "DELETE", &Utc::now().date_naive().to_string()); - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "DELETE", "200"]) - .observe(delete_elapsed); - // Record single file deleted - increment_files_scanned_in_object_store_calls_by_date( - "s3", - "DELETE", - 1, - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = error_to_status_code(err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "DELETE", status_code]) - .observe(delete_elapsed); - } + if result.is_ok() { + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "DELETE", + 1, + &Utc::now().date_naive().to_string(), + ); } Ok(result?) 
} async fn check(&self) -> Result<(), ObjectStorageError> { - let head_start = Instant::now(); let result = self .client .head(&to_object_store_path(&parseable_json_path())) .await; - let head_elapsed = head_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date("s3", "HEAD", &Utc::now().date_naive().to_string()); - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "HEAD", "200"]) - .observe(head_elapsed); - increment_files_scanned_in_object_store_calls_by_date( - "s3", - "HEAD", - 1, - &Utc::now().date_naive().to_string(), - ); - } - Err(err) => { - let status_code = error_to_status_code(err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "HEAD", status_code]) - .observe(head_elapsed); - } + if result.is_ok() { + increment_files_scanned_in_object_store_calls_by_date( + "s3", + "HEAD", + 1, + &Utc::now().date_naive().to_string(), + ); } Ok(result.map(|_| ())?) @@ -969,16 +785,10 @@ impl ObjectStorage for S3 { async fn try_delete_node_meta(&self, node_filename: String) -> Result<(), ObjectStorageError> { let file = RelativePathBuf::from(&node_filename); - let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(&file)).await; - let delete_elapsed = delete_start.elapsed().as_secs_f64(); - increment_object_store_calls_by_date("s3", "DELETE", &Utc::now().date_naive().to_string()); match result { Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "DELETE", "200"]) - .observe(delete_elapsed); increment_files_scanned_in_object_store_calls_by_date( "s3", "DELETE", @@ -987,14 +797,7 @@ impl ObjectStorage for S3 { ); Ok(()) } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "DELETE", status_code]) - .observe(delete_elapsed); - - Err(err.into()) - } + Err(err) => Err(err.into()), } } @@ -1006,14 +809,7 @@ impl ObjectStorage for S3 { } async fn list_old_streams(&self) -> Result, ObjectStorageError> { - // Track LIST operation - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(None).await?; - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); - let common_prefixes = resp.common_prefixes; // get all dirs increment_files_scanned_in_object_store_calls_by_date( "s3", @@ -1035,28 +831,12 @@ impl ObjectStorage for S3 { for dir in &dirs { let key = format!("{dir}/{STREAM_METADATA_FILE_NAME}"); let task = async move { - let head_start = Instant::now(); let result = self.client.head(&StorePath::from(key)).await; - let head_elapsed = head_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date( "s3", "HEAD", &Utc::now().date_naive().to_string(), ); - match &result { - Ok(_) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "HEAD", "200"]) - .observe(head_elapsed); - } - Err(err) => { - let status_code = error_to_status_code(err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "HEAD", status_code]) - .observe(head_elapsed); - } - } - result.map(|_| ()) }; stream_json_check.push(task); @@ -1084,12 +864,7 @@ impl ObjectStorage for S3 { date: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/", stream_name, date)); - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - 
.with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); increment_files_scanned_in_object_store_calls_by_date( "s3", "LIST", @@ -1125,12 +900,7 @@ impl ObjectStorage for S3 { hour: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/{}/", stream_name, date, hour)); - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; - let list_elapsed = list_start.elapsed().as_secs_f64(); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); increment_files_scanned_in_object_store_calls_by_date( "s3", "LIST", @@ -1183,16 +953,10 @@ impl ObjectStorage for S3 { async fn list_dirs(&self) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from("/"); - - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await; - let list_elapsed = list_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); let resp = match resp { Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); increment_files_scanned_in_object_store_calls_by_date( "s3", "LIST", @@ -1203,10 +967,6 @@ impl ObjectStorage for S3 { resp } Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -1225,15 +985,10 @@ impl ObjectStorage for S3 { ) -> Result, ObjectStorageError> { let prefix = object_store::path::Path::from(relative_path.as_str()); - let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&prefix)).await; - let list_elapsed = list_start.elapsed().as_secs_f64(); increment_object_store_calls_by_date("s3", "LIST", &Utc::now().date_naive().to_string()); let resp = match resp { Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); increment_files_scanned_in_object_store_calls_by_date( "s3", "LIST", @@ -1244,10 +999,6 @@ impl ObjectStorage for S3 { resp } Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; From b87431f03bfbff4ebe67c2632c6ff7bb1a43e55b Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Fri, 19 Sep 2025 01:51:10 -0700 Subject: [PATCH 12/14] clippy suggestions --- src/storage/localfs.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/storage/localfs.rs b/src/storage/localfs.rs index 6bbeafcc2..2271cbf80 100644 --- a/src/storage/localfs.rs +++ b/src/storage/localfs.rs @@ -278,6 +278,18 @@ impl ObjectStorage for LocalFS { let file_result = fs::read(entry.path()).await; match file_result { Ok(file) => { + // Record total files scanned + increment_files_scanned_in_object_store_calls_by_date( + "localfs", + "GET", + 1, + &Utc::now().date_naive().to_string(), + ); + increment_object_store_calls_by_date( + "localfs", + "GET", + &Utc::now().date_naive().to_string(), + ); res.push(file.into()); } Err(err) => { @@ -286,16 +298,15 @@ impl ObjectStorage for LocalFS { } } - // Record total files scanned increment_files_scanned_in_object_store_calls_by_date( "localfs", - "GET", + "LIST", files_scanned as u64, &Utc::now().date_naive().to_string(), ); increment_object_store_calls_by_date( "localfs", - "GET", + 
"LIST", &Utc::now().date_naive().to_string(), ); From 09b5b59a849ab94eef76161ab1790261cd563cc2 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Tue, 23 Sep 2025 03:01:34 -0700 Subject: [PATCH 13/14] remove unused metrics --- src/metadata.rs | 7 -- src/metrics/mod.rs | 220 ---------------------------------- src/stats.rs | 12 +- src/storage/object_storage.rs | 4 - 4 files changed, 1 insertion(+), 242 deletions(-) diff --git a/src/metadata.rs b/src/metadata.rs index 2d3bcae22..e49fc2119 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -29,7 +29,6 @@ use crate::handlers::TelemetryType; use crate::metrics::{ EVENTS_INGESTED, EVENTS_INGESTED_DATE, EVENTS_INGESTED_SIZE, EVENTS_INGESTED_SIZE_DATE, EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_INGESTED, LIFETIME_EVENTS_INGESTED_SIZE, - TOTAL_EVENTS_INGESTED_DATE, TOTAL_EVENTS_INGESTED_SIZE_DATE, }; use crate::storage::StreamType; use crate::storage::retention::Retention; @@ -60,12 +59,6 @@ pub fn update_stats( LIFETIME_EVENTS_INGESTED_SIZE .with_label_values(&[stream_name, origin]) .add(size as i64); - TOTAL_EVENTS_INGESTED_DATE - .with_label_values(&[origin, &parsed_date]) - .add(num_rows as i64); - TOTAL_EVENTS_INGESTED_SIZE_DATE - .with_label_values(&[origin, &parsed_date]) - .add(size as i64); } /// In order to support backward compatability with streams created before v1.6.4, diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index 749fa56ca..56f3d98b9 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -158,42 +158,6 @@ pub static EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { .expect("metric can be created") }); -pub static TOTAL_EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_events_ingested_date", - "total events ingested on a particular date", - ) - .namespace(METRICS_NAMESPACE), - &["format", "date"], - ) - .expect("metric can be created") -}); - -pub static TOTAL_EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_events_ingested_size_date", - "Total events ingested size in bytes on a particular date", - ) - .namespace(METRICS_NAMESPACE), - &["format", "date"], - ) - .expect("metric can be created") -}); - -pub static TOTAL_EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_events_storage_size_date", - "Total events storage size in bytes on a particular date", - ) - .namespace(METRICS_NAMESPACE), - &["format", "date"], - ) - .expect("metric can be created") -}); - pub static STAGING_FILES: Lazy = Lazy::new(|| { IntGaugeVec::new( Opts::new("staging_files", "Active Staging files").namespace(METRICS_NAMESPACE), @@ -357,141 +321,6 @@ pub static TOTAL_OUTPUT_LLM_TOKENS_BY_DATE: Lazy = Lazy::new(|| { .expect("metric can be created") }); -// Cluster Billing Metrics - Gauge type metrics for cluster-wide aggregated billing data -pub static TOTAL_CLUSTER_EVENTS_INGESTED_BY_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_cluster_events_ingested_by_date", - "Total cluster events ingested by date (Gauge for cluster billing)", - ) - .namespace(METRICS_NAMESPACE), - &["date"], - ) - .expect("metric can be created") -}); - -pub static TOTAL_CLUSTER_EVENTS_INGESTED_SIZE_BY_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_cluster_events_ingested_size_by_date", - "Total cluster events ingested size in bytes by date (Gauge for cluster billing)", - ) - .namespace(METRICS_NAMESPACE), - &["date"], - ) - .expect("metric can be created") -}); - -pub static 
TOTAL_CLUSTER_PARQUETS_STORED_BY_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_cluster_parquets_stored_by_date", - "Total cluster parquet files stored by date (Gauge for cluster billing)", - ) - .namespace(METRICS_NAMESPACE), - &["date"], - ) - .expect("metric can be created") -}); - -pub static TOTAL_CLUSTER_PARQUETS_STORED_SIZE_BY_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_cluster_parquets_stored_size_by_date", - "Total cluster parquet files stored size in bytes by date (Gauge for cluster billing)", - ) - .namespace(METRICS_NAMESPACE), - &["date"], - ) - .expect("metric can be created") -}); - -pub static TOTAL_CLUSTER_QUERY_CALLS_BY_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_cluster_query_calls_by_date", - "Total cluster query calls by date (Gauge for cluster billing)", - ) - .namespace(METRICS_NAMESPACE), - &["date"], - ) - .expect("metric can be created") -}); - -pub static TOTAL_CLUSTER_FILES_SCANNED_IN_QUERY_BY_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_cluster_files_scanned_in_query_by_date", - "Total cluster files scanned in queries by date (Gauge for cluster billing)", - ) - .namespace(METRICS_NAMESPACE), - &["date"], - ) - .expect("metric can be created") -}); - -pub static TOTAL_CLUSTER_BYTES_SCANNED_IN_QUERY_BY_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_cluster_bytes_scanned_in_query_by_date", - "Total cluster bytes scanned in queries by date (Gauge for cluster billing)", - ) - .namespace(METRICS_NAMESPACE), - &["date"], - ) - .expect("metric can be created") -}); - -pub static TOTAL_CLUSTER_OBJECT_STORE_CALLS_BY_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_cluster_object_store_calls_by_date", - "Total cluster object store calls by date (Gauge for cluster billing)", - ) - .namespace(METRICS_NAMESPACE), - &["provider", "method", "date"], - ) - .expect("metric can be created") -}); - -pub static TOTAL_CLUSTER_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE: Lazy = Lazy::new( - || { - IntGaugeVec::new( - Opts::new( - "total_cluster_files_scanned_in_object_store_calls_by_date", - "Total cluster files scanned in object store calls by date (Gauge for cluster billing)", - ) - .namespace(METRICS_NAMESPACE), - &["provider", "method", "date"], - ) - .expect("metric can be created") - }, -); - -pub static TOTAL_CLUSTER_INPUT_LLM_TOKENS_BY_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_cluster_input_llm_tokens_by_date", - "Total cluster input LLM tokens used by date (Gauge for cluster billing)", - ) - .namespace(METRICS_NAMESPACE), - &["provider", "model", "date"], - ) - .expect("metric can be created") -}); - -pub static TOTAL_CLUSTER_OUTPUT_LLM_TOKENS_BY_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( - Opts::new( - "total_cluster_output_llm_tokens_by_date", - "Total cluster output LLM tokens used by date (Gauge for cluster billing)", - ) - .namespace(METRICS_NAMESPACE), - &["provider", "model", "date"], - ) - .expect("metric can be created") -}); - pub static STORAGE_REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { HistogramVec::new( HistogramOpts::new("storage_request_response_time", "Storage Request Latency") @@ -538,15 +367,6 @@ fn custom_metrics(registry: &Registry) { registry .register(Box::new(EVENTS_STORAGE_SIZE_DATE.clone())) .expect("metric can be registered"); - registry - .register(Box::new(TOTAL_EVENTS_INGESTED_DATE.clone())) - .expect("metric can be registered"); - registry - 
.register(Box::new(TOTAL_EVENTS_INGESTED_SIZE_DATE.clone())) - .expect("metric can be registered"); - registry - .register(Box::new(TOTAL_EVENTS_STORAGE_SIZE_DATE.clone())) - .expect("metric can be registered"); registry .register(Box::new(STAGING_FILES.clone())) .expect("metric can be registered"); @@ -595,46 +415,6 @@ fn custom_metrics(registry: &Registry) { registry .register(Box::new(TOTAL_OUTPUT_LLM_TOKENS_BY_DATE.clone())) .expect("metric can be registered"); - // Register cluster billing metrics - registry - .register(Box::new(TOTAL_CLUSTER_EVENTS_INGESTED_BY_DATE.clone())) - .expect("metric can be registered"); - registry - .register(Box::new(TOTAL_CLUSTER_EVENTS_INGESTED_SIZE_BY_DATE.clone())) - .expect("metric can be registered"); - registry - .register(Box::new(TOTAL_CLUSTER_PARQUETS_STORED_BY_DATE.clone())) - .expect("metric can be registered"); - registry - .register(Box::new(TOTAL_CLUSTER_PARQUETS_STORED_SIZE_BY_DATE.clone())) - .expect("metric can be registered"); - registry - .register(Box::new(TOTAL_CLUSTER_QUERY_CALLS_BY_DATE.clone())) - .expect("metric can be registered"); - registry - .register(Box::new( - TOTAL_CLUSTER_FILES_SCANNED_IN_QUERY_BY_DATE.clone(), - )) - .expect("metric can be registered"); - registry - .register(Box::new( - TOTAL_CLUSTER_BYTES_SCANNED_IN_QUERY_BY_DATE.clone(), - )) - .expect("metric can be registered"); - registry - .register(Box::new(TOTAL_CLUSTER_OBJECT_STORE_CALLS_BY_DATE.clone())) - .expect("metric can be registered"); - registry - .register(Box::new( - TOTAL_CLUSTER_FILES_SCANNED_IN_OBJECT_STORE_CALLS_BY_DATE.clone(), - )) - .expect("metric can be registered"); - registry - .register(Box::new(TOTAL_CLUSTER_INPUT_LLM_TOKENS_BY_DATE.clone())) - .expect("metric can be registered"); - registry - .register(Box::new(TOTAL_CLUSTER_OUTPUT_LLM_TOKENS_BY_DATE.clone())) - .expect("metric can be registered"); registry .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) .expect("metric can be registered"); diff --git a/src/stats.rs b/src/stats.rs index 464a22b0a..ced7b10b6 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -29,8 +29,7 @@ use crate::metrics::{ DELETED_EVENTS_STORAGE_SIZE, EVENTS_DELETED, EVENTS_DELETED_SIZE, EVENTS_INGESTED, EVENTS_INGESTED_DATE, EVENTS_INGESTED_SIZE, EVENTS_INGESTED_SIZE_DATE, EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_INGESTED, LIFETIME_EVENTS_INGESTED_SIZE, - LIFETIME_EVENTS_STORAGE_SIZE, STORAGE_SIZE, TOTAL_EVENTS_INGESTED_DATE, - TOTAL_EVENTS_INGESTED_SIZE_DATE, TOTAL_EVENTS_STORAGE_SIZE_DATE, + LIFETIME_EVENTS_STORAGE_SIZE, STORAGE_SIZE, }; use crate::storage::{ObjectStorage, ObjectStorageError, ObjectStoreFormat}; @@ -141,15 +140,6 @@ pub async fn update_deleted_stats( num_row += manifest.events_ingested as i64; ingestion_size += manifest.ingestion_size as i64; storage_size += manifest.storage_size as i64; - TOTAL_EVENTS_INGESTED_DATE - .with_label_values(&["json", &manifest_date]) - .sub(manifest.events_ingested as i64); - TOTAL_EVENTS_INGESTED_SIZE_DATE - .with_label_values(&["json", &manifest_date]) - .sub(manifest.ingestion_size as i64); - TOTAL_EVENTS_STORAGE_SIZE_DATE - .with_label_values(&["parquet", &manifest_date]) - .sub(manifest.storage_size as i64); } } EVENTS_DELETED diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index 858e8740e..225244bb9 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -49,7 +49,6 @@ use crate::handlers::http::fetch_schema; use crate::handlers::http::modal::ingest_server::INGESTOR_EXPECT; use 
crate::handlers::http::modal::ingest_server::INGESTOR_META; use crate::handlers::http::users::{FILTER_DIR, USERS_ROOT_DIR}; -use crate::metrics::TOTAL_EVENTS_STORAGE_SIZE_DATE; use crate::metrics::increment_parquets_stored_by_date; use crate::metrics::increment_parquets_stored_size_by_date; use crate::metrics::{EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_STORAGE_SIZE, STORAGE_SIZE}; @@ -183,9 +182,6 @@ fn update_storage_metrics( LIFETIME_EVENTS_STORAGE_SIZE .with_label_values(&["data", stream_name, "parquet"]) .add(compressed_size as i64); - TOTAL_EVENTS_STORAGE_SIZE_DATE - .with_label_values(&["parquet", file_date_part]) - .add(compressed_size as i64); // billing metrics for parquet storage increment_parquets_stored_by_date(file_date_part); From 4f899a43f19a9937c4a04a6b30341be8cc5accfe Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Tue, 23 Sep 2025 03:21:27 -0700 Subject: [PATCH 14/14] fix table listing prefixes --- src/query/listing_table_builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/listing_table_builder.rs b/src/query/listing_table_builder.rs index a2087d2cc..989734811 100644 --- a/src/query/listing_table_builder.rs +++ b/src/query/listing_table_builder.rs @@ -100,7 +100,7 @@ impl ListingTableBuilder { for prefix in prefixes { match storage.list_dirs_relative(&prefix).await { Ok(paths) => { - listing.extend(paths.into_iter().map(|p| p.to_string())); + listing.extend(paths.into_iter().map(|p| prefix.join(p).to_string())); } Err(e) => { return Err(DataFusionError::External(Box::new(e)));
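The final patch re-applies the prefix to each entry returned by list_dirs_relative before it is added to the listing, so the listing table receives full paths under the prefix rather than bare directory names. A small sketch of the difference, using hypothetical prefix and directory values:

use relative_path::RelativePathBuf;

fn main() {
    // Hypothetical values: the prefix built by the table builder and one
    // directory name returned by list_dirs_relative.
    let prefix = RelativePathBuf::from("mystream/date=2025-08-12");
    let dir = "hour=23";

    // Before the fix the listing only received "hour=23"; joining restores
    // the full path under the prefix.
    let full = prefix.join(dir).to_string();
    assert_eq!(full, "mystream/date=2025-08-12/hour=23");
    println!("{full}");
}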