From d25929b05f02bff37b08056b5639973ba0e4d55d Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 22 Jan 2025 00:00:15 +0530 Subject: [PATCH 01/34] refactor: construct prefix list in-place --- src/utils/mod.rs | 75 +++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 43 deletions(-) diff --git a/src/utils/mod.rs b/src/utils/mod.rs index e539b9e9f..4c4ba8425 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -106,23 +106,21 @@ impl TimePeriod { } pub fn generate_prefixes(&self) -> Vec { - let end_minute = self.end.minute() + u32::from(self.end.second() > 0); - self.generate_date_prefixes( - self.start.date_naive(), - self.end.date_naive(), - (self.start.hour(), self.start.minute()), - (self.end.hour(), end_minute), - ) + let mut prefixes = vec![]; + self.generate_date_prefixes(&mut prefixes); + + prefixes } - pub fn generate_minute_prefixes( + fn generate_minute_prefixes( &self, + prefixes: &mut Vec, prefix: &str, start_minute: u32, end_minute: u32, - ) -> Vec { + ) { if start_minute == end_minute { - return vec![]; + return; } let (start_block, end_block) = ( @@ -134,83 +132,76 @@ impl TimePeriod { // ensure both start and end are within the same hour, else return prefix as is if end_block - start_block >= forbidden_block { - return vec![prefix.to_owned()]; + prefixes.push(prefix.to_owned()); + return; } - let mut prefixes = vec![]; - let push_prefix = |block: u32, prefixes: &mut Vec<_>| { - if let Some(minute_prefix) = - minute_to_prefix(block * self.data_granularity, self.data_granularity) + if let Some(minute_slot) = + minute_to_slot(block * self.data_granularity, self.data_granularity) { - let prefix = prefix.to_owned() + &minute_prefix; + let prefix = prefix.to_owned() + &format!("minute={minute_slot}/",); prefixes.push(prefix); } }; for block in start_block..end_block { - push_prefix(block, &mut prefixes); + push_prefix(block, prefixes); } // NOTE: for block sizes larger than a minute ensure // ensure last block is considered if self.data_granularity > 1 { - push_prefix(end_block, &mut prefixes); + push_prefix(end_block, prefixes); } - - prefixes } - pub fn generate_hour_prefixes( + fn generate_hour_prefixes( &self, + prefixes: &mut Vec, prefix: &str, start_hour: u32, start_minute: u32, end_hour: u32, end_minute: u32, - ) -> Vec { + ) { // ensure both start and end are within the same day if end_hour - start_hour >= 24 { - return vec![prefix.to_owned()]; + prefixes.push(prefix.to_owned()); + return; } - let mut prefixes = vec![]; - for hour in start_hour..=end_hour { if hour == 24 { break; } - let prefix = prefix.to_owned() + &hour_to_prefix(hour); + let prefix = prefix.to_owned() + &format!("hour={hour:02}/"); let is_start = hour == start_hour; let is_end = hour == end_hour; if is_start || is_end { - let minute_prefixes = self.generate_minute_prefixes( + self.generate_minute_prefixes( + prefixes, &prefix, if is_start { start_minute } else { 0 }, if is_end { end_minute } else { 60 }, ); - prefixes.extend(minute_prefixes); } else { prefixes.push(prefix); } } - - prefixes } - pub fn generate_date_prefixes( - &self, - start_date: NaiveDate, - end_date: NaiveDate, - start_time: (u32, u32), - end_time: (u32, u32), - ) -> Vec { - let mut prefixes = vec![]; + fn generate_date_prefixes(&self, prefixes: &mut Vec) { + let end_minute = self.end.minute() + u32::from(self.end.second() > 0); + let start_date = self.start.date_naive(); + let end_date = self.end.date_naive(); + let start_time = (self.start.hour(), self.start.minute()); + let end_time = 
(self.end.hour(), end_minute); let mut date = start_date; while date <= end_date { - let prefix = date_to_prefix(date); + let prefix = format!("date={date}/"); let is_start = date == start_date; let is_end = date == end_date; @@ -219,21 +210,19 @@ impl TimePeriod { if is_start { start_time } else { (0, 0) }, if is_end { end_time } else { (24, 60) }, ); - let hour_prefixes = self.generate_hour_prefixes( + self.generate_hour_prefixes( + prefixes, &prefix, start_hour, start_minute, end_hour, end_minute, ); - prefixes.extend(hour_prefixes); } else { prefixes.push(prefix); } date = date.succ_opt().unwrap(); } - - prefixes } } From bf64fcd76fe10436e506c91d24b60168fee0327e Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 22 Jan 2025 01:01:56 +0530 Subject: [PATCH 02/34] test: `StorageDir` --- Cargo.lock | 11 +- Cargo.toml | 3 +- src/cli.rs | 4 +- src/event/writer/file_writer.rs | 3 +- src/handlers/http/logstream.rs | 2 +- src/handlers/http/modal/ingest_server.rs | 2 +- .../http/modal/query/querier_logstream.rs | 2 +- src/metadata.rs | 3 +- src/migration/metadata_migration.rs | 2 +- src/query/mod.rs | 2 +- src/storage/object_storage.rs | 2 +- src/storage/staging.rs | 182 +++++++++++++++--- 12 files changed, 177 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 313fca411..cfbab6396 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "actix-codec" @@ -3268,7 +3268,7 @@ dependencies = [ "base64 0.22.0", "byteorder", "bytes", - "bzip2 0.4.4", + "bzip2 0.5.0", "cargo_toml", "chrono", "chrono-humanize", @@ -3321,6 +3321,7 @@ dependencies = [ "sha2", "static-files", "sysinfo", + "temp-dir", "thiserror 2.0.9", "thread-priority", "tokio", @@ -4629,6 +4630,12 @@ dependencies = [ "libc", ] +[[package]] +name = "temp-dir" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc1ee6eef34f12f765cb94725905c6312b6610ab2b0940889cfe58dae7bc3c72" + [[package]] name = "tempfile" version = "3.10.1" diff --git a/Cargo.toml b/Cargo.toml index b8a3f1461..5d863ad79 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -137,6 +137,7 @@ zip = { version = "2.2.0", default-features = false, features = ["deflate"] } maplit = "1.0" rstest = "0.23.0" arrow = "53.0.0" +temp-dir = "0.1.14" [package.metadata.parseable_ui] assets-url = "https://github.com/parseablehq/console/releases/download/v0.9.18/build.zip" @@ -152,4 +153,4 @@ codegen-units = 1 rdkafka = { version = "0.36.2", default-features = false, features = ["tokio"] } [target.'cfg(all(target_os = "macos", target_arch = "aarch64"))'.dependencies] -rdkafka = { version = "0.36.2", default-features = false, features = ["tokio"] } \ No newline at end of file +rdkafka = { version = "0.36.2", default-features = false, features = ["tokio"] } diff --git a/src/cli.rs b/src/cli.rs index a46ea01aa..c070a5160 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -114,7 +114,7 @@ pub struct BlobStoreArgs { pub storage: AzureBlobConfig, } -#[derive(Parser, Debug)] +#[derive(Parser, Debug, Default)] pub struct Options { // Authentication #[arg(long, env = "P_USERNAME", help = "Admin username to be set for this Parseable server", default_value = DEFAULT_USERNAME)] @@ -295,7 +295,7 @@ pub struct Options { pub ingestor_endpoint: String, #[command(flatten)] - oidc: Option, + pub oidc: Option, // Kafka configuration (conditionally compiled) #[cfg(any( diff --git a/src/event/writer/file_writer.rs 
b/src/event/writer/file_writer.rs index 0b990421d..a04f770d7 100644 --- a/src/event/writer/file_writer.rs +++ b/src/event/writer/file_writer.rs @@ -25,6 +25,7 @@ use std::fs::{File, OpenOptions}; use std::path::PathBuf; use super::errors::StreamWriterError; +use crate::option::CONFIG; use crate::storage::staging::StorageDir; use chrono::NaiveDateTime; @@ -88,7 +89,7 @@ fn init_new_stream_writer_file( parsed_timestamp: NaiveDateTime, custom_partition_values: &HashMap, ) -> Result<(PathBuf, StreamWriter), StreamWriterError> { - let dir = StorageDir::new(stream_name); + let dir = StorageDir::new(&CONFIG.options, stream_name); let path = dir.path_by_current_time(schema_key, parsed_timestamp, custom_partition_values); std::fs::create_dir_all(dir.data_path)?; diff --git a/src/handlers/http/logstream.rs b/src/handlers/http/logstream.rs index 7eac4e822..497f82e9e 100644 --- a/src/handlers/http/logstream.rs +++ b/src/handlers/http/logstream.rs @@ -67,7 +67,7 @@ pub async fn delete(req: HttpRequest) -> Result { let objectstore = CONFIG.storage().get_object_store(); objectstore.delete_stream(&stream_name).await?; - let stream_dir = StorageDir::new(&stream_name); + let stream_dir = StorageDir::new(&CONFIG.options, &stream_name); if fs::remove_dir_all(&stream_dir.data_path).is_err() { warn!( "failed to delete local data for stream {}. Clean {} manually", diff --git a/src/handlers/http/modal/ingest_server.rs b/src/handlers/http/modal/ingest_server.rs index 215f79478..71e9e6b02 100644 --- a/src/handlers/http/modal/ingest_server.rs +++ b/src/handlers/http/modal/ingest_server.rs @@ -56,7 +56,7 @@ use tracing::error; /// ! have to use a guard before using it pub static INGESTOR_META: Lazy = - Lazy::new(|| staging::get_ingestor_info().expect("Should Be valid Json")); + Lazy::new(|| staging::get_ingestor_info(&CONFIG).expect("Should Be valid Json")); pub struct IngestServer; diff --git a/src/handlers/http/modal/query/querier_logstream.rs b/src/handlers/http/modal/query/querier_logstream.rs index 58277f7b8..00445b8af 100644 --- a/src/handlers/http/modal/query/querier_logstream.rs +++ b/src/handlers/http/modal/query/querier_logstream.rs @@ -66,7 +66,7 @@ pub async fn delete(req: HttpRequest) -> Result { let objectstore = CONFIG.storage().get_object_store(); objectstore.delete_stream(&stream_name).await?; - let stream_dir = StorageDir::new(&stream_name); + let stream_dir = StorageDir::new(&CONFIG.options, &stream_name); if fs::remove_dir_all(&stream_dir.data_path).is_err() { warn!( "failed to delete local data for stream {}. 
Clean {} manually", diff --git a/src/metadata.rs b/src/metadata.rs index c3ff0fce7..d998bab1f 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -36,6 +36,7 @@ use crate::metrics::{ EVENTS_INGESTED_SIZE_DATE, EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_INGESTED, LIFETIME_EVENTS_INGESTED_SIZE, }; +use crate::option::CONFIG; use crate::storage::retention::Retention; use crate::storage::{ObjectStorage, ObjectStoreFormat, StorageDir, StreamType}; use crate::utils::arrow::MergedRecordReader; @@ -368,7 +369,7 @@ impl StreamInfo { } fn update_schema_from_staging(stream_name: &str, current_schema: Schema) -> Schema { - let staging_files = StorageDir::new(stream_name).arrow_files(); + let staging_files = StorageDir::new(&CONFIG.options, stream_name).arrow_files(); let record_reader = MergedRecordReader::try_new(&staging_files).unwrap(); if record_reader.readers.is_empty() { return current_schema; diff --git a/src/migration/metadata_migration.rs b/src/migration/metadata_migration.rs index 4385c93b0..9198dc1da 100644 --- a/src/migration/metadata_migration.rs +++ b/src/migration/metadata_migration.rs @@ -197,7 +197,7 @@ pub async fn migrate_ingester_metadata() -> anyhow::Result, end: DateTime, ) -> HashMap> { - let dir = StorageDir::new(stream_name); + let dir = StorageDir::new(&CONFIG.options, stream_name); let mut files = dir.arrow_files_grouped_by_time(); files.retain(|k, _| path_intersects_query(k, start, end)); files diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index 05c046179..10bab9e04 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -550,7 +550,7 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static { let custom_partition = STREAM_INFO .get_custom_partition(stream) .map_err(|err| ObjectStorageError::UnhandledError(Box::new(err)))?; - let dir = StorageDir::new(stream); + let dir = StorageDir::new(&CONFIG.options, stream); let schema = convert_disk_files_to_parquet( stream, &dir, diff --git a/src/storage/staging.rs b/src/storage/staging.rs index c05ef62c4..e2f8b7f47 100644 --- a/src/storage/staging.rs +++ b/src/storage/staging.rs @@ -18,10 +18,11 @@ */ use crate::{ + cli::Options, event::DEFAULT_TIMESTAMP_KEY, handlers::http::modal::{ingest_server::INGESTOR_META, IngestorMetadata, DEFAULT_VERSION}, metrics, - option::{Mode, CONFIG}, + option::{Config, Mode}, storage::OBJECT_STORE_DATA_GRANULARITY, utils::{ self, arrow::merged_reader::MergedReverseRecordReader, get_ingestor_id, get_url, @@ -56,18 +57,21 @@ const ARROW_FILE_EXTENSION: &str = "data.arrows"; // const PARQUET_FILE_EXTENSION: &str = "data.parquet"; #[derive(Debug)] -pub struct StorageDir { +pub struct StorageDir<'a> { pub data_path: PathBuf, + pub options: &'a Options, } -impl StorageDir { - pub fn new(stream_name: &str) -> Self { - let data_path = CONFIG.options.local_stream_data_path(stream_name); - - Self { data_path } +impl<'a> StorageDir<'a> { + pub fn new(options: &'a Options, stream_name: &str) -> Self { + Self { + data_path: options.local_stream_data_path(stream_name), + options, + } } pub fn file_time_suffix( + options: &Options, time: NaiveDateTime, custom_partition_values: &HashMap, extention: &str, @@ -80,7 +84,7 @@ impl StorageDir { } let local_uri = str::replace(&uri, "/", "."); let hostname = hostname_unchecked(); - if CONFIG.options.mode == Mode::Ingest { + if options.mode == Mode::Ingest { let id = INGESTOR_META.get_ingestor_id(); format!("{local_uri}{hostname}{id}.{extention}") } else { @@ -89,6 +93,7 @@ impl StorageDir { } fn filename_by_time( + 
options: &Options, stream_hash: &str, time: NaiveDateTime, custom_partition_values: &HashMap, @@ -96,16 +101,22 @@ impl StorageDir { format!( "{}.{}", stream_hash, - Self::file_time_suffix(time, custom_partition_values, ARROW_FILE_EXTENSION) + Self::file_time_suffix(options, time, custom_partition_values, ARROW_FILE_EXTENSION) ) } fn filename_by_current_time( + options: &Options, stream_hash: &str, parsed_timestamp: NaiveDateTime, custom_partition_values: &HashMap, ) -> String { - Self::filename_by_time(stream_hash, parsed_timestamp, custom_partition_values) + Self::filename_by_time( + options, + stream_hash, + parsed_timestamp, + custom_partition_values, + ) } pub fn path_by_current_time( @@ -115,8 +126,12 @@ impl StorageDir { custom_partition_values: &HashMap, ) -> PathBuf { let server_time_in_min = Utc::now().format("%Y%m%dT%H%M").to_string(); - let mut filename = - Self::filename_by_current_time(stream_hash, parsed_timestamp, custom_partition_values); + let mut filename = Self::filename_by_current_time( + &self.options, + stream_hash, + parsed_timestamp, + custom_partition_values, + ); filename = format!("{}{}", server_time_in_min, filename); self.data_path.join(filename) } @@ -216,8 +231,8 @@ impl StorageDir { } } -// pub fn to_parquet_path(stream_name: &str, time: NaiveDateTime) -> PathBuf { -// let data_path = CONFIG.options.local_stream_data_path(stream_name); +// pub fn to_parquet_path(options: &Options, stream_name: &str, time: NaiveDateTime) -> PathBuf { +// let data_path = options.local_stream_data_path(stream_name); // let dir = StorageDir::file_time_suffix(time, &HashMap::new(), PARQUET_FILE_EXTENSION); // // data_path.join(dir) @@ -276,6 +291,7 @@ pub fn convert_disk_files_to_parquet( } } let props = parquet_writer_props( + dir.options, time_partition.clone(), index_time_partition, custom_partition_fields, @@ -320,6 +336,7 @@ pub fn convert_disk_files_to_parquet( } pub fn parquet_writer_props( + options: &Options, time_partition: Option, index_time_partition: usize, custom_partition_fields: HashMap, @@ -336,8 +353,8 @@ pub fn parquet_writer_props( nulls_first: true, }); let mut props = WriterProperties::builder() - .set_max_row_group_size(CONFIG.options.row_group_size) - .set_compression(CONFIG.options.parquet_compression.into()) + .set_max_row_group_size(options.row_group_size) + .set_compression(options.parquet_compression.into()) .set_column_encoding( ColumnPath::new(vec![time_partition_field]), Encoding::DELTA_BINARY_PACKED, @@ -359,8 +376,8 @@ pub fn parquet_writer_props( props } -pub fn get_ingestor_info() -> anyhow::Result { - let path = PathBuf::from(&CONFIG.options.local_staging_path); +pub fn get_ingestor_info(config: &Config) -> anyhow::Result { + let path = PathBuf::from(&config.options.local_staging_path); // all the files should be in the staging directory root let entries = std::fs::read_dir(path)?; @@ -391,7 +408,7 @@ pub fn get_ingestor_info() -> anyhow::Result { if obj.get("flight_port").is_none() { obj.insert( "flight_port".to_owned(), - JsonValue::String(CONFIG.options.flight_port.to_string()), + JsonValue::String(config.options.flight_port.to_string()), ); } @@ -413,7 +430,7 @@ pub fn get_ingestor_info() -> anyhow::Result { let token = base64::prelude::BASE64_STANDARD.encode(format!( "{}:{}", - CONFIG.options.username, CONFIG.options.password + config.options.username, config.options.password )); let token = format!("Basic {}", token); @@ -427,24 +444,24 @@ pub fn get_ingestor_info() -> anyhow::Result { meta.token = token; } - 
put_ingestor_info(meta.clone())?; + put_ingestor_info(config, meta.clone())?; return Ok(meta); } } - let store = CONFIG.storage().get_object_store(); + let store = config.storage().get_object_store(); let out = IngestorMetadata::new( port, url, DEFAULT_VERSION.to_string(), store.get_bucket_name(), - &CONFIG.options.username, - &CONFIG.options.password, + &config.options.username, + &config.options.password, get_ingestor_id(), - CONFIG.options.flight_port.to_string(), + config.options.flight_port.to_string(), ); - put_ingestor_info(out.clone())?; + put_ingestor_info(config, out.clone())?; Ok(out) } @@ -454,8 +471,8 @@ pub fn get_ingestor_info() -> anyhow::Result { /// # Parameters /// /// * `ingestor_info`: The ingestor info to be stored. -pub fn put_ingestor_info(info: IngestorMetadata) -> anyhow::Result<()> { - let path = PathBuf::from(&CONFIG.options.local_staging_path); +pub fn put_ingestor_info(config: &Config, info: IngestorMetadata) -> anyhow::Result<()> { + let path = PathBuf::from(&config.options.local_staging_path); let file_name = format!("ingestor.{}.json", info.ingestor_id); let file_path = path.join(file_name); @@ -475,3 +492,112 @@ pub enum MoveDataError { #[error("Could not generate parquet file")] Create, } + +#[cfg(test)] +mod tests { + use chrono::NaiveDate; + use temp_dir::TempDir; + use utils::minute_to_slot; + + use super::*; + + #[test] + fn test_storage_dir_new_with_valid_stream() { + let stream_name = "test_stream"; + + let options = Options::default(); + let storage_dir = StorageDir::new(&options, stream_name); + + assert_eq!( + storage_dir.data_path, + options.local_stream_data_path(stream_name) + ); + } + + #[test] + fn test_storage_dir_with_special_characters() { + let stream_name = "test_stream_!@#$%^&*()"; + + let options = Options::default(); + let storage_dir = StorageDir::new(&options, stream_name); + + assert_eq!( + storage_dir.data_path, + options.local_stream_data_path(stream_name) + ); + } + + #[test] + fn test_storage_dir_data_path_initialization() { + let stream_name = "example_stream"; + + let options = Options::default(); + let storage_dir = StorageDir::new(&options, stream_name); + + assert_eq!( + storage_dir.data_path, + options.local_stream_data_path(stream_name) + ); + } + + #[test] + fn test_storage_dir_with_alphanumeric_stream_name() { + let stream_name = "test123stream"; + + let options = Options::default(); + let storage_dir = StorageDir::new(&options, stream_name); + + assert_eq!( + storage_dir.data_path, + options.local_stream_data_path(stream_name) + ); + } + + #[test] + fn test_arrow_files_empty_directory() { + let temp_dir = TempDir::new().unwrap(); + + let options = Options { + local_staging_path: temp_dir.path().to_path_buf(), + ..Default::default() + }; + let storage_dir = StorageDir::new(&options, "test_stream"); + + let files = storage_dir.arrow_files(); + + assert!(files.is_empty()); + } + + #[test] + fn generate_correct_path_with_current_time_and_valid_parameters() { + let stream_name = "test_stream"; + let stream_hash = "abc123"; + let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) + .unwrap() + .and_hms_opt(12, 30, 0) + .unwrap(); + let mut custom_partition_values = HashMap::new(); + custom_partition_values.insert("key1".to_string(), "value1".to_string()); + custom_partition_values.insert("key2".to_string(), "value2".to_string()); + + let options = Options::default(); + let storage_dir = StorageDir::new(&options, stream_name); + + let expected_path = storage_dir.data_path.join(format!( + 
"{}{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.data.arrows", + Utc::now().format("%Y%m%dT%H%M"), + parsed_timestamp.date(), + parsed_timestamp.hour(), + minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), + hostname::get().unwrap().into_string().unwrap() + )); + + let generated_path = storage_dir.path_by_current_time( + stream_hash, + parsed_timestamp, + &custom_partition_values, + ); + + assert_eq!(generated_path, expected_path); + } +} From adf0840d20a94a63bb81d8e3b1784c81a7de4da2 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 22 Jan 2025 01:14:02 +0530 Subject: [PATCH 03/34] refactor: flatten out formats --- src/event/mod.rs | 9 ++--- src/metadata.rs | 2 +- src/storage/staging.rs | 87 ++++++++++-------------------------------- src/utils/mod.rs | 41 ++------------------ 4 files changed, 28 insertions(+), 111 deletions(-) diff --git a/src/event/mod.rs b/src/event/mod.rs index 2e9bc7359..038df1cd3 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -53,18 +53,15 @@ impl Event { let mut key = get_schema_key(&self.rb.schema().fields); if self.time_partition.is_some() { let parsed_timestamp_to_min = self.parsed_timestamp.format("%Y%m%dT%H%M").to_string(); - key = format!("{key}{parsed_timestamp_to_min}"); + key.push_str(&parsed_timestamp_to_min); } if !self.custom_partition_values.is_empty() { - let mut custom_partition_key = String::default(); for (k, v) in self.custom_partition_values.iter().sorted_by_key(|v| v.0) { - custom_partition_key = format!("{custom_partition_key}&{k}={v}"); + key.push_str(&format!("&{k}={v}")); } - key = format!("{key}{custom_partition_key}"); } - let num_rows = self.rb.num_rows() as u64; if self.is_first_event { commit_schema(&self.stream_name, self.rb.schema())?; } @@ -82,7 +79,7 @@ impl Event { &self.stream_name, self.origin_format, self.origin_size, - num_rows, + self.rb.num_rows(), self.parsed_timestamp, )?; diff --git a/src/metadata.rs b/src/metadata.rs index d998bab1f..f3016b6d3 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -342,7 +342,7 @@ impl StreamInfo { stream_name: &str, origin: &'static str, size: u64, - num_rows: u64, + num_rows: usize, parsed_timestamp: NaiveDateTime, ) -> Result<(), MetadataError> { let parsed_date = parsed_timestamp.date().to_string(); diff --git a/src/storage/staging.rs b/src/storage/staging.rs index e2f8b7f47..5258287ff 100644 --- a/src/storage/staging.rs +++ b/src/storage/staging.rs @@ -25,8 +25,7 @@ use crate::{ option::{Config, Mode}, storage::OBJECT_STORE_DATA_GRANULARITY, utils::{ - self, arrow::merged_reader::MergedReverseRecordReader, get_ingestor_id, get_url, - hostname_unchecked, + arrow::merged_reader::MergedReverseRecordReader, get_ingestor_id, get_url, minute_to_slot, }, }; use anyhow::anyhow; @@ -69,70 +68,28 @@ impl<'a> StorageDir<'a> { options, } } - - pub fn file_time_suffix( - options: &Options, - time: NaiveDateTime, - custom_partition_values: &HashMap, - extention: &str, - ) -> String { - let mut uri = utils::date_to_prefix(time.date()) - + &utils::hour_to_prefix(time.hour()) - + &utils::minute_to_prefix(time.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(); - if !custom_partition_values.is_empty() { - uri = uri + &utils::custom_partition_to_prefix(custom_partition_values); - } - let local_uri = str::replace(&uri, "/", "."); - let hostname = hostname_unchecked(); - if options.mode == Mode::Ingest { - let id = INGESTOR_META.get_ingestor_id(); - format!("{local_uri}{hostname}{id}.{extention}") - } else { - 
format!("{local_uri}{hostname}.{extention}") - } - } - - fn filename_by_time( - options: &Options, - stream_hash: &str, - time: NaiveDateTime, - custom_partition_values: &HashMap, - ) -> String { - format!( - "{}.{}", - stream_hash, - Self::file_time_suffix(options, time, custom_partition_values, ARROW_FILE_EXTENSION) - ) - } - - fn filename_by_current_time( - options: &Options, - stream_hash: &str, - parsed_timestamp: NaiveDateTime, - custom_partition_values: &HashMap, - ) -> String { - Self::filename_by_time( - options, - stream_hash, - parsed_timestamp, - custom_partition_values, - ) - } - pub fn path_by_current_time( &self, stream_hash: &str, parsed_timestamp: NaiveDateTime, custom_partition_values: &HashMap, ) -> PathBuf { - let server_time_in_min = Utc::now().format("%Y%m%dT%H%M").to_string(); - let mut filename = Self::filename_by_current_time( - &self.options, - stream_hash, - parsed_timestamp, - custom_partition_values, + let mut hostname = hostname::get().unwrap().into_string().unwrap(); + if self.options.mode == Mode::Ingest { + hostname.push_str(&INGESTOR_META.get_ingestor_id()); + } + let filename = format!( + "{}{stream_hash}.date={}.hour={:02}.minute={}.{}.{hostname}.{ARROW_FILE_EXTENSION}", + Utc::now().format("%Y%m%dT%H%M"), + parsed_timestamp.date(), + parsed_timestamp.hour(), + minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), + custom_partition_values + .iter() + .sorted_by_key(|v| v.0) + .map(|(key, value)| format!("{key}={value}")) + .join(".") ); - filename = format!("{}{}", server_time_in_min, filename); self.data_path.join(filename) } @@ -157,7 +114,7 @@ impl<'a> StorageDir<'a> { let mut grouped_arrow_file: HashMap> = HashMap::new(); let arrow_files = self.arrow_files(); for arrow_file_path in arrow_files { - let key = Self::arrow_path_to_parquet(&arrow_file_path, String::default()); + let key = Self::arrow_path_to_parquet(&arrow_file_path, ""); grouped_arrow_file .entry(key) .or_default() @@ -197,7 +154,7 @@ impl<'a> StorageDir<'a> { ); fs::remove_file(&arrow_file_path).unwrap(); } else { - let key = Self::arrow_path_to_parquet(&arrow_file_path, random_string.clone()); + let key = Self::arrow_path_to_parquet(&arrow_file_path, &random_string); grouped_arrow_file .entry(key) .or_default() @@ -218,12 +175,11 @@ impl<'a> StorageDir<'a> { .collect() } - fn arrow_path_to_parquet(path: &Path, random_string: String) -> PathBuf { + fn arrow_path_to_parquet(path: &Path, random_string: &str) -> PathBuf { let filename = path.file_stem().unwrap().to_str().unwrap(); let (_, filename) = filename.split_once('.').unwrap(); - let filename = filename.rsplit_once('.').expect("contains the delim `.`"); - let filename = format!("{}.{}", filename.0, filename.1); - let filename_with_random_number = format!("{}.{}.{}", filename, random_string, "arrows"); + assert!(filename.contains('.'), "contains the delim `.`"); + let filename_with_random_number = format!("{filename}.{random_string}.arrows"); let mut parquet_path = path.to_owned(); parquet_path.set_file_name(filename_with_random_number); parquet_path.set_extension("parquet"); @@ -497,7 +453,6 @@ pub enum MoveDataError { mod tests { use chrono::NaiveDate; use temp_dir::TempDir; - use utils::minute_to_slot; use super::*; diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 4c4ba8425..0a7ae9576 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -30,23 +30,11 @@ use crate::rbac::Users; use actix::extract_session_key_from_req; use actix_web::HttpRequest; use chrono::{DateTime, NaiveDate, NaiveDateTime, 
NaiveTime, Timelike, Utc}; -use itertools::Itertools; use regex::Regex; use sha2::{Digest, Sha256}; -use std::collections::HashMap; use std::env; use tracing::debug; use url::Url; -#[allow(dead_code)] -pub fn hostname() -> Option { - hostname::get() - .ok() - .and_then(|hostname| hostname.into_string().ok()) -} - -pub fn hostname_unchecked() -> String { - hostname::get().unwrap().into_string().unwrap() -} /// Convert minutes to a slot range /// e.g. given minute = 15 and OBJECT_STORE_DATA_GRANULARITY = 10 returns "10-19" @@ -65,30 +53,6 @@ pub fn minute_to_slot(minute: u32, data_granularity: u32) -> Option { Some(format!("{block_start:02}-{block_end:02}")) } -pub fn date_to_prefix(date: NaiveDate) -> String { - let date = format!("date={date}/"); - date.replace("UTC", "") -} - -pub fn custom_partition_to_prefix(custom_partition: &HashMap) -> String { - let mut prefix = String::default(); - for (key, value) in custom_partition.iter().sorted_by_key(|v| v.0) { - prefix.push_str(&format!("{key}={value}/", key = key, value = value)); - } - prefix -} - -pub fn hour_to_prefix(hour: u32) -> String { - format!("hour={hour:02}/") -} - -pub fn minute_to_prefix(minute: u32, data_granularity: u32) -> Option { - Some(format!( - "minute={}/", - minute_to_slot(minute, data_granularity)? - )) -} - pub struct TimePeriod { start: DateTime, end: DateTime, @@ -234,8 +198,9 @@ pub fn get_url() -> Url { CONFIG.options.address ) .parse::() // if the value was improperly set, this will panic before hand - .unwrap_or_else(|err| panic!("{}, failed to parse `{}` as Url. Please set the environment variable `P_ADDR` to `:` without the scheme (e.g., 192.168.1.1:8000). Please refer to the documentation: https://logg.ing/env for more details.", - err, CONFIG.options.address)); + .unwrap_or_else(|err| { + panic!("{err}, failed to parse `{}` as Url. Please set the environment variable `P_ADDR` to `:` without the scheme (e.g., 192.168.1.1:8000). 
Please refer to the documentation: https://logg.ing/env for more details.", CONFIG.options.address) + }); } let ingestor_endpoint = &CONFIG.options.ingestor_endpoint; From a59ca6a220b449072b916398f767b4e61f64c8c0 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 23 Jan 2025 12:53:41 +0530 Subject: [PATCH 04/34] refactor: staging is sematically different from storage --- src/event/writer/file_writer.rs | 2 +- src/handlers/http/logstream.rs | 3 ++- src/handlers/http/modal/ingest_server.rs | 2 +- src/handlers/http/modal/query/querier_logstream.rs | 10 ++-------- src/lib.rs | 1 + src/metadata.rs | 3 ++- src/migration/metadata_migration.rs | 5 ++--- src/query/mod.rs | 3 ++- src/{storage => }/staging.rs | 0 src/storage/mod.rs | 2 -- src/storage/object_storage.rs | 5 +++-- 11 files changed, 16 insertions(+), 20 deletions(-) rename src/{storage => }/staging.rs (100%) diff --git a/src/event/writer/file_writer.rs b/src/event/writer/file_writer.rs index a04f770d7..d52255797 100644 --- a/src/event/writer/file_writer.rs +++ b/src/event/writer/file_writer.rs @@ -26,7 +26,7 @@ use std::path::PathBuf; use super::errors::StreamWriterError; use crate::option::CONFIG; -use crate::storage::staging::StorageDir; +use crate::staging::StorageDir; use chrono::NaiveDateTime; pub struct ArrowWriter { diff --git a/src/handlers/http/logstream.rs b/src/handlers/http/logstream.rs index 4102281eb..c23935ba2 100644 --- a/src/handlers/http/logstream.rs +++ b/src/handlers/http/logstream.rs @@ -34,8 +34,9 @@ use crate::metrics::{EVENTS_INGESTED_DATE, EVENTS_INGESTED_SIZE_DATE, EVENTS_STO use crate::option::{Mode, CONFIG}; use crate::rbac::role::Action; use crate::rbac::Users; +use crate::staging::StorageDir; use crate::stats::{event_labels_date, storage_size_labels_date, Stats}; -use crate::storage::{retention::Retention, StorageDir}; +use crate::storage::retention::Retention; use crate::storage::{StreamInfo, StreamType}; use crate::utils::actix::extract_session_key_from_req; use crate::{event, stats}; diff --git a/src/handlers/http/modal/ingest_server.rs b/src/handlers/http/modal/ingest_server.rs index 71e9e6b02..5e02b9726 100644 --- a/src/handlers/http/modal/ingest_server.rs +++ b/src/handlers/http/modal/ingest_server.rs @@ -34,9 +34,9 @@ use crate::metrics; use crate::migration; use crate::migration::metadata_migration::migrate_ingester_metadata; use crate::rbac::role::Action; +use crate::staging; use crate::storage::object_storage::ingestor_metadata_path; use crate::storage::object_storage::parseable_json_path; -use crate::storage::staging; use crate::storage::ObjectStorageError; use crate::storage::PARSEABLE_ROOT_DIRECTORY; use crate::sync; diff --git a/src/handlers/http/modal/query/querier_logstream.rs b/src/handlers/http/modal/query/querier_logstream.rs index e8a08dbcd..8d787d3bb 100644 --- a/src/handlers/http/modal/query/querier_logstream.rs +++ b/src/handlers/http/modal/query/querier_logstream.rs @@ -32,8 +32,7 @@ use tracing::{error, warn}; static CREATE_STREAM_LOCK: Mutex<()> = Mutex::const_new(()); use crate::{ - event, - handlers::http::{ + event, handlers::http::{ base_path_without_preceding_slash, cluster::{ self, fetch_daily_stats_from_ingestors, fetch_stats_from_ingestors, @@ -44,12 +43,7 @@ use crate::{ modal::utils::logstream_utils::{ create_stream_and_schema_from_storage, create_update_stream, }, - }, - hottier::HotTierManager, - metadata::{self, STREAM_INFO}, - option::CONFIG, - stats::{self, Stats}, - storage::{StorageDir, StreamType}, + }, hottier::HotTierManager, metadata::{self, STREAM_INFO}, 
option::CONFIG, staging::StorageDir, stats::{self, Stats}, storage::StreamType }; pub async fn delete(stream_name: Path) -> Result { diff --git a/src/lib.rs b/src/lib.rs index 973406cb5..51b0d45ef 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,6 +42,7 @@ pub mod otel; mod query; pub mod rbac; mod response; +mod staging; mod static_schema; mod stats; pub mod storage; diff --git a/src/metadata.rs b/src/metadata.rs index b2289c47e..f4aa94a62 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -37,8 +37,9 @@ use crate::metrics::{ LIFETIME_EVENTS_INGESTED_SIZE, }; use crate::option::CONFIG; +use crate::staging::StorageDir; use crate::storage::retention::Retention; -use crate::storage::{ObjectStorage, ObjectStoreFormat, StorageDir, StreamType}; +use crate::storage::{ObjectStorage, ObjectStoreFormat, StreamType}; use crate::utils::arrow::MergedRecordReader; use derive_more::{Deref, DerefMut}; diff --git a/src/migration/metadata_migration.rs b/src/migration/metadata_migration.rs index 9198dc1da..ce25862ec 100644 --- a/src/migration/metadata_migration.rs +++ b/src/migration/metadata_migration.rs @@ -21,9 +21,8 @@ use rand::distributions::DistString; use serde_json::{json, Map, Value as JsonValue}; use crate::{ - handlers::http::modal::IngestorMetadata, - option::CONFIG, - storage::{object_storage::ingestor_metadata_path, staging}, + handlers::http::modal::IngestorMetadata, option::CONFIG, staging, + storage::object_storage::ingestor_metadata_path, }; /* diff --git a/src/query/mod.rs b/src/query/mod.rs index e60f9b2d4..c51899821 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -56,7 +56,8 @@ use crate::event; use crate::handlers::http::query::QueryError; use crate::metadata::STREAM_INFO; use crate::option::{Mode, CONFIG}; -use crate::storage::{ObjectStorageProvider, ObjectStoreFormat, StorageDir, STREAM_ROOT_DIRECTORY}; +use crate::staging::StorageDir; +use crate::storage::{ObjectStorageProvider, ObjectStoreFormat, STREAM_ROOT_DIRECTORY}; use crate::utils::time::TimeRange; pub static QUERY_SESSION: Lazy = Lazy::new(|| Query::create_session_context(CONFIG.storage())); diff --git a/src/storage/staging.rs b/src/staging.rs similarity index 100% rename from src/storage/staging.rs rename to src/staging.rs diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 85c46dade..433c44bfe 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -35,11 +35,9 @@ mod metrics_layer; pub(crate) mod object_storage; pub mod retention; mod s3; -pub mod staging; mod store_metadata; use self::retention::Retention; -pub use self::staging::StorageDir; pub use azure_blob::AzureBlobConfig; pub use localfs::FSConfig; pub use object_storage::{ObjectStorage, ObjectStorageProvider}; diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index 140c56a1a..3911aba2e 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -17,8 +17,8 @@ */ use super::{ - retention::Retention, staging::convert_disk_files_to_parquet, LogStream, ObjectStorageError, - ObjectStoreFormat, Permisssion, StorageDir, StorageMetadata, + retention::Retention, LogStream, ObjectStorageError, ObjectStoreFormat, Permisssion, + StorageMetadata, }; use super::{ Owner, StreamType, ALERT_FILE_NAME, MANIFEST_FILE, PARSEABLE_METADATA_FILE_NAME, @@ -31,6 +31,7 @@ use crate::handlers::http::users::{CORRELATION_DIR, DASHBOARDS_DIR, FILTER_DIR, use crate::metadata::SchemaVersion; use crate::metrics::{EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_STORAGE_SIZE}; use crate::option::Mode; +use 
crate::staging::{convert_disk_files_to_parquet, StorageDir}; use crate::{ alerts::Alerts, catalog::{self, manifest::Manifest, snapshot::Snapshot}, From 8c05d15d23895cb40a6034ca22b988cda27a8a3d Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 23 Jan 2025 13:08:20 +0530 Subject: [PATCH 05/34] style: `StorageDir` ~> `Staging` --- src/event/writer/file_writer.rs | 4 +- src/handlers/http/logstream.rs | 4 +- .../http/modal/query/querier_logstream.rs | 4 +- src/metadata.rs | 4 +- src/query/mod.rs | 4 +- src/staging.rs | 68 +++++++------------ src/storage/object_storage.rs | 4 +- 7 files changed, 36 insertions(+), 56 deletions(-) diff --git a/src/event/writer/file_writer.rs b/src/event/writer/file_writer.rs index d52255797..fc6a3f4c3 100644 --- a/src/event/writer/file_writer.rs +++ b/src/event/writer/file_writer.rs @@ -26,7 +26,7 @@ use std::path::PathBuf; use super::errors::StreamWriterError; use crate::option::CONFIG; -use crate::staging::StorageDir; +use crate::staging::Staging; use chrono::NaiveDateTime; pub struct ArrowWriter { @@ -89,7 +89,7 @@ fn init_new_stream_writer_file( parsed_timestamp: NaiveDateTime, custom_partition_values: &HashMap, ) -> Result<(PathBuf, StreamWriter), StreamWriterError> { - let dir = StorageDir::new(&CONFIG.options, stream_name); + let dir = Staging::new(&CONFIG.options, stream_name); let path = dir.path_by_current_time(schema_key, parsed_timestamp, custom_partition_values); std::fs::create_dir_all(dir.data_path)?; diff --git a/src/handlers/http/logstream.rs b/src/handlers/http/logstream.rs index c23935ba2..a693214e3 100644 --- a/src/handlers/http/logstream.rs +++ b/src/handlers/http/logstream.rs @@ -34,7 +34,7 @@ use crate::metrics::{EVENTS_INGESTED_DATE, EVENTS_INGESTED_SIZE_DATE, EVENTS_STO use crate::option::{Mode, CONFIG}; use crate::rbac::role::Action; use crate::rbac::Users; -use crate::staging::StorageDir; +use crate::staging::Staging; use crate::stats::{event_labels_date, storage_size_labels_date, Stats}; use crate::storage::retention::Retention; use crate::storage::{StreamInfo, StreamType}; @@ -69,7 +69,7 @@ pub async fn delete(stream_name: Path) -> Result) -> Result { @@ -63,7 +63,7 @@ pub async fn delete(stream_name: Path) -> Result Schema { - let staging_files = StorageDir::new(&CONFIG.options, stream_name).arrow_files(); + let staging_files = Staging::new(&CONFIG.options, stream_name).arrow_files(); let record_reader = MergedRecordReader::try_new(&staging_files).unwrap(); if record_reader.readers.is_empty() { return current_schema; diff --git a/src/query/mod.rs b/src/query/mod.rs index c51899821..0c64beab9 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -56,7 +56,7 @@ use crate::event; use crate::handlers::http::query::QueryError; use crate::metadata::STREAM_INFO; use crate::option::{Mode, CONFIG}; -use crate::staging::StorageDir; +use crate::staging::Staging; use crate::storage::{ObjectStorageProvider, ObjectStoreFormat, STREAM_ROOT_DIRECTORY}; use crate::utils::time::TimeRange; pub static QUERY_SESSION: Lazy = @@ -570,7 +570,7 @@ fn get_staging_prefixes( start: DateTime, end: DateTime, ) -> HashMap> { - let dir = StorageDir::new(&CONFIG.options, stream_name); + let dir = Staging::new(&CONFIG.options, stream_name); let mut files = dir.arrow_files_grouped_by_time(); files.retain(|k, _| path_intersects_query(k, start, end)); files diff --git a/src/staging.rs b/src/staging.rs index 5258287ff..12f04cee4 100644 --- a/src/staging.rs +++ b/src/staging.rs @@ -18,15 +18,7 @@ */ use crate::{ - cli::Options, - event::DEFAULT_TIMESTAMP_KEY, - 
handlers::http::modal::{ingest_server::INGESTOR_META, IngestorMetadata, DEFAULT_VERSION}, - metrics, - option::{Config, Mode}, - storage::OBJECT_STORE_DATA_GRANULARITY, - utils::{ - arrow::merged_reader::MergedReverseRecordReader, get_ingestor_id, get_url, minute_to_slot, - }, + cli::Options, event::DEFAULT_TIMESTAMP_KEY, handlers::http::modal::{ingest_server::INGESTOR_META, IngestorMetadata, DEFAULT_VERSION}, metrics, option::{Config, Mode}, storage::OBJECT_STORE_DATA_GRANULARITY, utils::{arrow::merged_reader::MergedReverseRecordReader, get_ingestor_id, get_url, minute_to_slot} }; use anyhow::anyhow; use arrow_schema::{ArrowError, Schema}; @@ -43,25 +35,30 @@ use parquet::{ }; use rand::distributions::DistString; use serde_json::Value as JsonValue; -use std::{ - collections::HashMap, - fs, - path::{Path, PathBuf}, - process, - sync::Arc, -}; +use std::{collections::HashMap, fs, path::{Path, PathBuf}, process, sync::Arc}; use tracing::{error, info}; const ARROW_FILE_EXTENSION: &str = "data.arrows"; -// const PARQUET_FILE_EXTENSION: &str = "data.parquet"; + +#[derive(Debug, thiserror::Error)] +pub enum MoveDataError { + #[error("Unable to create recordbatch stream")] + Arrow(#[from] ArrowError), + #[error("Could not generate parquet file")] + Parquet(#[from] ParquetError), + #[error("IO Error {0}")] + ObjectStorage(#[from] std::io::Error), + #[error("Could not generate parquet file")] + Create, +} #[derive(Debug)] -pub struct StorageDir<'a> { +pub struct Staging<'a> { pub data_path: PathBuf, pub options: &'a Options, } -impl<'a> StorageDir<'a> { +impl<'a> Staging<'a> { pub fn new(options: &'a Options, stream_name: &str) -> Self { Self { data_path: options.local_stream_data_path(stream_name), @@ -187,16 +184,10 @@ impl<'a> StorageDir<'a> { } } -// pub fn to_parquet_path(options: &Options, stream_name: &str, time: NaiveDateTime) -> PathBuf { -// let data_path = options.local_stream_data_path(stream_name); -// let dir = StorageDir::file_time_suffix(time, &HashMap::new(), PARQUET_FILE_EXTENSION); -// -// data_path.join(dir) -// } pub fn convert_disk_files_to_parquet( stream: &str, - dir: &StorageDir, + dir: &Staging, time_partition: Option, custom_partition: Option, shutdown_signal: bool, @@ -437,18 +428,6 @@ pub fn put_ingestor_info(config: &Config, info: IngestorMetadata) -> anyhow::Res Ok(()) } -#[derive(Debug, thiserror::Error)] -pub enum MoveDataError { - #[error("Unable to create recordbatch stream")] - Arrow(#[from] ArrowError), - #[error("Could not generate parquet file")] - Parquet(#[from] ParquetError), - #[error("IO Error {0}")] - ObjectStorage(#[from] std::io::Error), - #[error("Could not generate parquet file")] - Create, -} - #[cfg(test)] mod tests { use chrono::NaiveDate; @@ -461,7 +440,7 @@ mod tests { let stream_name = "test_stream"; let options = Options::default(); - let storage_dir = StorageDir::new(&options, stream_name); + let storage_dir = Staging::new(&options, stream_name); assert_eq!( storage_dir.data_path, @@ -474,7 +453,7 @@ mod tests { let stream_name = "test_stream_!@#$%^&*()"; let options = Options::default(); - let storage_dir = StorageDir::new(&options, stream_name); + let storage_dir = Staging::new(&options, stream_name); assert_eq!( storage_dir.data_path, @@ -487,7 +466,7 @@ mod tests { let stream_name = "example_stream"; let options = Options::default(); - let storage_dir = StorageDir::new(&options, stream_name); + let storage_dir = Staging::new(&options, stream_name); assert_eq!( storage_dir.data_path, @@ -500,7 +479,7 @@ mod tests { let stream_name = 
"test123stream"; let options = Options::default(); - let storage_dir = StorageDir::new(&options, stream_name); + let storage_dir = Staging::new(&options, stream_name); assert_eq!( storage_dir.data_path, @@ -516,7 +495,7 @@ mod tests { local_staging_path: temp_dir.path().to_path_buf(), ..Default::default() }; - let storage_dir = StorageDir::new(&options, "test_stream"); + let storage_dir = Staging::new(&options, "test_stream"); let files = storage_dir.arrow_files(); @@ -536,7 +515,7 @@ mod tests { custom_partition_values.insert("key2".to_string(), "value2".to_string()); let options = Options::default(); - let storage_dir = StorageDir::new(&options, stream_name); + let storage_dir = Staging::new(&options, stream_name); let expected_path = storage_dir.data_path.join(format!( "{}{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.data.arrows", @@ -556,3 +535,4 @@ mod tests { assert_eq!(generated_path, expected_path); } } + diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index 3911aba2e..8656e3eb4 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -31,7 +31,7 @@ use crate::handlers::http::users::{CORRELATION_DIR, DASHBOARDS_DIR, FILTER_DIR, use crate::metadata::SchemaVersion; use crate::metrics::{EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_STORAGE_SIZE}; use crate::option::Mode; -use crate::staging::{convert_disk_files_to_parquet, StorageDir}; +use crate::staging::{convert_disk_files_to_parquet, Staging}; use crate::{ alerts::Alerts, catalog::{self, manifest::Manifest, snapshot::Snapshot}, @@ -551,7 +551,7 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static { let custom_partition = STREAM_INFO .get_custom_partition(stream) .map_err(|err| ObjectStorageError::UnhandledError(Box::new(err)))?; - let dir = StorageDir::new(&CONFIG.options, stream); + let dir = Staging::new(&CONFIG.options, stream); let schema = convert_disk_files_to_parquet( stream, &dir, From 4d59e14227a62b3842c8413adeb0ae1104f8ee66 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 23 Jan 2025 14:16:04 +0530 Subject: [PATCH 06/34] refactor: doesn't err --- src/staging.rs | 2 +- src/utils/arrow/merged_reader.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/staging.rs b/src/staging.rs index 12f04cee4..f090599bb 100644 --- a/src/staging.rs +++ b/src/staging.rs @@ -221,7 +221,7 @@ pub fn convert_disk_files_to_parquet( .add(file_size as i64); } - let record_reader = MergedReverseRecordReader::try_new(&files).unwrap(); + let record_reader = MergedReverseRecordReader::try_new(&files); if record_reader.readers.is_empty() { continue; } diff --git a/src/utils/arrow/merged_reader.rs b/src/utils/arrow/merged_reader.rs index 3248bd37d..7e506a121 100644 --- a/src/utils/arrow/merged_reader.rs +++ b/src/utils/arrow/merged_reader.rs @@ -80,7 +80,7 @@ pub struct MergedReverseRecordReader { } impl MergedReverseRecordReader { - pub fn try_new(files: &[PathBuf]) -> Result { + pub fn try_new(files: &[PathBuf]) -> Self { let mut readers = Vec::with_capacity(files.len()); for file in files { let Ok(reader) = @@ -93,7 +93,7 @@ impl MergedReverseRecordReader { readers.push(reader); } - Ok(Self { readers }) + Self { readers } } pub fn merged_iter( From d50409fd82187a7ef4941e89beef97410ce62ea8 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 23 Jan 2025 17:37:14 +0530 Subject: [PATCH 07/34] refactor: move writers to staging --- .gitignore | 2 +- src/event/format/mod.rs | 7 ++--- src/event/mod.rs | 7 ++--- src/handlers/http/health_check.rs 
| 5 ++-- src/handlers/http/logstream.rs | 6 ++--- .../http/modal/ingest/ingestor_logstream.rs | 4 +-- .../http/modal/query/querier_logstream.rs | 12 ++++++--- src/query/stream_schema_provider.rs | 7 +++-- src/{staging.rs => staging/mod.rs} | 27 ++++++++++++++++--- src/{event => staging}/writer/file_writer.rs | 2 +- src/{event => staging}/writer/mem_writer.rs | 0 src/{event => staging}/writer/mod.rs | 22 ++++++--------- src/sync.rs | 3 ++- 13 files changed, 59 insertions(+), 45 deletions(-) rename src/{staging.rs => staging/mod.rs} (96%) rename src/{event => staging}/writer/file_writer.rs (98%) rename src/{event => staging}/writer/mem_writer.rs (100%) rename src/{event => staging}/writer/mod.rs (90%) diff --git a/.gitignore b/.gitignore index 57ea8e65e..22df045e3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ target data* -staging/ +staging/* limitcache examples cert.pem diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 2b2c2a0b3..c0a2ec323 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -112,11 +112,8 @@ pub trait EventFormat: Sized { time_partition: Option<&String>, schema_version: SchemaVersion, ) -> Result<(RecordBatch, bool), AnyError> { - let (data, mut schema, is_first) = self.to_data( - storage_schema, - time_partition, - schema_version, - )?; + let (data, mut schema, is_first) = + self.to_data(storage_schema, time_partition, schema_version)?; if get_field(&schema, DEFAULT_TIMESTAMP_KEY).is_some() { return Err(anyhow!( diff --git a/src/event/mod.rs b/src/event/mod.rs index b007060a4..d1c78b088 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -18,7 +18,6 @@ */ pub mod format; -mod writer; use arrow_array::RecordBatch; use arrow_schema::{Field, Fields, Schema}; @@ -27,8 +26,7 @@ use std::sync::Arc; use tracing::error; use self::error::EventError; -pub use self::writer::STREAM_WRITERS; -use crate::{metadata, storage::StreamType}; +use crate::{metadata, staging::STREAM_WRITERS, storage::StreamType}; use chrono::NaiveDateTime; use std::collections::HashMap; @@ -143,10 +141,9 @@ pub mod error { use arrow_schema::ArrowError; use crate::metadata::error::stream_info::MetadataError; + use crate::staging::StreamWriterError; use crate::storage::ObjectStorageError; - use super::writer::errors::StreamWriterError; - #[derive(Debug, thiserror::Error)] pub enum EventError { #[error("Stream Writer Failed: {0}")] diff --git a/src/handlers/http/health_check.rs b/src/handlers/http/health_check.rs index a0516d318..b2965479f 100644 --- a/src/handlers/http/health_check.rs +++ b/src/handlers/http/health_check.rs @@ -17,6 +17,7 @@ */ use crate::option::CONFIG; +use crate::staging::STREAM_WRITERS; use actix_web::body::MessageBody; use actix_web::dev::{ServiceRequest, ServiceResponse}; use actix_web::error::ErrorServiceUnavailable; @@ -56,8 +57,8 @@ pub async fn shutdown() { let mut shutdown_flag = SIGNAL_RECEIVED.lock().await; *shutdown_flag = true; - // Sync to local - crate::event::STREAM_WRITERS.unset_all(); + // Sync staging + STREAM_WRITERS.unset_all(); } pub async fn readiness() -> HttpResponse { diff --git a/src/handlers/http/logstream.rs b/src/handlers/http/logstream.rs index a693214e3..4b96c4262 100644 --- a/src/handlers/http/logstream.rs +++ b/src/handlers/http/logstream.rs @@ -34,12 +34,12 @@ use crate::metrics::{EVENTS_INGESTED_DATE, EVENTS_INGESTED_SIZE_DATE, EVENTS_STO use crate::option::{Mode, CONFIG}; use crate::rbac::role::Action; use crate::rbac::Users; -use crate::staging::Staging; +use crate::staging::{Staging, STREAM_WRITERS}; +use 
crate::stats; use crate::stats::{event_labels_date, storage_size_labels_date, Stats}; use crate::storage::retention::Retention; use crate::storage::{StreamInfo, StreamType}; use crate::utils::actix::extract_session_key_from_req; -use crate::{event, stats}; use crate::{metadata, validator}; use actix_web::http::header::{self, HeaderMap}; @@ -85,7 +85,7 @@ pub async fn delete(stream_name: Path) -> Result) -> Result = Mutex::const_new(()); use crate::{ - event, handlers::http::{ + handlers::http::{ base_path_without_preceding_slash, cluster::{ self, fetch_daily_stats_from_ingestors, fetch_stats_from_ingestors, @@ -43,7 +43,13 @@ use crate::{ modal::utils::logstream_utils::{ create_stream_and_schema_from_storage, create_update_stream, }, - }, hottier::HotTierManager, metadata::{self, STREAM_INFO}, option::CONFIG, staging::Staging, stats::{self, Stats}, storage::StreamType + }, + hottier::HotTierManager, + metadata::{self, STREAM_INFO}, + option::CONFIG, + staging::{Staging, STREAM_WRITERS}, + stats::{self, Stats}, + storage::StreamType, }; pub async fn delete(stream_name: Path) -> Result { @@ -96,7 +102,7 @@ pub async fn delete(stream_name: Path) -> Result = Lazy::new(WriterTable::default); + #[derive(Debug, thiserror::Error)] pub enum MoveDataError { #[error("Unable to create recordbatch stream")] @@ -184,7 +205,6 @@ impl<'a> Staging<'a> { } } - pub fn convert_disk_files_to_parquet( stream: &str, dir: &Staging, @@ -535,4 +555,3 @@ mod tests { assert_eq!(generated_path, expected_path); } } - diff --git a/src/event/writer/file_writer.rs b/src/staging/writer/file_writer.rs similarity index 98% rename from src/event/writer/file_writer.rs rename to src/staging/writer/file_writer.rs index fc6a3f4c3..4b8e013d1 100644 --- a/src/event/writer/file_writer.rs +++ b/src/staging/writer/file_writer.rs @@ -24,7 +24,7 @@ use std::collections::HashMap; use std::fs::{File, OpenOptions}; use std::path::PathBuf; -use super::errors::StreamWriterError; +use super::StreamWriterError; use crate::option::CONFIG; use crate::staging::Staging; use chrono::NaiveDateTime; diff --git a/src/event/writer/mem_writer.rs b/src/staging/writer/mem_writer.rs similarity index 100% rename from src/event/writer/mem_writer.rs rename to src/staging/writer/mem_writer.rs diff --git a/src/event/writer/mod.rs b/src/staging/writer/mod.rs similarity index 90% rename from src/event/writer/mod.rs rename to src/staging/writer/mod.rs index 895cd59ed..472f6ad44 100644 --- a/src/event/writer/mod.rs +++ b/src/staging/writer/mod.rs @@ -30,14 +30,19 @@ use crate::{ storage::StreamType, }; -use self::{errors::StreamWriterError, file_writer::FileWriter, mem_writer::MemWriter}; +use self::{file_writer::FileWriter, mem_writer::MemWriter}; use arrow_array::RecordBatch; use arrow_schema::Schema; use chrono::NaiveDateTime; use derive_more::{Deref, DerefMut}; -use once_cell::sync::Lazy; -pub static STREAM_WRITERS: Lazy = Lazy::new(WriterTable::default); +#[derive(Debug, thiserror::Error)] +pub enum StreamWriterError { + #[error("Arrow writer failed: {0}")] + Writer(#[from] arrow_schema::ArrowError), + #[error("Io Error when creating new file: {0}")] + Io(#[from] std::io::Error), +} #[derive(Default)] pub struct Writer { @@ -176,14 +181,3 @@ impl WriterTable { Some(records) } } - -pub mod errors { - - #[derive(Debug, thiserror::Error)] - pub enum StreamWriterError { - #[error("Arrow writer failed: {0}")] - Writer(#[from] arrow_schema::ArrowError), - #[error("Io Error when creating new file: {0}")] - Io(#[from] std::io::Error), - } -} diff --git a/src/sync.rs 
b/src/sync.rs index 2a06d88aa..d1c04aa3a 100644 --- a/src/sync.rs +++ b/src/sync.rs @@ -24,6 +24,7 @@ use tokio::time::{interval, sleep, Duration}; use tracing::{error, info, warn}; use crate::option::CONFIG; +use crate::staging::STREAM_WRITERS; use crate::{storage, STORAGE_UPLOAD_INTERVAL}; pub async fn object_store_sync() -> ( @@ -97,7 +98,7 @@ pub async fn run_local_sync() -> ( scheduler .every((storage::LOCAL_SYNC_INTERVAL as u32).seconds()) .run(|| async { - crate::event::STREAM_WRITERS.unset_all(); + STREAM_WRITERS.unset_all(); }); loop { From c3991f6d8a6edd63cf43dfbae9302fd9fe7baedd Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 23 Jan 2025 17:37:31 +0530 Subject: [PATCH 08/34] chore: cargo fmt --- src/event/format/json.rs | 4 ++-- src/utils/arrow/merged_reader.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 71fcaffc7..5006be142 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -94,8 +94,8 @@ impl EventFormat for Event { }; if value_arr - .iter() - .any(|value| fields_mismatch(&schema, value, schema_version)) + .iter() + .any(|value| fields_mismatch(&schema, value, schema_version)) { return Err(anyhow!( "Could not process this event due to mismatch in datatype" diff --git a/src/utils/arrow/merged_reader.rs b/src/utils/arrow/merged_reader.rs index 7e506a121..32a2edb40 100644 --- a/src/utils/arrow/merged_reader.rs +++ b/src/utils/arrow/merged_reader.rs @@ -93,7 +93,7 @@ impl MergedReverseRecordReader { readers.push(reader); } - Self { readers } + Self { readers } } pub fn merged_iter( From 83f3259c6c01e6b36144d9de493b878d94af13fa Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 23 Jan 2025 19:50:26 +0530 Subject: [PATCH 09/34] refactor: Semantically ingestion happens into staging --- src/event/mod.rs | 12 +- src/handlers/http/health_check.rs | 4 +- src/handlers/http/logstream.rs | 6 +- .../http/modal/ingest/ingestor_logstream.rs | 4 +- .../http/modal/query/querier_logstream.rs | 6 +- src/metadata.rs | 4 +- src/query/mod.rs | 4 +- src/query/stream_schema_provider.rs | 4 +- src/staging/mod.rs | 428 +------------ src/staging/streams.rs | 565 ++++++++++++++++++ .../{writer/mem_writer.rs => writer.rs} | 22 +- src/staging/writer/file_writer.rs | 103 ---- src/staging/writer/mod.rs | 183 ------ src/storage/object_storage.rs | 4 +- src/sync.rs | 4 +- 15 files changed, 625 insertions(+), 728 deletions(-) create mode 100644 src/staging/streams.rs rename src/staging/{writer/mem_writer.rs => writer.rs} (89%) delete mode 100644 src/staging/writer/file_writer.rs delete mode 100644 src/staging/writer/mod.rs diff --git a/src/event/mod.rs b/src/event/mod.rs index d1c78b088..dd6014b1a 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -26,7 +26,7 @@ use std::sync::Arc; use tracing::error; use self::error::EventError; -use crate::{metadata, staging::STREAM_WRITERS, storage::StreamType}; +use crate::{metadata, staging::STAGING, storage::StreamType}; use chrono::NaiveDateTime; use std::collections::HashMap; @@ -64,13 +64,13 @@ impl Event { commit_schema(&self.stream_name, self.rb.schema())?; } - STREAM_WRITERS.append_to_local( + STAGING.append_to_local( &self.stream_name, &key, &self.rb, self.parsed_timestamp, &self.custom_partition_values, - &self.stream_type, + self.stream_type, )?; metadata::STREAM_INFO.update_stats( @@ -96,20 +96,20 @@ impl Event { pub fn process_unchecked(&self) -> Result<(), EventError> { let key = get_schema_key(&self.rb.schema().fields); - 
STREAM_WRITERS.append_to_local( + STAGING.append_to_local( &self.stream_name, &key, &self.rb, self.parsed_timestamp, &self.custom_partition_values, - &self.stream_type, + self.stream_type, )?; Ok(()) } pub fn clear(&self, stream_name: &str) { - STREAM_WRITERS.clear(stream_name); + STAGING.clear(stream_name); } } diff --git a/src/handlers/http/health_check.rs b/src/handlers/http/health_check.rs index b2965479f..855a6c9da 100644 --- a/src/handlers/http/health_check.rs +++ b/src/handlers/http/health_check.rs @@ -17,7 +17,7 @@ */ use crate::option::CONFIG; -use crate::staging::STREAM_WRITERS; +use crate::staging::STAGING; use actix_web::body::MessageBody; use actix_web::dev::{ServiceRequest, ServiceResponse}; use actix_web::error::ErrorServiceUnavailable; @@ -58,7 +58,7 @@ pub async fn shutdown() { *shutdown_flag = true; // Sync staging - STREAM_WRITERS.unset_all(); + STAGING.unset_all(); } pub async fn readiness() -> HttpResponse { diff --git a/src/handlers/http/logstream.rs b/src/handlers/http/logstream.rs index 4b96c4262..cb2293fb0 100644 --- a/src/handlers/http/logstream.rs +++ b/src/handlers/http/logstream.rs @@ -34,7 +34,7 @@ use crate::metrics::{EVENTS_INGESTED_DATE, EVENTS_INGESTED_SIZE_DATE, EVENTS_STO use crate::option::{Mode, CONFIG}; use crate::rbac::role::Action; use crate::rbac::Users; -use crate::staging::{Staging, STREAM_WRITERS}; +use crate::staging::{Stream, STAGING}; use crate::stats; use crate::stats::{event_labels_date, storage_size_labels_date, Stats}; use crate::storage::retention::Retention; @@ -69,7 +69,7 @@ pub async fn delete(stream_name: Path) -> Result) -> Result) -> Result) -> Result) -> Result Schema { - let staging_files = Staging::new(&CONFIG.options, stream_name).arrow_files(); + let staging_files = Stream::new(&CONFIG.options, stream_name).arrow_files(); let record_reader = MergedRecordReader::try_new(&staging_files).unwrap(); if record_reader.readers.is_empty() { return current_schema; diff --git a/src/query/mod.rs b/src/query/mod.rs index 0c64beab9..a16d62857 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -56,7 +56,7 @@ use crate::event; use crate::handlers::http::query::QueryError; use crate::metadata::STREAM_INFO; use crate::option::{Mode, CONFIG}; -use crate::staging::Staging; +use crate::staging::Stream; use crate::storage::{ObjectStorageProvider, ObjectStoreFormat, STREAM_ROOT_DIRECTORY}; use crate::utils::time::TimeRange; pub static QUERY_SESSION: Lazy = @@ -570,7 +570,7 @@ fn get_staging_prefixes( start: DateTime, end: DateTime, ) -> HashMap> { - let dir = Staging::new(&CONFIG.options, stream_name); + let dir = Stream::new(&CONFIG.options, stream_name); let mut files = dir.arrow_files_grouped_by_time(); files.retain(|k, _| path_intersects_query(k, start, end)); files diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index 6f938f1d1..9ca170c2a 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -19,7 +19,7 @@ use crate::catalog::manifest::File; use crate::hottier::HotTierManager; use crate::option::Mode; -use crate::staging::STREAM_WRITERS; +use crate::staging::STAGING; use crate::{ catalog::snapshot::{self, Snapshot}, storage::{ObjectStoreFormat, STREAM_ROOT_DIRECTORY}, @@ -441,7 +441,7 @@ impl TableProvider for StandardTableProvider { } if include_now(filters, &time_partition) { - if let Some(records) = STREAM_WRITERS.recordbatches_cloned(&self.stream, &self.schema) { + if let Some(records) = STAGING.recordbatches_cloned(&self.stream, &self.schema) { let 
reversed_mem_table = reversed_mem_table(records, self.schema.clone())?; let memory_exec = reversed_mem_table diff --git a/src/staging/mod.rs b/src/staging/mod.rs index c9d45006f..77b0d1209 100644 --- a/src/staging/mod.rs +++ b/src/staging/mod.rs @@ -18,49 +18,24 @@ */ use crate::{ - cli::Options, - event::DEFAULT_TIMESTAMP_KEY, - handlers::http::modal::{ingest_server::INGESTOR_META, IngestorMetadata, DEFAULT_VERSION}, - metrics, - option::{Config, Mode}, - storage::OBJECT_STORE_DATA_GRANULARITY, - utils::{ - arrow::merged_reader::MergedReverseRecordReader, get_ingestor_id, get_url, minute_to_slot, - }, + handlers::http::modal::{IngestorMetadata, DEFAULT_VERSION}, + option::Config, + utils::{get_ingestor_id, get_url}, }; use anyhow::anyhow; -use arrow_schema::{ArrowError, Schema}; +use arrow_schema::ArrowError; use base64::Engine; -use chrono::{NaiveDateTime, Timelike, Utc}; -use itertools::Itertools; use once_cell::sync::Lazy; -use parquet::{ - arrow::ArrowWriter, - basic::Encoding, - errors::ParquetError, - file::properties::{WriterProperties, WriterPropertiesBuilder}, - format::SortingColumn, - schema::types::ColumnPath, -}; -use rand::distributions::DistString; +use parquet::errors::ParquetError; use serde_json::Value as JsonValue; -use std::{ - collections::HashMap, - fs, - path::{Path, PathBuf}, - process, - sync::Arc, -}; +pub use streams::convert_disk_files_to_parquet; +pub use streams::{Stream, Streams}; use tracing::{error, info}; pub use writer::StreamWriterError; -use writer::WriterTable; +mod streams; mod writer; -const ARROW_FILE_EXTENSION: &str = "data.arrows"; - -pub static STREAM_WRITERS: Lazy = Lazy::new(WriterTable::default); - #[derive(Debug, thiserror::Error)] pub enum MoveDataError { #[error("Unable to create recordbatch stream")] @@ -73,281 +48,13 @@ pub enum MoveDataError { Create, } -#[derive(Debug)] -pub struct Staging<'a> { - pub data_path: PathBuf, - pub options: &'a Options, -} - -impl<'a> Staging<'a> { - pub fn new(options: &'a Options, stream_name: &str) -> Self { - Self { - data_path: options.local_stream_data_path(stream_name), - options, - } - } - pub fn path_by_current_time( - &self, - stream_hash: &str, - parsed_timestamp: NaiveDateTime, - custom_partition_values: &HashMap, - ) -> PathBuf { - let mut hostname = hostname::get().unwrap().into_string().unwrap(); - if self.options.mode == Mode::Ingest { - hostname.push_str(&INGESTOR_META.get_ingestor_id()); - } - let filename = format!( - "{}{stream_hash}.date={}.hour={:02}.minute={}.{}.{hostname}.{ARROW_FILE_EXTENSION}", - Utc::now().format("%Y%m%dT%H%M"), - parsed_timestamp.date(), - parsed_timestamp.hour(), - minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), - custom_partition_values - .iter() - .sorted_by_key(|v| v.0) - .map(|(key, value)| format!("{key}={value}")) - .join(".") - ); - self.data_path.join(filename) - } - - pub fn arrow_files(&self) -> Vec { - let Ok(dir) = self.data_path.read_dir() else { - return vec![]; - }; - - let paths = dir - .flatten() - .map(|file| file.path()) - .filter(|file| file.extension().is_some_and(|ext| ext.eq("arrows"))) - .sorted_by_key(|f| f.metadata().unwrap().modified().unwrap()) - .collect(); - - paths - } - - #[allow(dead_code)] - pub fn arrow_files_grouped_by_time(&self) -> HashMap> { - // hashmap - let mut grouped_arrow_file: HashMap> = HashMap::new(); - let arrow_files = self.arrow_files(); - for arrow_file_path in arrow_files { - let key = Self::arrow_path_to_parquet(&arrow_file_path, ""); - grouped_arrow_file - .entry(key) - 
.or_default() - .push(arrow_file_path); - } - - grouped_arrow_file - } - - pub fn arrow_files_grouped_exclude_time( - &self, - exclude: NaiveDateTime, - stream: &str, - shutdown_signal: bool, - ) -> HashMap> { - let mut grouped_arrow_file: HashMap> = HashMap::new(); - let mut arrow_files = self.arrow_files(); - - if !shutdown_signal { - arrow_files.retain(|path| { - !path - .file_name() - .unwrap() - .to_str() - .unwrap() - .starts_with(&exclude.format("%Y%m%dT%H%M").to_string()) - }); - } - - let random_string = - rand::distributions::Alphanumeric.sample_string(&mut rand::thread_rng(), 15); - for arrow_file_path in arrow_files { - if arrow_file_path.metadata().unwrap().len() == 0 { - error!( - "Invalid arrow file {:?} detected for stream {}, removing it", - &arrow_file_path, stream - ); - fs::remove_file(&arrow_file_path).unwrap(); - } else { - let key = Self::arrow_path_to_parquet(&arrow_file_path, &random_string); - grouped_arrow_file - .entry(key) - .or_default() - .push(arrow_file_path); - } - } - grouped_arrow_file - } - - pub fn parquet_files(&self) -> Vec { - let Ok(dir) = self.data_path.read_dir() else { - return vec![]; - }; - - dir.flatten() - .map(|file| file.path()) - .filter(|file| file.extension().is_some_and(|ext| ext.eq("parquet"))) - .collect() - } - - fn arrow_path_to_parquet(path: &Path, random_string: &str) -> PathBuf { - let filename = path.file_stem().unwrap().to_str().unwrap(); - let (_, filename) = filename.split_once('.').unwrap(); - assert!(filename.contains('.'), "contains the delim `.`"); - let filename_with_random_number = format!("{filename}.{random_string}.arrows"); - let mut parquet_path = path.to_owned(); - parquet_path.set_file_name(filename_with_random_number); - parquet_path.set_extension("parquet"); - parquet_path - } -} - -pub fn convert_disk_files_to_parquet( - stream: &str, - dir: &Staging, - time_partition: Option, - custom_partition: Option, - shutdown_signal: bool, -) -> Result, MoveDataError> { - let mut schemas = Vec::new(); - - let time = chrono::Utc::now().naive_utc(); - let staging_files = dir.arrow_files_grouped_exclude_time(time, stream, shutdown_signal); - if staging_files.is_empty() { - metrics::STAGING_FILES.with_label_values(&[stream]).set(0); - metrics::STORAGE_SIZE - .with_label_values(&["staging", stream, "arrows"]) - .set(0); - metrics::STORAGE_SIZE - .with_label_values(&["staging", stream, "parquet"]) - .set(0); - } - - // warn!("staging files-\n{staging_files:?}\n"); - for (parquet_path, files) in staging_files { - metrics::STAGING_FILES - .with_label_values(&[stream]) - .set(files.len() as i64); - - for file in &files { - let file_size = file.metadata().unwrap().len(); - let file_type = file.extension().unwrap().to_str().unwrap(); - - metrics::STORAGE_SIZE - .with_label_values(&["staging", stream, file_type]) - .add(file_size as i64); - } - - let record_reader = MergedReverseRecordReader::try_new(&files); - if record_reader.readers.is_empty() { - continue; - } - let merged_schema = record_reader.merged_schema(); - let mut index_time_partition: usize = 0; - if let Some(time_partition) = time_partition.as_ref() { - index_time_partition = merged_schema.index_of(time_partition).unwrap(); - } - let mut custom_partition_fields: HashMap = HashMap::new(); - if let Some(custom_partition) = custom_partition.as_ref() { - for custom_partition_field in custom_partition.split(',') { - let index = merged_schema.index_of(custom_partition_field).unwrap(); - custom_partition_fields.insert(custom_partition_field.to_string(), index); - } - } - let 
props = parquet_writer_props( - dir.options, - time_partition.clone(), - index_time_partition, - custom_partition_fields, - ) - .build(); - schemas.push(merged_schema.clone()); - let schema = Arc::new(merged_schema); - let parquet_file = fs::File::create(&parquet_path).map_err(|_| MoveDataError::Create)?; - let mut writer = ArrowWriter::try_new(&parquet_file, schema.clone(), Some(props))?; - for ref record in record_reader.merged_iter(schema, time_partition.clone()) { - writer.write(record)?; - } - - writer.close()?; - if parquet_file.metadata().unwrap().len() < parquet::file::FOOTER_SIZE as u64 { - error!( - "Invalid parquet file {:?} detected for stream {}, removing it", - &parquet_path, stream - ); - fs::remove_file(parquet_path).unwrap(); - } else { - for file in files { - // warn!("file-\n{file:?}\n"); - let file_size = file.metadata().unwrap().len(); - let file_type = file.extension().unwrap().to_str().unwrap(); - if fs::remove_file(file.clone()).is_err() { - error!("Failed to delete file. Unstable state"); - process::abort() - } - metrics::STORAGE_SIZE - .with_label_values(&["staging", stream, file_type]) - .sub(file_size as i64); - } - } - } - - if !schemas.is_empty() { - Ok(Some(Schema::try_merge(schemas).unwrap())) - } else { - Ok(None) - } -} - -pub fn parquet_writer_props( - options: &Options, - time_partition: Option, - index_time_partition: usize, - custom_partition_fields: HashMap, -) -> WriterPropertiesBuilder { - let index_time_partition: i32 = index_time_partition as i32; - let mut time_partition_field = DEFAULT_TIMESTAMP_KEY.to_string(); - if let Some(time_partition) = time_partition { - time_partition_field = time_partition; - } - let mut sorting_column_vec: Vec = Vec::new(); - sorting_column_vec.push(SortingColumn { - column_idx: index_time_partition, - descending: true, - nulls_first: true, - }); - let mut props = WriterProperties::builder() - .set_max_row_group_size(options.row_group_size) - .set_compression(options.parquet_compression.into()) - .set_column_encoding( - ColumnPath::new(vec![time_partition_field]), - Encoding::DELTA_BINARY_PACKED, - ); - - for (field, index) in custom_partition_fields { - let field = ColumnPath::new(vec![field]); - let encoding = Encoding::DELTA_BYTE_ARRAY; - props = props.set_column_encoding(field, encoding); - let sorting_column = SortingColumn { - column_idx: index as i32, - descending: true, - nulls_first: true, - }; - sorting_column_vec.push(sorting_column); - } - props = props.set_sorting_columns(Some(sorting_column_vec)); - - props -} +/// Staging is made up of multiple streams, each stream's context is housed in a single `Stream` object. +/// `STAGING` is a globally shared mapping of `Streams` that are in staging. +pub static STAGING: Lazy = Lazy::new(Streams::default); pub fn get_ingestor_info(config: &Config) -> anyhow::Result { - let path = PathBuf::from(&config.options.local_staging_path); - // all the files should be in the staging directory root - let entries = std::fs::read_dir(path)?; + let entries = std::fs::read_dir(&config.options.local_staging_path)?; let url = get_url(); let port = url.port().unwrap_or(80).to_string(); let url = url.to_string(); @@ -439,119 +146,10 @@ pub fn get_ingestor_info(config: &Config) -> anyhow::Result { /// /// * `ingestor_info`: The ingestor info to be stored. 
pub fn put_ingestor_info(config: &Config, info: IngestorMetadata) -> anyhow::Result<()> { - let path = PathBuf::from(&config.options.local_staging_path); let file_name = format!("ingestor.{}.json", info.ingestor_id); - let file_path = path.join(file_name); + let file_path = config.options.local_staging_path.join(file_name); std::fs::write(file_path, serde_json::to_vec(&info)?)?; Ok(()) } - -#[cfg(test)] -mod tests { - use chrono::NaiveDate; - use temp_dir::TempDir; - - use super::*; - - #[test] - fn test_storage_dir_new_with_valid_stream() { - let stream_name = "test_stream"; - - let options = Options::default(); - let storage_dir = Staging::new(&options, stream_name); - - assert_eq!( - storage_dir.data_path, - options.local_stream_data_path(stream_name) - ); - } - - #[test] - fn test_storage_dir_with_special_characters() { - let stream_name = "test_stream_!@#$%^&*()"; - - let options = Options::default(); - let storage_dir = Staging::new(&options, stream_name); - - assert_eq!( - storage_dir.data_path, - options.local_stream_data_path(stream_name) - ); - } - - #[test] - fn test_storage_dir_data_path_initialization() { - let stream_name = "example_stream"; - - let options = Options::default(); - let storage_dir = Staging::new(&options, stream_name); - - assert_eq!( - storage_dir.data_path, - options.local_stream_data_path(stream_name) - ); - } - - #[test] - fn test_storage_dir_with_alphanumeric_stream_name() { - let stream_name = "test123stream"; - - let options = Options::default(); - let storage_dir = Staging::new(&options, stream_name); - - assert_eq!( - storage_dir.data_path, - options.local_stream_data_path(stream_name) - ); - } - - #[test] - fn test_arrow_files_empty_directory() { - let temp_dir = TempDir::new().unwrap(); - - let options = Options { - local_staging_path: temp_dir.path().to_path_buf(), - ..Default::default() - }; - let storage_dir = Staging::new(&options, "test_stream"); - - let files = storage_dir.arrow_files(); - - assert!(files.is_empty()); - } - - #[test] - fn generate_correct_path_with_current_time_and_valid_parameters() { - let stream_name = "test_stream"; - let stream_hash = "abc123"; - let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) - .unwrap() - .and_hms_opt(12, 30, 0) - .unwrap(); - let mut custom_partition_values = HashMap::new(); - custom_partition_values.insert("key1".to_string(), "value1".to_string()); - custom_partition_values.insert("key2".to_string(), "value2".to_string()); - - let options = Options::default(); - let storage_dir = Staging::new(&options, stream_name); - - let expected_path = storage_dir.data_path.join(format!( - "{}{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.data.arrows", - Utc::now().format("%Y%m%dT%H%M"), - parsed_timestamp.date(), - parsed_timestamp.hour(), - minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), - hostname::get().unwrap().into_string().unwrap() - )); - - let generated_path = storage_dir.path_by_current_time( - stream_hash, - parsed_timestamp, - &custom_partition_values, - ); - - assert_eq!(generated_path, expected_path); - } -} diff --git a/src/staging/streams.rs b/src/staging/streams.rs new file mode 100644 index 000000000..6278616db --- /dev/null +++ b/src/staging/streams.rs @@ -0,0 +1,565 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + */ + +use std::{ + collections::HashMap, + fs::{remove_file, OpenOptions}, + path::{Path, PathBuf}, + process, + sync::{Arc, Mutex, RwLock}, +}; + +use arrow_array::RecordBatch; +use arrow_ipc::writer::StreamWriter; +use arrow_schema::Schema; +use chrono::{NaiveDateTime, Timelike, Utc}; +use derive_more::{Deref, DerefMut}; +use itertools::Itertools; +use parquet::{ + arrow::ArrowWriter, + basic::Encoding, + file::properties::{WriterProperties, WriterPropertiesBuilder}, + format::SortingColumn, + schema::types::ColumnPath, +}; +use rand::distributions::DistString; +use tracing::error; + +use crate::{ + cli::Options, + event::DEFAULT_TIMESTAMP_KEY, + handlers::http::modal::ingest_server::INGESTOR_META, + metrics, + option::{Mode, CONFIG}, + storage::{StreamType, OBJECT_STORE_DATA_GRANULARITY}, + utils::{arrow::merged_reader::MergedReverseRecordReader, minute_to_slot}, +}; + +use super::{writer::Writer, MoveDataError, StreamWriterError}; + +const ARROW_FILE_EXTENSION: &str = "data.arrows"; + +pub struct Stream<'a> { + pub data_path: PathBuf, + pub options: &'a Options, + pub writer: Mutex, +} + +impl<'a> Stream<'a> { + pub fn new(options: &'a Options, stream_name: &str) -> Self { + Self { + data_path: options.local_stream_data_path(stream_name), + options, + writer: Mutex::new(Writer::default()), + } + } + + fn push( + &self, + schema_key: &str, + record: &RecordBatch, + parsed_timestamp: NaiveDateTime, + custom_partition_values: &HashMap, + stream_type: StreamType, + ) -> Result<(), StreamWriterError> { + let mut guard = self.writer.lock().unwrap(); + if self.options.mode != Mode::Query || stream_type == StreamType::Internal { + match guard.disk.get_mut(schema_key) { + Some(writer) => { + writer.write(record)?; + } + None => { + // entry is not present thus we create it + let file_path = self.path_by_current_time( + schema_key, + parsed_timestamp, + custom_partition_values, + ); + std::fs::create_dir_all(&self.data_path)?; + + let file = OpenOptions::new() + .create(true) + .append(true) + .open(&file_path)?; + + let mut writer = StreamWriter::try_new(file, &record.schema()) + .expect("File and RecordBatch both are checked"); + + writer.write(record)?; + guard.disk.insert(schema_key.to_owned(), writer); + } + }; + guard.mem.push(schema_key, record); + } else { + guard.mem.push(schema_key, record); + } + + Ok(()) + } + + pub fn path_by_current_time( + &self, + stream_hash: &str, + parsed_timestamp: NaiveDateTime, + custom_partition_values: &HashMap, + ) -> PathBuf { + let mut hostname = hostname::get().unwrap().into_string().unwrap(); + if self.options.mode == Mode::Ingest { + hostname.push_str(&INGESTOR_META.get_ingestor_id()); + } + let filename = format!( + "{}{stream_hash}.date={}.hour={:02}.minute={}.{}.{hostname}.{ARROW_FILE_EXTENSION}", + Utc::now().format("%Y%m%dT%H%M"), + parsed_timestamp.date(), + parsed_timestamp.hour(), + 
minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), + custom_partition_values + .iter() + .sorted_by_key(|v| v.0) + .map(|(key, value)| format!("{key}={value}")) + .join(".") + ); + self.data_path.join(filename) + } + + pub fn arrow_files(&self) -> Vec { + let Ok(dir) = self.data_path.read_dir() else { + return vec![]; + }; + + let paths = dir + .flatten() + .map(|file| file.path()) + .filter(|file| file.extension().is_some_and(|ext| ext.eq("arrows"))) + .sorted_by_key(|f| f.metadata().unwrap().modified().unwrap()) + .collect(); + + paths + } + + #[allow(dead_code)] + pub fn arrow_files_grouped_by_time(&self) -> HashMap> { + // hashmap + let mut grouped_arrow_file: HashMap> = HashMap::new(); + let arrow_files = self.arrow_files(); + for arrow_file_path in arrow_files { + let key = Self::arrow_path_to_parquet(&arrow_file_path, ""); + grouped_arrow_file + .entry(key) + .or_default() + .push(arrow_file_path); + } + + grouped_arrow_file + } + + pub fn arrow_files_grouped_exclude_time( + &self, + exclude: NaiveDateTime, + stream: &str, + shutdown_signal: bool, + ) -> HashMap> { + let mut grouped_arrow_file: HashMap> = HashMap::new(); + let mut arrow_files = self.arrow_files(); + + if !shutdown_signal { + arrow_files.retain(|path| { + !path + .file_name() + .unwrap() + .to_str() + .unwrap() + .starts_with(&exclude.format("%Y%m%dT%H%M").to_string()) + }); + } + + let random_string = + rand::distributions::Alphanumeric.sample_string(&mut rand::thread_rng(), 15); + for arrow_file_path in arrow_files { + if arrow_file_path.metadata().unwrap().len() == 0 { + error!( + "Invalid arrow file {:?} detected for stream {}, removing it", + &arrow_file_path, stream + ); + remove_file(&arrow_file_path).unwrap(); + } else { + let key = Self::arrow_path_to_parquet(&arrow_file_path, &random_string); + grouped_arrow_file + .entry(key) + .or_default() + .push(arrow_file_path); + } + } + grouped_arrow_file + } + + pub fn parquet_files(&self) -> Vec { + let Ok(dir) = self.data_path.read_dir() else { + return vec![]; + }; + + dir.flatten() + .map(|file| file.path()) + .filter(|file| file.extension().is_some_and(|ext| ext.eq("parquet"))) + .collect() + } + + fn arrow_path_to_parquet(path: &Path, random_string: &str) -> PathBuf { + let filename = path.file_stem().unwrap().to_str().unwrap(); + let (_, filename) = filename.split_once('.').unwrap(); + assert!(filename.contains('.'), "contains the delim `.`"); + let filename_with_random_number = format!("{filename}.{random_string}.arrows"); + let mut parquet_path = path.to_owned(); + parquet_path.set_file_name(filename_with_random_number); + parquet_path.set_extension("parquet"); + parquet_path + } + + fn recordbatches_cloned(&self, schema: &Arc) -> Vec { + self.writer.lock().unwrap().mem.recordbatch_cloned(schema) + } + + pub fn clear(&self) { + self.writer.lock().unwrap().mem.clear(); + } + + fn unset(self) { + let writer = self.writer.into_inner().unwrap(); + for mut writer in writer.disk.into_values() { + _ = writer.finish(); + } + } +} + +pub fn convert_disk_files_to_parquet( + stream: &str, + dir: &Stream, + time_partition: Option, + custom_partition: Option, + shutdown_signal: bool, +) -> Result, MoveDataError> { + let mut schemas = Vec::new(); + + let time = chrono::Utc::now().naive_utc(); + let staging_files = dir.arrow_files_grouped_exclude_time(time, stream, shutdown_signal); + if staging_files.is_empty() { + metrics::STAGING_FILES.with_label_values(&[stream]).set(0); + metrics::STORAGE_SIZE + .with_label_values(&["staging", stream, 
"arrows"]) + .set(0); + metrics::STORAGE_SIZE + .with_label_values(&["staging", stream, "parquet"]) + .set(0); + } + + // warn!("staging files-\n{staging_files:?}\n"); + for (parquet_path, files) in staging_files { + metrics::STAGING_FILES + .with_label_values(&[stream]) + .set(files.len() as i64); + + for file in &files { + let file_size = file.metadata().unwrap().len(); + let file_type = file.extension().unwrap().to_str().unwrap(); + + metrics::STORAGE_SIZE + .with_label_values(&["staging", stream, file_type]) + .add(file_size as i64); + } + + let record_reader = MergedReverseRecordReader::try_new(&files); + if record_reader.readers.is_empty() { + continue; + } + let merged_schema = record_reader.merged_schema(); + let mut index_time_partition: usize = 0; + if let Some(time_partition) = time_partition.as_ref() { + index_time_partition = merged_schema.index_of(time_partition).unwrap(); + } + let mut custom_partition_fields: HashMap = HashMap::new(); + if let Some(custom_partition) = custom_partition.as_ref() { + for custom_partition_field in custom_partition.split(',') { + let index = merged_schema.index_of(custom_partition_field).unwrap(); + custom_partition_fields.insert(custom_partition_field.to_string(), index); + } + } + let props = parquet_writer_props( + dir.options, + time_partition.clone(), + index_time_partition, + custom_partition_fields, + ) + .build(); + schemas.push(merged_schema.clone()); + let schema = Arc::new(merged_schema); + let parquet_file = OpenOptions::new() + .create(true) + .append(true) + .open(&parquet_path) + .map_err(|_| MoveDataError::Create)?; + let mut writer = ArrowWriter::try_new(&parquet_file, schema.clone(), Some(props))?; + for ref record in record_reader.merged_iter(schema, time_partition.clone()) { + writer.write(record)?; + } + + writer.close()?; + if parquet_file.metadata().unwrap().len() < parquet::file::FOOTER_SIZE as u64 { + error!( + "Invalid parquet file {:?} detected for stream {}, removing it", + &parquet_path, stream + ); + remove_file(parquet_path).unwrap(); + } else { + for file in files { + // warn!("file-\n{file:?}\n"); + let file_size = file.metadata().unwrap().len(); + let file_type = file.extension().unwrap().to_str().unwrap(); + if remove_file(file.clone()).is_err() { + error!("Failed to delete file. 
Unstable state"); + process::abort() + } + metrics::STORAGE_SIZE + .with_label_values(&["staging", stream, file_type]) + .sub(file_size as i64); + } + } + } + + if !schemas.is_empty() { + Ok(Some(Schema::try_merge(schemas).unwrap())) + } else { + Ok(None) + } +} + +fn parquet_writer_props( + options: &Options, + time_partition: Option, + index_time_partition: usize, + custom_partition_fields: HashMap, +) -> WriterPropertiesBuilder { + let index_time_partition: i32 = index_time_partition as i32; + let mut time_partition_field = DEFAULT_TIMESTAMP_KEY.to_string(); + if let Some(time_partition) = time_partition { + time_partition_field = time_partition; + } + let mut sorting_column_vec: Vec = Vec::new(); + sorting_column_vec.push(SortingColumn { + column_idx: index_time_partition, + descending: true, + nulls_first: true, + }); + let mut props = WriterProperties::builder() + .set_max_row_group_size(options.row_group_size) + .set_compression(options.parquet_compression.into()) + .set_column_encoding( + ColumnPath::new(vec![time_partition_field]), + Encoding::DELTA_BINARY_PACKED, + ); + + for (field, index) in custom_partition_fields { + let field = ColumnPath::new(vec![field]); + let encoding = Encoding::DELTA_BYTE_ARRAY; + props = props.set_column_encoding(field, encoding); + let sorting_column = SortingColumn { + column_idx: index as i32, + descending: true, + nulls_first: true, + }; + sorting_column_vec.push(sorting_column); + } + props = props.set_sorting_columns(Some(sorting_column_vec)); + + props +} + +#[derive(Deref, DerefMut, Default)] +pub struct Streams(RwLock>>); + +impl Streams { + // Concatenates record batches and puts them in memory store for each event. + pub fn append_to_local( + &self, + stream_name: &str, + schema_key: &str, + record: &RecordBatch, + parsed_timestamp: NaiveDateTime, + custom_partition_values: &HashMap, + stream_type: StreamType, + ) -> Result<(), StreamWriterError> { + if !self.read().unwrap().contains_key(stream_name) { + // Gets write privileges only for inserting a writer + self.write().unwrap().insert( + stream_name.to_owned(), + Stream::new(&CONFIG.options, stream_name), + ); + } + + // Updates the writer with only read privileges + self.read() + .unwrap() + .get(stream_name) + .expect("Stream exists") + .push( + schema_key, + record, + parsed_timestamp, + custom_partition_values, + stream_type, + ) + } + + pub fn clear(&self, stream_name: &str) { + if let Some(stream) = self.write().unwrap().get(stream_name) { + stream.clear(); + } + } + + pub fn delete_stream(&self, stream_name: &str) { + self.write().unwrap().remove(stream_name); + } + + pub fn unset_all(&self) { + let mut table = self.write().unwrap(); + let map = std::mem::take(&mut *table); + drop(table); + for staging in map.into_values() { + staging.unset() + } + } + + pub fn recordbatches_cloned( + &self, + stream_name: &str, + schema: &Arc, + ) -> Option> { + self.read() + .unwrap() + .get(stream_name) + .map(|staging| staging.recordbatches_cloned(schema)) + } +} + +#[cfg(test)] +mod tests { + use chrono::NaiveDate; + use temp_dir::TempDir; + + use super::*; + + #[test] + fn test_storage_dir_new_with_valid_stream() { + let stream_name = "test_stream"; + + let options = Options::default(); + let storage_dir = Stream::new(&options, stream_name); + + assert_eq!( + storage_dir.data_path, + options.local_stream_data_path(stream_name) + ); + } + + #[test] + fn test_storage_dir_with_special_characters() { + let stream_name = "test_stream_!@#$%^&*()"; + + let options = Options::default(); + let 
storage_dir = Stream::new(&options, stream_name); + + assert_eq!( + storage_dir.data_path, + options.local_stream_data_path(stream_name) + ); + } + + #[test] + fn test_storage_dir_data_path_initialization() { + let stream_name = "example_stream"; + + let options = Options::default(); + let storage_dir = Stream::new(&options, stream_name); + + assert_eq!( + storage_dir.data_path, + options.local_stream_data_path(stream_name) + ); + } + + #[test] + fn test_storage_dir_with_alphanumeric_stream_name() { + let stream_name = "test123stream"; + + let options = Options::default(); + let storage_dir = Stream::new(&options, stream_name); + + assert_eq!( + storage_dir.data_path, + options.local_stream_data_path(stream_name) + ); + } + + #[test] + fn test_arrow_files_empty_directory() { + let temp_dir = TempDir::new().unwrap(); + + let options = Options { + local_staging_path: temp_dir.path().to_path_buf(), + ..Default::default() + }; + let storage_dir = Stream::new(&options, "test_stream"); + + let files = storage_dir.arrow_files(); + + assert!(files.is_empty()); + } + + #[test] + fn generate_correct_path_with_current_time_and_valid_parameters() { + let stream_name = "test_stream"; + let stream_hash = "abc123"; + let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) + .unwrap() + .and_hms_opt(12, 30, 0) + .unwrap(); + let mut custom_partition_values = HashMap::new(); + custom_partition_values.insert("key1".to_string(), "value1".to_string()); + custom_partition_values.insert("key2".to_string(), "value2".to_string()); + + let options = Options::default(); + let storage_dir = Stream::new(&options, stream_name); + + let expected_path = storage_dir.data_path.join(format!( + "{}{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.data.arrows", + Utc::now().format("%Y%m%dT%H%M"), + parsed_timestamp.date(), + parsed_timestamp.hour(), + minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), + hostname::get().unwrap().into_string().unwrap() + )); + + let generated_path = storage_dir.path_by_current_time( + stream_hash, + parsed_timestamp, + &custom_partition_values, + ); + + assert_eq!(generated_path, expected_path); + } +} diff --git a/src/staging/writer/mem_writer.rs b/src/staging/writer.rs similarity index 89% rename from src/staging/writer/mem_writer.rs rename to src/staging/writer.rs index d24077333..e8d3eff9c 100644 --- a/src/staging/writer/mem_writer.rs +++ b/src/staging/writer.rs @@ -14,17 +14,37 @@ * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . * + * */ -use std::{collections::HashSet, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + fs::File, + sync::Arc, +}; use arrow_array::RecordBatch; +use arrow_ipc::writer::StreamWriter; use arrow_schema::Schema; use arrow_select::concat::concat_batches; use itertools::Itertools; use crate::utils::arrow::adapt_batch; +#[derive(Debug, thiserror::Error)] +pub enum StreamWriterError { + #[error("Arrow writer failed: {0}")] + Writer(#[from] arrow_schema::ArrowError), + #[error("Io Error when creating new file: {0}")] + Io(#[from] std::io::Error), +} + +#[derive(Default)] +pub struct Writer { + pub mem: MemWriter<16384>, + pub disk: HashMap>, +} + /// Structure to keep recordbatches in memory. /// /// Any new schema is updated in the schema map. 
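Before the old writer modules are deleted below, a minimal sketch of how an event flows through the consolidated staging API introduced in this patch; it is not part of the patch series, and the stream name, schema key, column name and the use of `StreamType::Internal` are illustrative placeholders only:

    use std::{collections::HashMap, sync::Arc};

    use arrow_array::{ArrayRef, Int64Array, RecordBatch};
    use arrow_schema::{DataType, Field, Schema};
    use chrono::Utc;

    use crate::{staging::STAGING, storage::StreamType};

    fn stage_example_event() -> Result<(), crate::staging::StreamWriterError> {
        // Build a single-column batch standing in for a parsed log event.
        let schema = Arc::new(Schema::new(vec![Field::new("value", DataType::Int64, false)]));
        let columns: Vec<ArrayRef> = vec![Arc::new(Int64Array::from(vec![1, 2, 3]))];
        let batch = RecordBatch::try_new(schema, columns).expect("column type matches the schema");

        // Route the batch into staging: it is always buffered in `Writer::mem`, and
        // outside Query mode (or for internal streams) it is also appended to the
        // `.arrows` file tracked in `Writer::disk`.
        STAGING.append_to_local(
            "example_stream",     // placeholder stream name
            "example_schema_key", // placeholder schema hash key
            &batch,
            Utc::now().naive_utc(),
            &HashMap::new(),
            StreamType::Internal, // the only variant visible in this patch
        )
    }

On shutdown or the local-sync tick, `STAGING.unset_all()` then finishes the on-disk stream writers, as wired up in src/sync.rs and src/handlers/http/health_check.rs earlier in this patch.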
diff --git a/src/staging/writer/file_writer.rs b/src/staging/writer/file_writer.rs deleted file mode 100644 index 4b8e013d1..000000000 --- a/src/staging/writer/file_writer.rs +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Parseable Server (C) 2022 - 2024 Parseable, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * - */ - -use arrow_array::RecordBatch; -use arrow_ipc::writer::StreamWriter; -use derive_more::{Deref, DerefMut}; -use std::collections::HashMap; -use std::fs::{File, OpenOptions}; -use std::path::PathBuf; - -use super::StreamWriterError; -use crate::option::CONFIG; -use crate::staging::Staging; -use chrono::NaiveDateTime; - -pub struct ArrowWriter { - #[allow(dead_code)] - pub file_path: PathBuf, - pub writer: StreamWriter, -} - -#[derive(Deref, DerefMut, Default)] -pub struct FileWriter(HashMap); - -impl FileWriter { - // append to a existing stream - pub fn push( - &mut self, - stream_name: &str, - schema_key: &str, - record: &RecordBatch, - parsed_timestamp: NaiveDateTime, - custom_partition_values: &HashMap, - ) -> Result<(), StreamWriterError> { - match self.get_mut(schema_key) { - Some(writer) => { - writer.writer.write(record)?; - } - // entry is not present thus we create it - None => { - // this requires mutable borrow of the map so we drop this read lock and wait for write lock - let (path, writer) = init_new_stream_writer_file( - stream_name, - schema_key, - record, - parsed_timestamp, - custom_partition_values, - )?; - self.insert( - schema_key.to_owned(), - ArrowWriter { - file_path: path, - writer, - }, - ); - } - }; - - Ok(()) - } - - pub fn close_all(self) { - for mut writer in self.0.into_values() { - _ = writer.writer.finish(); - } - } -} - -fn init_new_stream_writer_file( - stream_name: &str, - schema_key: &str, - record: &RecordBatch, - parsed_timestamp: NaiveDateTime, - custom_partition_values: &HashMap, -) -> Result<(PathBuf, StreamWriter), StreamWriterError> { - let dir = Staging::new(&CONFIG.options, stream_name); - let path = dir.path_by_current_time(schema_key, parsed_timestamp, custom_partition_values); - std::fs::create_dir_all(dir.data_path)?; - - let file = OpenOptions::new().create(true).append(true).open(&path)?; - - let mut stream_writer = StreamWriter::try_new(file, &record.schema()) - .expect("File and RecordBatch both are checked"); - - stream_writer.write(record)?; - Ok((path, stream_writer)) -} diff --git a/src/staging/writer/mod.rs b/src/staging/writer/mod.rs deleted file mode 100644 index 472f6ad44..000000000 --- a/src/staging/writer/mod.rs +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Parseable Server (C) 2022 - 2024 Parseable, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * - */ - -mod file_writer; -mod mem_writer; - -use std::{ - collections::HashMap, - sync::{Arc, Mutex, RwLock}, -}; - -use crate::{ - option::{Mode, CONFIG}, - storage::StreamType, -}; - -use self::{file_writer::FileWriter, mem_writer::MemWriter}; -use arrow_array::RecordBatch; -use arrow_schema::Schema; -use chrono::NaiveDateTime; -use derive_more::{Deref, DerefMut}; - -#[derive(Debug, thiserror::Error)] -pub enum StreamWriterError { - #[error("Arrow writer failed: {0}")] - Writer(#[from] arrow_schema::ArrowError), - #[error("Io Error when creating new file: {0}")] - Io(#[from] std::io::Error), -} - -#[derive(Default)] -pub struct Writer { - pub mem: MemWriter<16384>, - pub disk: FileWriter, -} - -impl Writer { - fn push( - &mut self, - stream_name: &str, - schema_key: &str, - rb: &RecordBatch, - parsed_timestamp: NaiveDateTime, - custom_partition_values: &HashMap, - ) -> Result<(), StreamWriterError> { - self.disk.push( - stream_name, - schema_key, - rb, - parsed_timestamp, - custom_partition_values, - )?; - self.mem.push(schema_key, rb); - Ok(()) - } - - fn push_mem(&mut self, schema_key: &str, rb: &RecordBatch) -> Result<(), StreamWriterError> { - self.mem.push(schema_key, rb); - Ok(()) - } -} - -#[derive(Deref, DerefMut, Default)] -pub struct WriterTable(RwLock>>); - -impl WriterTable { - // Concatenates record batches and puts them in memory store for each event. 
- pub fn append_to_local( - &self, - stream_name: &str, - schema_key: &str, - record: &RecordBatch, - parsed_timestamp: NaiveDateTime, - custom_partition_values: &HashMap, - stream_type: &StreamType, - ) -> Result<(), StreamWriterError> { - if !self.read().unwrap().contains_key(stream_name) { - // Gets write privileges only for inserting a writer - self.write() - .unwrap() - .insert(stream_name.to_owned(), Mutex::new(Writer::default())); - } - - // Updates the writer with only read privileges - self.handle_existing_writer( - stream_name, - schema_key, - record, - parsed_timestamp, - custom_partition_values, - stream_type, - )?; - - Ok(()) - } - - /// Update writer for stream when it already exists - fn handle_existing_writer( - &self, - stream_name: &str, - schema_key: &str, - record: &RecordBatch, - parsed_timestamp: NaiveDateTime, - custom_partition_values: &HashMap, - stream_type: &StreamType, - ) -> Result<(), StreamWriterError> { - let hashmap_guard = self.read().unwrap(); - let mut writer = hashmap_guard - .get(stream_name) - .expect("Stream exists") - .lock() - .unwrap(); - if CONFIG.options.mode != Mode::Query || *stream_type == StreamType::Internal { - writer.push( - stream_name, - schema_key, - record, - parsed_timestamp, - custom_partition_values, - )?; - } else { - writer.push_mem(stream_name, record)?; - } - - Ok(()) - } - - pub fn clear(&self, stream_name: &str) { - let map = self.write().unwrap(); - if let Some(writer) = map.get(stream_name) { - let w = &mut writer.lock().unwrap().mem; - w.clear(); - } - } - - pub fn delete_stream(&self, stream_name: &str) { - self.write().unwrap().remove(stream_name); - } - - pub fn unset_all(&self) { - let mut table = self.write().unwrap(); - let map = std::mem::take(&mut *table); - drop(table); - for writer in map.into_values() { - let writer = writer.into_inner().unwrap(); - writer.disk.close_all(); - } - } - - pub fn recordbatches_cloned( - &self, - stream_name: &str, - schema: &Arc, - ) -> Option> { - let records = self - .0 - .read() - .unwrap() - .get(stream_name)? 
- .lock() - .unwrap() - .mem - .recordbatch_cloned(schema); - - Some(records) - } -} diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index 8656e3eb4..a5a06710a 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -31,7 +31,7 @@ use crate::handlers::http::users::{CORRELATION_DIR, DASHBOARDS_DIR, FILTER_DIR, use crate::metadata::SchemaVersion; use crate::metrics::{EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_STORAGE_SIZE}; use crate::option::Mode; -use crate::staging::{convert_disk_files_to_parquet, Staging}; +use crate::staging::{convert_disk_files_to_parquet, Stream}; use crate::{ alerts::Alerts, catalog::{self, manifest::Manifest, snapshot::Snapshot}, @@ -551,7 +551,7 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static { let custom_partition = STREAM_INFO .get_custom_partition(stream) .map_err(|err| ObjectStorageError::UnhandledError(Box::new(err)))?; - let dir = Staging::new(&CONFIG.options, stream); + let dir = Stream::new(&CONFIG.options, stream); let schema = convert_disk_files_to_parquet( stream, &dir, diff --git a/src/sync.rs b/src/sync.rs index d1c04aa3a..c45030c07 100644 --- a/src/sync.rs +++ b/src/sync.rs @@ -24,7 +24,7 @@ use tokio::time::{interval, sleep, Duration}; use tracing::{error, info, warn}; use crate::option::CONFIG; -use crate::staging::STREAM_WRITERS; +use crate::staging::STAGING; use crate::{storage, STORAGE_UPLOAD_INTERVAL}; pub async fn object_store_sync() -> ( @@ -98,7 +98,7 @@ pub async fn run_local_sync() -> ( scheduler .every((storage::LOCAL_SYNC_INTERVAL as u32).seconds()) .run(|| async { - STREAM_WRITERS.unset_all(); + STAGING.unset_all(); }); loop { From f0b18e954f729dfb2634039b12d2b73b90307fdc Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 23 Jan 2025 20:10:44 +0530 Subject: [PATCH 10/34] test: empty staging --- src/staging/streams.rs | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/staging/streams.rs b/src/staging/streams.rs index 69807e691..5e0b93828 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -530,7 +530,7 @@ mod tests { let storage_dir = Stream::new(&options, stream_name); let expected_path = storage_dir.data_path.join(format!( - "{}{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.data.arrows", + "{}{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}{ARROW_FILE_EXTENSION}", Utc::now().format("%Y%m%dT%H%M"), parsed_timestamp.date(), parsed_timestamp.hour(), @@ -546,4 +546,29 @@ mod tests { assert_eq!(generated_path, expected_path); } + + #[test] + fn test_convert_files_with_empty_staging() -> Result<(), MoveDataError> { + let temp_dir = TempDir::new()?; + let options = Options { + local_staging_path: temp_dir.path().to_path_buf(), + ..Default::default() + }; + let stream = "test_stream".to_string(); + let storage_dir = Stream::new(&options, &stream); + let result = convert_disk_files_to_parquet(&stream, &storage_dir, None, None, false)?; + assert!(result.is_none()); + // Verify metrics were set to 0 + let staging_files = metrics::STAGING_FILES.with_label_values(&[&stream]).get(); + assert_eq!(staging_files, 0); + let storage_size_arrows = metrics::STORAGE_SIZE + .with_label_values(&["staging", &stream, "arrows"]) + .get(); + assert_eq!(storage_size_arrows, 0); + let storage_size_parquet = metrics::STORAGE_SIZE + .with_label_values(&["staging", &stream, "parquet"]) + .get(); + assert_eq!(storage_size_parquet, 0); + Ok(()) + } } From d15055d2ac2bcaafafe7be643e52d70f1add7076 Mon 
Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Fri, 24 Jan 2025 20:33:08 +0530 Subject: [PATCH 11/34] fix: keep streams in memory --- src/handlers/http/health_check.rs | 2 +- src/staging/streams.rs | 17 ++++++++--------- src/sync.rs | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/handlers/http/health_check.rs b/src/handlers/http/health_check.rs index 855a6c9da..c9e55b5d5 100644 --- a/src/handlers/http/health_check.rs +++ b/src/handlers/http/health_check.rs @@ -58,7 +58,7 @@ pub async fn shutdown() { *shutdown_flag = true; // Sync staging - STAGING.unset_all(); + STAGING.flush_all(); } pub async fn readiness() -> HttpResponse { diff --git a/src/staging/streams.rs b/src/staging/streams.rs index 5e0b93828..167e61f15 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -223,9 +223,9 @@ impl<'a> Stream<'a> { self.writer.lock().unwrap().mem.clear(); } - fn unset(self) { - let writer = self.writer.into_inner().unwrap(); - for mut writer in writer.disk.into_values() { + fn flush(&self) { + let mut writer = self.writer.lock().unwrap(); + for writer in writer.disk.values_mut() { _ = writer.finish(); } } @@ -419,12 +419,11 @@ impl Streams { self.write().unwrap().remove(stream_name); } - pub fn unset_all(&self) { - let mut table = self.write().unwrap(); - let map = std::mem::take(&mut *table); - drop(table); - for staging in map.into_values() { - staging.unset() + pub fn flush_all(&self) { + let streams = self.read().unwrap(); + + for staging in streams.values() { + staging.flush() } } diff --git a/src/sync.rs b/src/sync.rs index c45030c07..d47ac441f 100644 --- a/src/sync.rs +++ b/src/sync.rs @@ -98,7 +98,7 @@ pub async fn run_local_sync() -> ( scheduler .every((storage::LOCAL_SYNC_INTERVAL as u32).seconds()) .run(|| async { - STAGING.unset_all(); + STAGING.flush_all(); }); loop { From 3154fb3d5e90adec8bf967804048c11dca9d81c0 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Fri, 24 Jan 2025 20:33:58 +0530 Subject: [PATCH 12/34] style: call it staging --- src/staging/streams.rs | 45 ++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/staging/streams.rs b/src/staging/streams.rs index 167e61f15..3f859b64f 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -447,53 +447,53 @@ mod tests { use super::*; #[test] - fn test_storage_dir_new_with_valid_stream() { + fn test_staging_new_with_valid_stream() { let stream_name = "test_stream"; let options = Options::default(); - let storage_dir = Stream::new(&options, stream_name); + let staging = Stream::new(&options, stream_name); assert_eq!( - storage_dir.data_path, + staging.data_path, options.local_stream_data_path(stream_name) ); } #[test] - fn test_storage_dir_with_special_characters() { + fn test_staging_with_special_characters() { let stream_name = "test_stream_!@#$%^&*()"; let options = Options::default(); - let storage_dir = Stream::new(&options, stream_name); + let staging = Stream::new(&options, stream_name); assert_eq!( - storage_dir.data_path, + staging.data_path, options.local_stream_data_path(stream_name) ); } #[test] - fn test_storage_dir_data_path_initialization() { + fn test_staging_data_path_initialization() { let stream_name = "example_stream"; let options = Options::default(); - let storage_dir = Stream::new(&options, stream_name); + let staging = Stream::new(&options, stream_name); assert_eq!( - storage_dir.data_path, + staging.data_path, options.local_stream_data_path(stream_name) ); } #[test] - fn 
test_storage_dir_with_alphanumeric_stream_name() { + fn test_staging_with_alphanumeric_stream_name() { let stream_name = "test123stream"; let options = Options::default(); - let storage_dir = Stream::new(&options, stream_name); + let staging = Stream::new(&options, stream_name); assert_eq!( - storage_dir.data_path, + staging.data_path, options.local_stream_data_path(stream_name) ); } @@ -506,9 +506,9 @@ mod tests { local_staging_path: temp_dir.path().to_path_buf(), ..Default::default() }; - let storage_dir = Stream::new(&options, "test_stream"); + let staging = Stream::new(&options, "test_stream"); - let files = storage_dir.arrow_files(); + let files = staging.arrow_files(); assert!(files.is_empty()); } @@ -526,9 +526,9 @@ mod tests { custom_partition_values.insert("key2".to_string(), "value2".to_string()); let options = Options::default(); - let storage_dir = Stream::new(&options, stream_name); + let staging = Stream::new(&options, stream_name); - let expected_path = storage_dir.data_path.join(format!( + let expected_path = staging.data_path.join(format!( "{}{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}{ARROW_FILE_EXTENSION}", Utc::now().format("%Y%m%dT%H%M"), parsed_timestamp.date(), @@ -537,25 +537,22 @@ mod tests { hostname::get().unwrap().into_string().unwrap() )); - let generated_path = storage_dir.path_by_current_time( - stream_hash, - parsed_timestamp, - &custom_partition_values, - ); + let generated_path = + staging.path_by_current_time(stream_hash, parsed_timestamp, &custom_partition_values); assert_eq!(generated_path, expected_path); } #[test] - fn test_convert_files_with_empty_staging() -> Result<(), MoveDataError> { + fn test_convert_to_parquet_with_empty_staging() -> Result<(), MoveDataError> { let temp_dir = TempDir::new()?; let options = Options { local_staging_path: temp_dir.path().to_path_buf(), ..Default::default() }; let stream = "test_stream".to_string(); - let storage_dir = Stream::new(&options, &stream); - let result = convert_disk_files_to_parquet(&stream, &storage_dir, None, None, false)?; + let staging = Stream::new(&options, &stream); + let result = convert_disk_files_to_parquet(&stream, &staging, None, None, false)?; assert!(result.is_none()); // Verify metrics were set to 0 let staging_files = metrics::STAGING_FILES.with_label_values(&[&stream]).get(); From d47f3252e80276699ab92efa3381831b4bb5c2b1 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Fri, 24 Jan 2025 11:51:54 +0530 Subject: [PATCH 13/34] doc: make it clear what the code is doing --- src/event/mod.rs | 6 +-- src/query/stream_schema_provider.rs | 3 +- src/staging/streams.rs | 70 +++++++++++------------------ 3 files changed, 30 insertions(+), 49 deletions(-) diff --git a/src/event/mod.rs b/src/event/mod.rs index dd6014b1a..2f0299728 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -64,8 +64,7 @@ impl Event { commit_schema(&self.stream_name, self.rb.schema())?; } - STAGING.append_to_local( - &self.stream_name, + STAGING.get_or_create_stream(&self.stream_name).push( &key, &self.rb, self.parsed_timestamp, @@ -96,8 +95,7 @@ impl Event { pub fn process_unchecked(&self) -> Result<(), EventError> { let key = get_schema_key(&self.rb.schema().fields); - STAGING.append_to_local( - &self.stream_name, + STAGING.get_or_create_stream(&self.stream_name).push( &key, &self.rb, self.parsed_timestamp, diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index 9ca170c2a..d2303ddfd 100644 --- a/src/query/stream_schema_provider.rs +++ 
b/src/query/stream_schema_provider.rs @@ -441,7 +441,8 @@ impl TableProvider for StandardTableProvider { } if include_now(filters, &time_partition) { - if let Some(records) = STAGING.recordbatches_cloned(&self.stream, &self.schema) { + if let Some(staging) = STAGING.get_stream(&self.stream) { + let records = staging.recordbatches_cloned(&self.schema); let reversed_mem_table = reversed_mem_table(records, self.schema.clone())?; let memory_exec = reversed_mem_table diff --git a/src/staging/streams.rs b/src/staging/streams.rs index 3f859b64f..6328cbce5 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -55,6 +55,9 @@ use super::{writer::Writer, MoveDataError, StreamWriterError}; const ARROW_FILE_EXTENSION: &str = "data.arrows"; +pub type StreamRef<'a> = Arc>; + +/// State of staging associated with a single stream of data in parseable. pub struct Stream<'a> { pub data_path: PathBuf, pub options: &'a Options, @@ -62,15 +65,16 @@ pub struct Stream<'a> { } impl<'a> Stream<'a> { - pub fn new(options: &'a Options, stream_name: &str) -> Self { - Self { + pub fn new(options: &'a Options, stream_name: &str) -> StreamRef<'a> { + Arc::new(Self { data_path: options.local_stream_data_path(stream_name), options, writer: Mutex::new(Writer::default()), - } + }) } - fn push( + // Concatenates record batches and puts them in memory store for each event. + pub fn push( &self, schema_key: &str, record: &RecordBatch, @@ -215,7 +219,7 @@ impl<'a> Stream<'a> { parquet_path } - fn recordbatches_cloned(&self, schema: &Arc) -> Vec { + pub fn recordbatches_cloned(&self, schema: &Arc) -> Vec { self.writer.lock().unwrap().mem.recordbatch_cloned(schema) } @@ -374,39 +378,28 @@ fn parquet_writer_props( } #[derive(Deref, DerefMut, Default)] -pub struct Streams(RwLock>>); +pub struct Streams(RwLock>>); impl Streams { - // Concatenates record batches and puts them in memory store for each event. - pub fn append_to_local( - &self, - stream_name: &str, - schema_key: &str, - record: &RecordBatch, - parsed_timestamp: NaiveDateTime, - custom_partition_values: &HashMap, - stream_type: StreamType, - ) -> Result<(), StreamWriterError> { - if !self.read().unwrap().contains_key(stream_name) { - // Gets write privileges only for inserting a writer - self.write().unwrap().insert( - stream_name.to_owned(), - Stream::new(&CONFIG.options, stream_name), - ); + /// Try to get the handle of a stream in staging, if it doesn't exist return `None`. + pub fn get_stream(&self, stream_name: &str) -> Option> { + self.read().unwrap().get(stream_name).cloned() + } + + /// Get the handle to a stream in staging, create one if it doesn't exist + pub fn get_or_create_stream(&self, stream_name: &str) -> StreamRef<'static> { + if let Some(staging) = self.get_stream(stream_name) { + return staging; } - // Updates the writer with only read privileges - self.read() + let staging = Stream::new(&CONFIG.options, stream_name); + + // Gets write privileges only for creating the stream when it doesn't already exist. 
+ self.write() .unwrap() - .get(stream_name) - .expect("Stream exists") - .push( - schema_key, - record, - parsed_timestamp, - custom_partition_values, - stream_type, - ) + .insert(stream_name.to_owned(), staging.clone()); + + staging } pub fn clear(&self, stream_name: &str) { @@ -426,17 +419,6 @@ impl Streams { staging.flush() } } - - pub fn recordbatches_cloned( - &self, - stream_name: &str, - schema: &Arc, - ) -> Option> { - self.read() - .unwrap() - .get(stream_name) - .map(|staging| staging.recordbatches_cloned(schema)) - } } #[cfg(test)] From 481c98546ae46bbfe6e5717759d55a393c8c8b3f Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Fri, 24 Jan 2025 21:12:36 +0530 Subject: [PATCH 14/34] refactor: move code to where it truly belongs --- src/event/mod.rs | 4 ---- src/handlers/airplane.rs | 8 ++++++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/event/mod.rs b/src/event/mod.rs index 2f0299728..1dec203c6 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -105,10 +105,6 @@ impl Event { Ok(()) } - - pub fn clear(&self, stream_name: &str) { - STAGING.clear(stream_name); - } } pub fn get_schema_key(fields: &[Arc]) -> String { diff --git a/src/handlers/airplane.rs b/src/handlers/airplane.rs index 5edfdba21..448c986ef 100644 --- a/src/handlers/airplane.rs +++ b/src/handlers/airplane.rs @@ -39,6 +39,7 @@ use crate::handlers::livetail::cross_origin_config; use crate::metrics::QUERY_EXECUTE_TIME; use crate::option::CONFIG; use crate::query::{TableScanVisitor, QUERY_SESSION}; +use crate::staging::STAGING; use crate::utils::arrow::flight::{ append_temporary_events, get_query_from_ticket, into_flight_data, run_do_get_rpc, send_to_ingester, @@ -231,10 +232,12 @@ impl FlightService for AirServiceImpl { .collect::>(); let schema = Schema::try_merge(schemas).map_err(|err| Status::internal(err.to_string()))?; */ + // Taxi out airplane let out = into_flight_data(records); - if let Some(event) = event { - event.clear(&stream_name); + if event.is_some() { + // Clear staging of stream once airplane has taxied + STAGING.clear(&stream_name); } let time = time.elapsed().as_secs_f64(); @@ -242,6 +245,7 @@ impl FlightService for AirServiceImpl { .with_label_values(&[&format!("flight-query-{}", stream_name)]) .observe(time); + // Airplane takes off 🛫 out } From f89c143ca00041a2c7dc2dc642c9a9348de8594f Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Fri, 24 Jan 2025 21:58:33 +0530 Subject: [PATCH 15/34] use `STAGING` --- src/storage/object_storage.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index a5a06710a..eb42a17e8 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -31,7 +31,7 @@ use crate::handlers::http::users::{CORRELATION_DIR, DASHBOARDS_DIR, FILTER_DIR, use crate::metadata::SchemaVersion; use crate::metrics::{EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_STORAGE_SIZE}; use crate::option::Mode; -use crate::staging::{convert_disk_files_to_parquet, Stream}; +use crate::staging::{convert_disk_files_to_parquet, STAGING}; use crate::{ alerts::Alerts, catalog::{self, manifest::Manifest, snapshot::Snapshot}, @@ -551,10 +551,10 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static { let custom_partition = STREAM_INFO .get_custom_partition(stream) .map_err(|err| ObjectStorageError::UnhandledError(Box::new(err)))?; - let dir = Stream::new(&CONFIG.options, stream); + let staging = STAGING.get_or_create_stream(stream); let schema = convert_disk_files_to_parquet( 
stream, - &dir, + &staging, time_partition, custom_partition.clone(), shutdown_signal, @@ -570,8 +570,7 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static { } } - let parquet_files = dir.parquet_files(); - for file in parquet_files { + for file in staging.parquet_files() { let filename = file .file_name() .expect("only parquet files are returned by iterator") From b750c0f593376943ff4740b035692221b866390f Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Fri, 24 Jan 2025 22:01:41 +0530 Subject: [PATCH 16/34] don't request what you already know --- src/staging/mod.rs | 3 +-- src/staging/streams.rs | 39 ++++++++++++++++++++--------------- src/storage/object_storage.rs | 1 - 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/staging/mod.rs b/src/staging/mod.rs index 77b0d1209..15d07feb5 100644 --- a/src/staging/mod.rs +++ b/src/staging/mod.rs @@ -28,8 +28,7 @@ use base64::Engine; use once_cell::sync::Lazy; use parquet::errors::ParquetError; use serde_json::Value as JsonValue; -pub use streams::convert_disk_files_to_parquet; -pub use streams::{Stream, Streams}; +pub use streams::{convert_disk_files_to_parquet, Stream, Streams}; use tracing::{error, info}; pub use writer::StreamWriterError; diff --git a/src/staging/streams.rs b/src/staging/streams.rs index 6328cbce5..846cd2e91 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -59,15 +59,20 @@ pub type StreamRef<'a> = Arc>; /// State of staging associated with a single stream of data in parseable. pub struct Stream<'a> { + pub stream_name: String, pub data_path: PathBuf, pub options: &'a Options, pub writer: Mutex, } impl<'a> Stream<'a> { - pub fn new(options: &'a Options, stream_name: &str) -> StreamRef<'a> { + pub fn new(options: &'a Options, stream_name: impl Into) -> StreamRef<'a> { + let stream_name = stream_name.into(); + let data_path = options.local_stream_data_path(&stream_name); + Arc::new(Self { - data_path: options.local_stream_data_path(stream_name), + stream_name, + data_path, options, writer: Mutex::new(Writer::default()), }) @@ -160,7 +165,6 @@ impl<'a> Stream<'a> { pub fn arrow_files_grouped_exclude_time( &self, exclude: NaiveDateTime, - stream: &str, shutdown_signal: bool, ) -> HashMap> { let mut grouped_arrow_file: HashMap> = HashMap::new(); @@ -183,7 +187,7 @@ impl<'a> Stream<'a> { if arrow_file_path.metadata().unwrap().len() == 0 { error!( "Invalid arrow file {:?} detected for stream {}, removing it", - &arrow_file_path, stream + &arrow_file_path, self.stream_name ); remove_file(&arrow_file_path).unwrap(); } else { @@ -235,9 +239,8 @@ impl<'a> Stream<'a> { } } -pub fn convert_disk_files_to_parquet( - stream: &str, - dir: &Stream, +pub fn convert_disk_files_to_parquet<'a>( + staging: &StreamRef<'a>, time_partition: Option, custom_partition: Option, shutdown_signal: bool, @@ -245,21 +248,23 @@ pub fn convert_disk_files_to_parquet( let mut schemas = Vec::new(); let time = chrono::Utc::now().naive_utc(); - let staging_files = dir.arrow_files_grouped_exclude_time(time, stream, shutdown_signal); + let staging_files = staging.arrow_files_grouped_exclude_time(time, shutdown_signal); if staging_files.is_empty() { - metrics::STAGING_FILES.with_label_values(&[stream]).set(0); + metrics::STAGING_FILES + .with_label_values(&[&staging.stream_name]) + .set(0); metrics::STORAGE_SIZE - .with_label_values(&["staging", stream, "arrows"]) + .with_label_values(&["staging", &staging.stream_name, "arrows"]) .set(0); metrics::STORAGE_SIZE - .with_label_values(&["staging", stream, "parquet"]) + 
.with_label_values(&["staging", &staging.stream_name, "parquet"]) .set(0); } // warn!("staging files-\n{staging_files:?}\n"); for (parquet_path, files) in staging_files { metrics::STAGING_FILES - .with_label_values(&[stream]) + .with_label_values(&[&staging.stream_name]) .set(files.len() as i64); for file in &files { @@ -267,7 +272,7 @@ pub fn convert_disk_files_to_parquet( let file_type = file.extension().unwrap().to_str().unwrap(); metrics::STORAGE_SIZE - .with_label_values(&["staging", stream, file_type]) + .with_label_values(&["staging", &staging.stream_name, file_type]) .add(file_size as i64); } @@ -288,7 +293,7 @@ pub fn convert_disk_files_to_parquet( } } let props = parquet_writer_props( - dir.options, + staging.options, time_partition.clone(), index_time_partition, custom_partition_fields, @@ -310,7 +315,7 @@ pub fn convert_disk_files_to_parquet( if parquet_file.metadata().unwrap().len() < parquet::file::FOOTER_SIZE as u64 { error!( "Invalid parquet file {:?} detected for stream {}, removing it", - &parquet_path, stream + &parquet_path, &staging.stream_name ); remove_file(parquet_path).unwrap(); } else { @@ -323,7 +328,7 @@ pub fn convert_disk_files_to_parquet( process::abort() } metrics::STORAGE_SIZE - .with_label_values(&["staging", stream, file_type]) + .with_label_values(&["staging", &staging.stream_name, file_type]) .sub(file_size as i64); } } @@ -534,7 +539,7 @@ mod tests { }; let stream = "test_stream".to_string(); let staging = Stream::new(&options, &stream); - let result = convert_disk_files_to_parquet(&stream, &staging, None, None, false)?; + let result = convert_disk_files_to_parquet(&staging, None, None, false)?; assert!(result.is_none()); // Verify metrics were set to 0 let staging_files = metrics::STAGING_FILES.with_label_values(&[&stream]).get(); diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index eb42a17e8..6b91985cb 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -553,7 +553,6 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static { .map_err(|err| ObjectStorageError::UnhandledError(Box::new(err)))?; let staging = STAGING.get_or_create_stream(stream); let schema = convert_disk_files_to_parquet( - stream, &staging, time_partition, custom_partition.clone(), From 747c79e548acc27ec090cf5fe43c168bfa808da0 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Fri, 24 Jan 2025 22:34:52 +0530 Subject: [PATCH 17/34] refactor: convert into method --- src/staging/mod.rs | 2 +- src/staging/streams.rs | 232 +++++++++++++++++----------------- src/storage/object_storage.rs | 16 +-- 3 files changed, 123 insertions(+), 127 deletions(-) diff --git a/src/staging/mod.rs b/src/staging/mod.rs index 15d07feb5..8b5175e90 100644 --- a/src/staging/mod.rs +++ b/src/staging/mod.rs @@ -28,7 +28,7 @@ use base64::Engine; use once_cell::sync::Lazy; use parquet::errors::ParquetError; use serde_json::Value as JsonValue; -pub use streams::{convert_disk_files_to_parquet, Stream, Streams}; +pub use streams::{Stream, Streams}; use tracing::{error, info}; pub use writer::StreamWriterError; diff --git a/src/staging/streams.rs b/src/staging/streams.rs index 846cd2e91..af33ed69b 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -237,149 +237,145 @@ impl<'a> Stream<'a> { _ = writer.finish(); } } -} - -pub fn convert_disk_files_to_parquet<'a>( - staging: &StreamRef<'a>, - time_partition: Option, - custom_partition: Option, - shutdown_signal: bool, -) -> Result, MoveDataError> { - let mut schemas = Vec::new(); - - let time = 
chrono::Utc::now().naive_utc(); - let staging_files = staging.arrow_files_grouped_exclude_time(time, shutdown_signal); - if staging_files.is_empty() { - metrics::STAGING_FILES - .with_label_values(&[&staging.stream_name]) - .set(0); - metrics::STORAGE_SIZE - .with_label_values(&["staging", &staging.stream_name, "arrows"]) - .set(0); - metrics::STORAGE_SIZE - .with_label_values(&["staging", &staging.stream_name, "parquet"]) - .set(0); - } - - // warn!("staging files-\n{staging_files:?}\n"); - for (parquet_path, files) in staging_files { - metrics::STAGING_FILES - .with_label_values(&[&staging.stream_name]) - .set(files.len() as i64); - - for file in &files { - let file_size = file.metadata().unwrap().len(); - let file_type = file.extension().unwrap().to_str().unwrap(); + pub fn convert_disk_files_to_parquet( + &self, + time_partition: Option<&String>, + custom_partition: Option<&String>, + shutdown_signal: bool, + ) -> Result, MoveDataError> { + let mut schemas = Vec::new(); + + let time = chrono::Utc::now().naive_utc(); + let staging_files = self.arrow_files_grouped_exclude_time(time, shutdown_signal); + if staging_files.is_empty() { + metrics::STAGING_FILES + .with_label_values(&[&self.stream_name]) + .set(0); + metrics::STORAGE_SIZE + .with_label_values(&["staging", &self.stream_name, "arrows"]) + .set(0); metrics::STORAGE_SIZE - .with_label_values(&["staging", &staging.stream_name, file_type]) - .add(file_size as i64); + .with_label_values(&["staging", &self.stream_name, "parquet"]) + .set(0); } - let record_reader = MergedReverseRecordReader::try_new(&files); - if record_reader.readers.is_empty() { - continue; - } - let merged_schema = record_reader.merged_schema(); - let mut index_time_partition: usize = 0; - if let Some(time_partition) = time_partition.as_ref() { - index_time_partition = merged_schema.index_of(time_partition).unwrap(); - } - let mut custom_partition_fields: HashMap = HashMap::new(); - if let Some(custom_partition) = custom_partition.as_ref() { - for custom_partition_field in custom_partition.split(',') { - let index = merged_schema.index_of(custom_partition_field).unwrap(); - custom_partition_fields.insert(custom_partition_field.to_string(), index); - } - } - let props = parquet_writer_props( - staging.options, - time_partition.clone(), - index_time_partition, - custom_partition_fields, - ) - .build(); - schemas.push(merged_schema.clone()); - let schema = Arc::new(merged_schema); - let parquet_file = OpenOptions::new() - .create(true) - .append(true) - .open(&parquet_path) - .map_err(|_| MoveDataError::Create)?; - let mut writer = ArrowWriter::try_new(&parquet_file, schema.clone(), Some(props))?; - for ref record in record_reader.merged_iter(schema, time_partition.clone()) { - writer.write(record)?; - } + // warn!("staging files-\n{staging_files:?}\n"); + for (parquet_path, files) in staging_files { + metrics::STAGING_FILES + .with_label_values(&[&self.stream_name]) + .set(files.len() as i64); - writer.close()?; - if parquet_file.metadata().unwrap().len() < parquet::file::FOOTER_SIZE as u64 { - error!( - "Invalid parquet file {:?} detected for stream {}, removing it", - &parquet_path, &staging.stream_name - ); - remove_file(parquet_path).unwrap(); - } else { - for file in files { - // warn!("file-\n{file:?}\n"); + for file in &files { let file_size = file.metadata().unwrap().len(); let file_type = file.extension().unwrap().to_str().unwrap(); - if remove_file(file.clone()).is_err() { - error!("Failed to delete file. 
Unstable state"); - process::abort() - } + metrics::STORAGE_SIZE - .with_label_values(&["staging", &staging.stream_name, file_type]) - .sub(file_size as i64); + .with_label_values(&["staging", &self.stream_name, file_type]) + .add(file_size as i64); + } + + let record_reader = MergedReverseRecordReader::try_new(&files); + if record_reader.readers.is_empty() { + continue; + } + let merged_schema = record_reader.merged_schema(); + + let props = parquet_writer_props( + self.options, + &merged_schema, + time_partition, + custom_partition, + ) + .build(); + schemas.push(merged_schema.clone()); + let schema = Arc::new(merged_schema); + let parquet_file = OpenOptions::new() + .create(true) + .append(true) + .open(&parquet_path) + .map_err(|_| MoveDataError::Create)?; + let mut writer = ArrowWriter::try_new(&parquet_file, schema.clone(), Some(props))?; + for ref record in record_reader.merged_iter(schema, time_partition.cloned()) { + writer.write(record)?; + } + + writer.close()?; + if parquet_file.metadata().unwrap().len() < parquet::file::FOOTER_SIZE as u64 { + error!( + "Invalid parquet file {:?} detected for stream {}, removing it", + &parquet_path, &self.stream_name + ); + remove_file(parquet_path).unwrap(); + } else { + for file in files { + // warn!("file-\n{file:?}\n"); + let file_size = file.metadata().unwrap().len(); + let file_type = file.extension().unwrap().to_str().unwrap(); + if remove_file(file.clone()).is_err() { + error!("Failed to delete file. Unstable state"); + process::abort() + } + metrics::STORAGE_SIZE + .with_label_values(&["staging", &self.stream_name, file_type]) + .sub(file_size as i64); + } } } - } - if !schemas.is_empty() { + if schemas.is_empty() { + return Ok(None); + } + Ok(Some(Schema::try_merge(schemas).unwrap())) - } else { - Ok(None) } } fn parquet_writer_props( options: &Options, - time_partition: Option, - index_time_partition: usize, - custom_partition_fields: HashMap, + merged_schema: &Schema, + time_partition: Option<&String>, + custom_partition: Option<&String>, ) -> WriterPropertiesBuilder { - let index_time_partition: i32 = index_time_partition as i32; - let mut time_partition_field = DEFAULT_TIMESTAMP_KEY.to_string(); - if let Some(time_partition) = time_partition { - time_partition_field = time_partition; - } - let mut sorting_column_vec: Vec = Vec::new(); - sorting_column_vec.push(SortingColumn { - column_idx: index_time_partition, - descending: true, - nulls_first: true, - }); + // Determine time partition field + let time_partition_field = time_partition.map_or(DEFAULT_TIMESTAMP_KEY, |tp| tp.as_str()); + + // Find time partition index + let time_partition_idx = merged_schema.index_of(time_partition_field).unwrap_or(0); + let mut props = WriterProperties::builder() .set_max_row_group_size(options.row_group_size) .set_compression(options.parquet_compression.into()) .set_column_encoding( - ColumnPath::new(vec![time_partition_field]), + ColumnPath::new(vec![time_partition_field.to_string()]), Encoding::DELTA_BINARY_PACKED, ); - for (field, index) in custom_partition_fields { - let field = ColumnPath::new(vec![field]); - let encoding = Encoding::DELTA_BYTE_ARRAY; - props = props.set_column_encoding(field, encoding); - let sorting_column = SortingColumn { - column_idx: index as i32, - descending: true, - nulls_first: true, - }; - sorting_column_vec.push(sorting_column); + // Create sorting columns + let mut sorting_column_vec = vec![SortingColumn { + column_idx: time_partition_idx as i32, + descending: true, + nulls_first: true, + }]; + + // Describe 
custom partition column encodings and sorting + if let Some(custom_partition) = custom_partition { + for partition in custom_partition.split(',') { + if let Ok(idx) = merged_schema.index_of(partition) { + let column_path = ColumnPath::new(vec![partition.to_string()]); + props = props.set_column_encoding(column_path, Encoding::DELTA_BYTE_ARRAY); + + sorting_column_vec.push(SortingColumn { + column_idx: idx as i32, + descending: true, + nulls_first: true, + }); + } + } } - props = props.set_sorting_columns(Some(sorting_column_vec)); - props + // Set sorting columns + props.set_sorting_columns(Some(sorting_column_vec)) } #[derive(Deref, DerefMut, Default)] @@ -538,8 +534,8 @@ mod tests { ..Default::default() }; let stream = "test_stream".to_string(); - let staging = Stream::new(&options, &stream); - let result = convert_disk_files_to_parquet(&staging, None, None, false)?; + let result = + Stream::new(&options, &stream).convert_disk_files_to_parquet(None, None, false)?; assert!(result.is_none()); // Verify metrics were set to 0 let staging_files = metrics::STAGING_FILES.with_label_values(&[&stream]).get(); diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index 6b91985cb..da372b5ed 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -31,7 +31,7 @@ use crate::handlers::http::users::{CORRELATION_DIR, DASHBOARDS_DIR, FILTER_DIR, use crate::metadata::SchemaVersion; use crate::metrics::{EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_STORAGE_SIZE}; use crate::option::Mode; -use crate::staging::{convert_disk_files_to_parquet, STAGING}; +use crate::staging::STAGING; use crate::{ alerts::Alerts, catalog::{self, manifest::Manifest, snapshot::Snapshot}, @@ -552,13 +552,13 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static { .get_custom_partition(stream) .map_err(|err| ObjectStorageError::UnhandledError(Box::new(err)))?; let staging = STAGING.get_or_create_stream(stream); - let schema = convert_disk_files_to_parquet( - &staging, - time_partition, - custom_partition.clone(), - shutdown_signal, - ) - .map_err(|err| ObjectStorageError::UnhandledError(Box::new(err)))?; + let schema = staging + .convert_disk_files_to_parquet( + time_partition.as_ref(), + custom_partition.as_ref(), + shutdown_signal, + ) + .map_err(|err| ObjectStorageError::UnhandledError(Box::new(err)))?; if let Some(schema) = schema { let static_schema_flag = STREAM_INFO From 771b28e6bffb505cb4bad02bc64907d04a3a31e6 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Fri, 24 Jan 2025 22:43:33 +0530 Subject: [PATCH 18/34] test: fix path expectation --- src/staging/streams.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/staging/streams.rs b/src/staging/streams.rs index af33ed69b..41f377ded 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -512,7 +512,7 @@ mod tests { let staging = Stream::new(&options, stream_name); let expected_path = staging.data_path.join(format!( - "{}{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}{ARROW_FILE_EXTENSION}", + "{}{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.{ARROW_FILE_EXTENSION}", Utc::now().format("%Y%m%dT%H%M"), parsed_timestamp.date(), parsed_timestamp.hour(), From 64d3803b3de328a6430184cdadb5f7ce6b726970 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 29 Jan 2025 17:41:16 +0530 Subject: [PATCH 19/34] refactor: move out unrelated code --- src/handlers/http/modal/ingest_server.rs | 100 +++++++++++++++++++- src/handlers/http/modal/mod.rs | 16 ++++ 
src/migration/metadata_migration.rs | 4 +- src/query/stream_schema_provider.rs | 2 +- src/staging/mod.rs | 112 +---------------------- 5 files changed, 116 insertions(+), 118 deletions(-) diff --git a/src/handlers/http/modal/ingest_server.rs b/src/handlers/http/modal/ingest_server.rs index 5e02b9726..bafb58080 100644 --- a/src/handlers/http/modal/ingest_server.rs +++ b/src/handlers/http/modal/ingest_server.rs @@ -23,6 +23,7 @@ use super::server::Server; use super::IngestorMetadata; use super::OpenIdClient; use super::ParseableServer; +use super::DEFAULT_VERSION; use crate::analytics; use crate::handlers::airplane; use crate::handlers::http::ingest; @@ -34,13 +35,14 @@ use crate::metrics; use crate::migration; use crate::migration::metadata_migration::migrate_ingester_metadata; use crate::rbac::role::Action; -use crate::staging; use crate::storage::object_storage::ingestor_metadata_path; use crate::storage::object_storage::parseable_json_path; use crate::storage::ObjectStorageError; use crate::storage::PARSEABLE_ROOT_DIRECTORY; use crate::sync; +use crate::utils::get_ingestor_id; +use crate::utils::get_url; use crate::{handlers::http::base_path, option::CONFIG}; use actix_web::web; use actix_web::web::resource; @@ -53,10 +55,100 @@ use relative_path::RelativePathBuf; use serde_json::Value; use tokio::sync::oneshot; use tracing::error; +use tracing::info; + +/// Metadata associated with this ingestor server +pub static INGESTOR_META: Lazy = Lazy::new(|| { + // all the files should be in the staging directory root + let entries = + std::fs::read_dir(&CONFIG.options.local_staging_path).expect("Couldn't read from file"); + let url = get_url(); + let port = url.port().unwrap_or(80).to_string(); + let url = url.to_string(); + + for entry in entries { + // cause the staging directory will have only one file with ingestor in the name + // so the JSON Parse should not error unless the file is corrupted + let path = entry.expect("Should be a directory entry").path(); + let flag = path + .file_name() + .unwrap_or_default() + .to_str() + .unwrap_or_default() + .contains("ingestor"); + + if flag { + // get the ingestor metadata from staging + let text = std::fs::read(path).expect("File should be present"); + let mut meta: Value = serde_json::from_slice(&text).expect("Valid JSON"); + + // migrate the staging meta + let obj = meta + .as_object_mut() + .expect("Could Not parse Ingestor Metadata Json"); + + if obj.get("flight_port").is_none() { + obj.insert( + "flight_port".to_owned(), + Value::String(CONFIG.options.flight_port.to_string()), + ); + } + + let mut meta: IngestorMetadata = + serde_json::from_value(meta).expect("Couldn't write to disk"); + + // compare url endpoint and port + if meta.domain_name != url { + info!( + "Domain Name was Updated. Old: {} New: {}", + meta.domain_name, url + ); + meta.domain_name = url; + } + + if meta.port != port { + info!("Port was Updated. Old: {} New: {}", meta.port, port); + meta.port = port; + } + + let token = base64::prelude::BASE64_STANDARD.encode(format!( + "{}:{}", + CONFIG.options.username, CONFIG.options.password + )); + + let token = format!("Basic {}", token); + + if meta.token != token { + // TODO: Update the message to be more informative with username and password + info!( + "Credentials were Updated. Old: {} New: {}", + meta.token, token + ); + meta.token = token; + } + + meta.put_on_disk(CONFIG.staging_dir()) + .expect("Couldn't write to disk"); + return meta; + } + } -/// ! 
have to use a guard before using it -pub static INGESTOR_META: Lazy = - Lazy::new(|| staging::get_ingestor_info(&CONFIG).expect("Should Be valid Json")); + let store = CONFIG.storage().get_object_store(); + let out = IngestorMetadata::new( + port, + url, + DEFAULT_VERSION.to_string(), + store.get_bucket_name(), + &CONFIG.options.username, + &CONFIG.options.password, + get_ingestor_id(), + CONFIG.options.flight_port.to_string(), + ); + + out.put_on_disk(CONFIG.staging_dir()) + .expect("Should Be valid Json"); + out +}); pub struct IngestServer; diff --git a/src/handlers/http/modal/mod.rs b/src/handlers/http/modal/mod.rs index 89fc7021e..d36372b2b 100644 --- a/src/handlers/http/modal/mod.rs +++ b/src/handlers/http/modal/mod.rs @@ -24,6 +24,7 @@ pub mod server; pub mod ssl_acceptor; pub mod utils; +use std::path::Path; use std::sync::Arc; use actix_web::middleware::from_fn; @@ -203,6 +204,21 @@ impl IngestorMetadata { pub fn get_ingestor_id(&self) -> String { self.ingestor_id.clone() } + + /// Puts the ingestor info into the staging. + /// + /// This function takes the ingestor info as a parameter and stores it in staging. + /// # Parameters + /// + /// * `staging_path`: Staging root directory. + pub fn put_on_disk(&self, staging_path: &Path) -> anyhow::Result<()> { + let file_name = format!("ingestor.{}.json", self.ingestor_id); + let file_path = staging_path.join(file_name); + + std::fs::write(file_path, serde_json::to_vec(&self)?)?; + + Ok(()) + } } #[cfg(test)] diff --git a/src/migration/metadata_migration.rs b/src/migration/metadata_migration.rs index ce25862ec..5de34a690 100644 --- a/src/migration/metadata_migration.rs +++ b/src/migration/metadata_migration.rs @@ -21,7 +21,7 @@ use rand::distributions::DistString; use serde_json::{json, Map, Value as JsonValue}; use crate::{ - handlers::http::modal::IngestorMetadata, option::CONFIG, staging, + handlers::http::modal::IngestorMetadata, option::CONFIG, storage::object_storage::ingestor_metadata_path, }; @@ -196,7 +196,7 @@ pub async fn migrate_ingester_metadata() -> anyhow::Result = Lazy::new(Streams::default); - -pub fn get_ingestor_info(config: &Config) -> anyhow::Result { - // all the files should be in the staging directory root - let entries = std::fs::read_dir(&config.options.local_staging_path)?; - let url = get_url(); - let port = url.port().unwrap_or(80).to_string(); - let url = url.to_string(); - - for entry in entries { - // cause the staging directory will have only one file with ingestor in the name - // so the JSON Parse should not error unless the file is corrupted - let path = entry?.path(); - let flag = path - .file_name() - .unwrap_or_default() - .to_str() - .unwrap_or_default() - .contains("ingestor"); - - if flag { - // get the ingestor metadata from staging - let mut meta: JsonValue = serde_json::from_slice(&std::fs::read(path)?)?; - - // migrate the staging meta - let obj = meta - .as_object_mut() - .ok_or_else(|| anyhow!("Could Not parse Ingestor Metadata Json"))?; - - if obj.get("flight_port").is_none() { - obj.insert( - "flight_port".to_owned(), - JsonValue::String(config.options.flight_port.to_string()), - ); - } - - let mut meta: IngestorMetadata = serde_json::from_value(meta)?; - - // compare url endpoint and port - if meta.domain_name != url { - info!( - "Domain Name was Updated. Old: {} New: {}", - meta.domain_name, url - ); - meta.domain_name = url; - } - - if meta.port != port { - info!("Port was Updated. 
Old: {} New: {}", meta.port, port); - meta.port = port; - } - - let token = base64::prelude::BASE64_STANDARD.encode(format!( - "{}:{}", - config.options.username, config.options.password - )); - - let token = format!("Basic {}", token); - - if meta.token != token { - // TODO: Update the message to be more informative with username and password - info!( - "Credentials were Updated. Old: {} New: {}", - meta.token, token - ); - meta.token = token; - } - - put_ingestor_info(config, meta.clone())?; - return Ok(meta); - } - } - - let store = config.storage().get_object_store(); - let out = IngestorMetadata::new( - port, - url, - DEFAULT_VERSION.to_string(), - store.get_bucket_name(), - &config.options.username, - &config.options.password, - get_ingestor_id(), - config.options.flight_port.to_string(), - ); - - put_ingestor_info(config, out.clone())?; - Ok(out) -} - -/// Puts the ingestor info into the staging. -/// -/// This function takes the ingestor info as a parameter and stores it in staging. -/// # Parameters -/// -/// * `ingestor_info`: The ingestor info to be stored. -pub fn put_ingestor_info(config: &Config, info: IngestorMetadata) -> anyhow::Result<()> { - let file_name = format!("ingestor.{}.json", info.ingestor_id); - let file_path = config.options.local_staging_path.join(file_name); - - std::fs::write(file_path, serde_json::to_vec(&info)?)?; - - Ok(()) -} From 0676ca5877e195d12543f667c202811d6b70353d Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 29 Jan 2025 17:48:53 +0530 Subject: [PATCH 20/34] refactor: merge error types --- src/event/mod.rs | 4 ++-- src/staging/mod.rs | 10 +++------- src/staging/streams.rs | 10 +++++----- src/staging/writer.rs | 8 -------- 4 files changed, 10 insertions(+), 22 deletions(-) diff --git a/src/event/mod.rs b/src/event/mod.rs index 1dec203c6..c0dadc42a 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -135,13 +135,13 @@ pub mod error { use arrow_schema::ArrowError; use crate::metadata::error::stream_info::MetadataError; - use crate::staging::StreamWriterError; + use crate::staging::StagingError; use crate::storage::ObjectStorageError; #[derive(Debug, thiserror::Error)] pub enum EventError { #[error("Stream Writer Failed: {0}")] - StreamWriter(#[from] StreamWriterError), + StreamWriter(#[from] StagingError), #[error("Metadata Error: {0}")] Metadata(#[from] MetadataError), #[error("Stream Writer Failed: {0}")] diff --git a/src/staging/mod.rs b/src/staging/mod.rs index cc1f9a582..12033c7e6 100644 --- a/src/staging/mod.rs +++ b/src/staging/mod.rs @@ -17,22 +17,18 @@ * */ -use arrow_schema::ArrowError; use once_cell::sync::Lazy; -use parquet::errors::ParquetError; pub use streams::{Stream, Streams}; -use tracing::error; -pub use writer::StreamWriterError; mod streams; mod writer; #[derive(Debug, thiserror::Error)] -pub enum MoveDataError { +pub enum StagingError { #[error("Unable to create recordbatch stream")] - Arrow(#[from] ArrowError), + Arrow(#[from] arrow_schema::ArrowError), #[error("Could not generate parquet file")] - Parquet(#[from] ParquetError), + Parquet(#[from] parquet::errors::ParquetError), #[error("IO Error {0}")] ObjectStorage(#[from] std::io::Error), #[error("Could not generate parquet file")] diff --git a/src/staging/streams.rs b/src/staging/streams.rs index 41f377ded..f6f56e561 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -51,7 +51,7 @@ use crate::{ utils::{arrow::merged_reader::MergedReverseRecordReader, minute_to_slot}, }; -use super::{writer::Writer, MoveDataError, StreamWriterError}; +use 
super::{writer::Writer, StagingError}; const ARROW_FILE_EXTENSION: &str = "data.arrows"; @@ -86,7 +86,7 @@ impl<'a> Stream<'a> { parsed_timestamp: NaiveDateTime, custom_partition_values: &HashMap, stream_type: StreamType, - ) -> Result<(), StreamWriterError> { + ) -> Result<(), StagingError> { let mut guard = self.writer.lock().unwrap(); if self.options.mode != Mode::Query || stream_type == StreamType::Internal { match guard.disk.get_mut(schema_key) { @@ -243,7 +243,7 @@ impl<'a> Stream<'a> { time_partition: Option<&String>, custom_partition: Option<&String>, shutdown_signal: bool, - ) -> Result, MoveDataError> { + ) -> Result, StagingError> { let mut schemas = Vec::new(); let time = chrono::Utc::now().naive_utc(); @@ -294,7 +294,7 @@ impl<'a> Stream<'a> { .create(true) .append(true) .open(&parquet_path) - .map_err(|_| MoveDataError::Create)?; + .map_err(|_| StagingError::Create)?; let mut writer = ArrowWriter::try_new(&parquet_file, schema.clone(), Some(props))?; for ref record in record_reader.merged_iter(schema, time_partition.cloned()) { writer.write(record)?; @@ -527,7 +527,7 @@ mod tests { } #[test] - fn test_convert_to_parquet_with_empty_staging() -> Result<(), MoveDataError> { + fn test_convert_to_parquet_with_empty_staging() -> Result<(), StagingError> { let temp_dir = TempDir::new()?; let options = Options { local_staging_path: temp_dir.path().to_path_buf(), diff --git a/src/staging/writer.rs b/src/staging/writer.rs index e8d3eff9c..d295d6509 100644 --- a/src/staging/writer.rs +++ b/src/staging/writer.rs @@ -31,14 +31,6 @@ use itertools::Itertools; use crate::utils::arrow::adapt_batch; -#[derive(Debug, thiserror::Error)] -pub enum StreamWriterError { - #[error("Arrow writer failed: {0}")] - Writer(#[from] arrow_schema::ArrowError), - #[error("Io Error when creating new file: {0}")] - Io(#[from] std::io::Error), -} - #[derive(Default)] pub struct Writer { pub mem: MemWriter<16384>, From 0298ecdd06140c4d6c9b34cea7fd0f4c6a144833 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 29 Jan 2025 18:43:11 +0530 Subject: [PATCH 21/34] fix: actually close the writers --- src/staging/streams.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/staging/streams.rs b/src/staging/streams.rs index f6f56e561..812058e3f 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -233,7 +233,7 @@ impl<'a> Stream<'a> { fn flush(&self) { let mut writer = self.writer.lock().unwrap(); - for writer in writer.disk.values_mut() { + for (_, mut writer) in writer.disk.drain() { _ = writer.finish(); } } From 8d30f51186235a56cde0134d2352c4f51f178588 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 29 Jan 2025 18:51:00 +0530 Subject: [PATCH 22/34] test: multiple arrow to parquet --- src/staging/streams.rs | 70 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/src/staging/streams.rs b/src/staging/streams.rs index 812058e3f..e77c71e04 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -424,7 +424,9 @@ impl Streams { #[cfg(test)] mod tests { - use chrono::NaiveDate; + use arrow_array::{Int32Array, StringArray, TimestampMillisecondArray}; + use arrow_schema::{DataType, Field, TimeUnit}; + use chrono::{NaiveDate, TimeDelta}; use temp_dir::TempDir; use super::*; @@ -550,4 +552,70 @@ mod tests { assert_eq!(storage_size_parquet, 0); Ok(()) } + + #[test] + fn convert_multiple_arrow_files_to_parquet() { + let temp_dir = TempDir::new().unwrap(); + let stream_name = "test_stream"; + let options = Options { + 
local_staging_path: temp_dir.path().to_path_buf(), + row_group_size: 1048576, + ..Default::default() + }; + let staging: Arc> = Stream::new(&options, stream_name); + + // Create test arrow files + let schema = Schema::new(vec![ + Field::new( + DEFAULT_TIMESTAMP_KEY, + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new("id", DataType::Int32, false), + Field::new("value", DataType::Utf8, false), + ]); + + for i in 0..3 { + let past = Utc::now() + .checked_sub_signed(TimeDelta::minutes(10 - i)) + .unwrap() + .naive_utc(); + let batch = RecordBatch::try_new( + Arc::new(schema.clone()), + vec![ + Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3])), + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ], + ) + .unwrap(); + staging + .push( + "abc", + &batch, + past, + &HashMap::new(), + StreamType::UserDefined, + ) + .unwrap(); + staging.flush(); + } + // verify the arrow files exist in staging + assert_eq!(staging.arrow_files().len(), 3); + drop(staging); + + // Start with a fresh staging + let staging: Arc> = Stream::new(&options, stream_name); + let result = staging + .convert_disk_files_to_parquet(None, None, true) + .unwrap(); + + assert!(result.is_some()); + let result_schema = result.unwrap(); + assert_eq!(result_schema.fields().len(), 3); + + // Verify parquet files were created and the arrow files deleted + assert_eq!(staging.parquet_files().len(), 3); + assert_eq!(staging.arrow_files().len(), 0); + } } From dc634dcffc70bf5080364878a37bd198b4dc31b2 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 29 Jan 2025 19:10:57 +0530 Subject: [PATCH 23/34] refactor: improve code reuse/ readability --- src/utils/mod.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/utils/mod.rs b/src/utils/mod.rs index f9c37e6b3..3cd4fc76c 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -55,6 +55,9 @@ pub fn minute_to_slot(minute: u32, data_granularity: u32) -> Option { Some(format!("{block_start:02}-{block_end:02}")) } +type Prefix = String; + +/// Representation of a time period using which files can be retreived from object storage pub struct TimePeriod { start: DateTime, end: DateTime, @@ -70,7 +73,7 @@ impl TimePeriod { } } - pub fn generate_prefixes(&self) -> Vec { + pub fn generate_prefixes(self) -> Vec { let mut prefixes = vec![]; self.generate_date_prefixes(&mut prefixes); @@ -101,7 +104,7 @@ impl TimePeriod { return; } - let push_prefix = |block: u32, prefixes: &mut Vec<_>| { + let mut push_prefix = |block: u32| { if let Some(minute_slot) = minute_to_slot(block * self.data_granularity, self.data_granularity) { @@ -111,13 +114,13 @@ impl TimePeriod { }; for block in start_block..end_block { - push_prefix(block, prefixes); + push_prefix(block); } // NOTE: for block sizes larger than a minute ensure // ensure last block is considered if self.data_granularity > 1 { - push_prefix(end_block, prefixes); + push_prefix(end_block); } } From 589d6e2b495c7b66ab288abf551c32cf13f5f85f Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 29 Jan 2025 21:09:46 +0530 Subject: [PATCH 24/34] refactor + doc: `generate_prefix` working --- src/correlation.rs | 7 +- src/utils/mod.rs | 205 ++++++++++++++++++++++++++++----------------- 2 files changed, 130 insertions(+), 82 deletions(-) diff --git a/src/correlation.rs b/src/correlation.rs index 7ffde6a8a..22974ee95 100644 --- a/src/correlation.rs +++ b/src/correlation.rs @@ -129,10 +129,9 @@ impl Correlations { .await?; // Update in memory - 
self.write().await.insert( - correlation.id.to_owned(), - correlation.clone(), - ); + self.write() + .await + .insert(correlation.id.to_owned(), correlation.clone()); Ok(correlation) } diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 3cd4fc76c..55f3d58b5 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -31,7 +31,7 @@ use crate::rbac::role::{Action, Permission}; use crate::rbac::Users; use actix::extract_session_key_from_req; use actix_web::HttpRequest; -use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Timelike, Utc}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, TimeDelta, Timelike, Utc}; use regex::Regex; use sha2::{Digest, Sha256}; use std::env; @@ -57,6 +57,22 @@ pub fn minute_to_slot(minute: u32, data_granularity: u32) -> Option { type Prefix = String; +#[derive(Clone, Copy)] +struct TimeBounds { + start_date: NaiveDate, + start_hour: u32, + start_minute: u32, + end_date: NaiveDate, + end_hour: u32, + end_minute: u32, +} + +impl TimeBounds { + fn spans_full_day(&self) -> bool { + self.end_hour - self.start_hour >= 24 + } +} + /// Representation of a time period using which files can be retreived from object storage pub struct TimePeriod { start: DateTime, @@ -73,42 +89,127 @@ impl TimePeriod { } } + /// Generates prefixes for the time period, e.g: + /// 1. ("2022-06-11T23:00:01+00:00", "2022-06-12T01:59:59+00:00") => ["date=2022-06-11/hour=23/", "date=2022-06-12/hour=00/", "date=2022-06-12/hour=01/""] + /// 2. ("2022-06-11T15:59:00+00:00", "2022-06-11T17:01:00+00:00") => ["date=2022-06-11/hour=15/minute=59/", "date=2022-06-11/hour=16/", "date=2022-06-11/hour=17/minute=00/"] pub fn generate_prefixes(self) -> Vec { let mut prefixes = vec![]; - self.generate_date_prefixes(&mut prefixes); + let time_bounds = self.calculate_time_bounds(); + let mut current_date = time_bounds.start_date; + + while current_date <= time_bounds.end_date { + self.process_date(current_date, time_bounds, &mut prefixes); + current_date += TimeDelta::days(1); + } prefixes } - fn generate_minute_prefixes( + fn calculate_time_bounds(&self) -> TimeBounds { + TimeBounds { + start_date: self.start.date_naive(), + start_hour: self.start.hour(), + start_minute: self.start.minute(), + end_date: self.end.date_naive(), + end_hour: self.end.hour(), + end_minute: self.end.minute() + u32::from(self.end.second() > 0), + } + } + + fn process_date(&self, date: NaiveDate, bounds: TimeBounds, prefixes: &mut Vec) { + let prefix = format!("date={date}/"); + let is_start = date == bounds.start_date; + let is_end = date == bounds.end_date; + + if !is_start && !is_end { + prefixes.push(prefix); + return; + } + + let time_bounds = self.get_time_bounds(is_start, is_end, bounds); + if time_bounds.spans_full_day() { + prefixes.push(prefix); + return; + } + + self.process_hours(prefix, time_bounds, prefixes); + } + + fn process_hours( &self, + date_prefix: String, + time_bounds: TimeBounds, prefixes: &mut Vec, - prefix: &str, - start_minute: u32, - end_minute: u32, ) { - if start_minute == end_minute { + for hour in time_bounds.start_hour..=time_bounds.end_hour { + if hour == 24 { + break; + } + + let hour_prefix = format!("{date_prefix}hour={hour:02}/"); + let is_start_hour = hour == time_bounds.start_hour; + let is_end_hour = hour == time_bounds.end_hour; + + if !is_start_hour && !is_end_hour { + prefixes.push(hour_prefix); + continue; + } + + self.process_minutes( + hour_prefix, + is_start_hour, + is_end_hour, + time_bounds, + prefixes, + ); + } + } + + fn process_minutes( + &self, + hour_prefix: 
String, + is_start_hour: bool, + is_end_hour: bool, + mut time_bounds: TimeBounds, + prefixes: &mut Vec, + ) { + if !is_start_hour { + time_bounds.start_minute = 0; + } + if !is_end_hour { + time_bounds.end_minute = 60; + }; + + if time_bounds.start_minute == time_bounds.end_minute { return; } let (start_block, end_block) = ( - start_minute / self.data_granularity, - end_minute / self.data_granularity, + time_bounds.start_minute / self.data_granularity, + time_bounds.end_minute / self.data_granularity, ); let forbidden_block = 60 / self.data_granularity; - - // ensure both start and end are within the same hour, else return prefix as is if end_block - start_block >= forbidden_block { - prefixes.push(prefix.to_owned()); + prefixes.push(hour_prefix); return; } + self.generate_minute_prefixes(hour_prefix, start_block, end_block, prefixes); + } + + fn generate_minute_prefixes( + &self, + hour_prefix: String, + start_block: u32, + end_block: u32, + prefixes: &mut Vec, + ) { let mut push_prefix = |block: u32| { if let Some(minute_slot) = minute_to_slot(block * self.data_granularity, self.data_granularity) { - let prefix = prefix.to_owned() + &format!("minute={minute_slot}/",); + let prefix = format!("{hour_prefix}minute={minute_slot}/"); prefixes.push(prefix); } }; @@ -117,80 +218,28 @@ impl TimePeriod { push_prefix(block); } - // NOTE: for block sizes larger than a minute ensure - // ensure last block is considered + // Handle last block for granularity > 1 if self.data_granularity > 1 { push_prefix(end_block); } } - fn generate_hour_prefixes( + fn get_time_bounds( &self, - prefixes: &mut Vec, - prefix: &str, - start_hour: u32, - start_minute: u32, - end_hour: u32, - end_minute: u32, - ) { - // ensure both start and end are within the same day - if end_hour - start_hour >= 24 { - prefixes.push(prefix.to_owned()); - return; - } - - for hour in start_hour..=end_hour { - if hour == 24 { - break; - } - let prefix = prefix.to_owned() + &format!("hour={hour:02}/"); - let is_start = hour == start_hour; - let is_end = hour == end_hour; - - if is_start || is_end { - self.generate_minute_prefixes( - prefixes, - &prefix, - if is_start { start_minute } else { 0 }, - if is_end { end_minute } else { 60 }, - ); - } else { - prefixes.push(prefix); - } + is_start: bool, + is_end: bool, + mut time_bounds: TimeBounds, + ) -> TimeBounds { + if !is_start { + time_bounds.start_hour = 0; + time_bounds.start_minute = 0; } - } - fn generate_date_prefixes(&self, prefixes: &mut Vec) { - let end_minute = self.end.minute() + u32::from(self.end.second() > 0); - let start_date = self.start.date_naive(); - let end_date = self.end.date_naive(); - let start_time = (self.start.hour(), self.start.minute()); - let end_time = (self.end.hour(), end_minute); - let mut date = start_date; - - while date <= end_date { - let prefix = format!("date={date}/"); - let is_start = date == start_date; - let is_end = date == end_date; - - if is_start || is_end { - let ((start_hour, start_minute), (end_hour, end_minute)) = ( - if is_start { start_time } else { (0, 0) }, - if is_end { end_time } else { (24, 60) }, - ); - self.generate_hour_prefixes( - prefixes, - &prefix, - start_hour, - start_minute, - end_hour, - end_minute, - ); - } else { - prefixes.push(prefix); - } - date = date.succ_opt().unwrap(); + if !is_end { + time_bounds.end_hour = 24; + time_bounds.end_minute = 60; } + time_bounds } } From 43fa624dd3f05723d437ba39b6047f749e57546f Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 30 Jan 2025 01:55:26 +0530 Subject: [PATCH 
25/34] revert timeperiod changes --- src/utils/mod.rs | 249 +++++++++++++++++++++-------------------------- 1 file changed, 112 insertions(+), 137 deletions(-) diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 55f3d58b5..14fd4bcd7 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -31,7 +31,7 @@ use crate::rbac::role::{Action, Permission}; use crate::rbac::Users; use actix::extract_session_key_from_req; use actix_web::HttpRequest; -use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, TimeDelta, Timelike, Utc}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Timelike, Utc}; use regex::Regex; use sha2::{Digest, Sha256}; use std::env; @@ -55,25 +55,22 @@ pub fn minute_to_slot(minute: u32, data_granularity: u32) -> Option { Some(format!("{block_start:02}-{block_end:02}")) } -type Prefix = String; +pub fn date_to_prefix(date: NaiveDate) -> String { + let date = format!("date={date}/"); + date.replace("UTC", "") +} -#[derive(Clone, Copy)] -struct TimeBounds { - start_date: NaiveDate, - start_hour: u32, - start_minute: u32, - end_date: NaiveDate, - end_hour: u32, - end_minute: u32, +pub fn hour_to_prefix(hour: u32) -> String { + format!("hour={hour:02}/") } -impl TimeBounds { - fn spans_full_day(&self) -> bool { - self.end_hour - self.start_hour >= 24 - } +pub fn minute_to_prefix(minute: u32, data_granularity: u32) -> Option { + Some(format!( + "minute={}/", + minute_to_slot(minute, data_granularity)? + )) } -/// Representation of a time period using which files can be retreived from object storage pub struct TimePeriod { start: DateTime, end: DateTime, @@ -89,157 +86,135 @@ impl TimePeriod { } } - /// Generates prefixes for the time period, e.g: - /// 1. ("2022-06-11T23:00:01+00:00", "2022-06-12T01:59:59+00:00") => ["date=2022-06-11/hour=23/", "date=2022-06-12/hour=00/", "date=2022-06-12/hour=01/""] - /// 2. 
("2022-06-11T15:59:00+00:00", "2022-06-11T17:01:00+00:00") => ["date=2022-06-11/hour=15/minute=59/", "date=2022-06-11/hour=16/", "date=2022-06-11/hour=17/minute=00/"] - pub fn generate_prefixes(self) -> Vec { - let mut prefixes = vec![]; - let time_bounds = self.calculate_time_bounds(); - let mut current_date = time_bounds.start_date; - - while current_date <= time_bounds.end_date { - self.process_date(current_date, time_bounds, &mut prefixes); - current_date += TimeDelta::days(1); - } - - prefixes - } - - fn calculate_time_bounds(&self) -> TimeBounds { - TimeBounds { - start_date: self.start.date_naive(), - start_hour: self.start.hour(), - start_minute: self.start.minute(), - end_date: self.end.date_naive(), - end_hour: self.end.hour(), - end_minute: self.end.minute() + u32::from(self.end.second() > 0), - } - } - - fn process_date(&self, date: NaiveDate, bounds: TimeBounds, prefixes: &mut Vec) { - let prefix = format!("date={date}/"); - let is_start = date == bounds.start_date; - let is_end = date == bounds.end_date; - - if !is_start && !is_end { - prefixes.push(prefix); - return; - } - - let time_bounds = self.get_time_bounds(is_start, is_end, bounds); - if time_bounds.spans_full_day() { - prefixes.push(prefix); - return; - } - - self.process_hours(prefix, time_bounds, prefixes); - } - - fn process_hours( - &self, - date_prefix: String, - time_bounds: TimeBounds, - prefixes: &mut Vec, - ) { - for hour in time_bounds.start_hour..=time_bounds.end_hour { - if hour == 24 { - break; - } - - let hour_prefix = format!("{date_prefix}hour={hour:02}/"); - let is_start_hour = hour == time_bounds.start_hour; - let is_end_hour = hour == time_bounds.end_hour; - - if !is_start_hour && !is_end_hour { - prefixes.push(hour_prefix); - continue; - } - - self.process_minutes( - hour_prefix, - is_start_hour, - is_end_hour, - time_bounds, - prefixes, - ); - } + pub fn generate_prefixes(&self) -> Vec { + let end_minute = self.end.minute() + u32::from(self.end.second() > 0); + self.generate_date_prefixes( + self.start.date_naive(), + self.end.date_naive(), + (self.start.hour(), self.start.minute()), + (self.end.hour(), end_minute), + ) } - fn process_minutes( + pub fn generate_minute_prefixes( &self, - hour_prefix: String, - is_start_hour: bool, - is_end_hour: bool, - mut time_bounds: TimeBounds, - prefixes: &mut Vec, - ) { - if !is_start_hour { - time_bounds.start_minute = 0; - } - if !is_end_hour { - time_bounds.end_minute = 60; - }; - - if time_bounds.start_minute == time_bounds.end_minute { - return; + prefix: &str, + start_minute: u32, + end_minute: u32, + ) -> Vec { + if start_minute == end_minute { + return vec![]; } let (start_block, end_block) = ( - time_bounds.start_minute / self.data_granularity, - time_bounds.end_minute / self.data_granularity, + start_minute / self.data_granularity, + end_minute / self.data_granularity, ); let forbidden_block = 60 / self.data_granularity; + + // ensure both start and end are within the same hour, else return prefix as is if end_block - start_block >= forbidden_block { - prefixes.push(hour_prefix); - return; + return vec![prefix.to_owned()]; } - self.generate_minute_prefixes(hour_prefix, start_block, end_block, prefixes); - } + let mut prefixes = vec![]; - fn generate_minute_prefixes( - &self, - hour_prefix: String, - start_block: u32, - end_block: u32, - prefixes: &mut Vec, - ) { - let mut push_prefix = |block: u32| { - if let Some(minute_slot) = - minute_to_slot(block * self.data_granularity, self.data_granularity) + let push_prefix = |block: u32, prefixes: &mut 
Vec<_>| { + if let Some(minute_prefix) = + minute_to_prefix(block * self.data_granularity, self.data_granularity) { - let prefix = format!("{hour_prefix}minute={minute_slot}/"); + let prefix = prefix.to_owned() + &minute_prefix; prefixes.push(prefix); } }; for block in start_block..end_block { - push_prefix(block); + push_prefix(block, &mut prefixes); } - // Handle last block for granularity > 1 + // NOTE: for block sizes larger than a minute ensure + // ensure last block is considered if self.data_granularity > 1 { - push_prefix(end_block); + push_prefix(end_block, &mut prefixes); } + + prefixes } - fn get_time_bounds( + pub fn generate_hour_prefixes( &self, - is_start: bool, - is_end: bool, - mut time_bounds: TimeBounds, - ) -> TimeBounds { - if !is_start { - time_bounds.start_hour = 0; - time_bounds.start_minute = 0; + prefix: &str, + start_hour: u32, + start_minute: u32, + end_hour: u32, + end_minute: u32, + ) -> Vec { + // ensure both start and end are within the same day + if end_hour - start_hour >= 24 { + return vec![prefix.to_owned()]; + } + + let mut prefixes = vec![]; + + for hour in start_hour..=end_hour { + if hour == 24 { + break; + } + let prefix = prefix.to_owned() + &hour_to_prefix(hour); + let is_start = hour == start_hour; + let is_end = hour == end_hour; + + if is_start || is_end { + let minute_prefixes = self.generate_minute_prefixes( + &prefix, + if is_start { start_minute } else { 0 }, + if is_end { end_minute } else { 60 }, + ); + prefixes.extend(minute_prefixes); + } else { + prefixes.push(prefix); + } } - if !is_end { - time_bounds.end_hour = 24; - time_bounds.end_minute = 60; + prefixes + } + + pub fn generate_date_prefixes( + &self, + start_date: NaiveDate, + end_date: NaiveDate, + start_time: (u32, u32), + end_time: (u32, u32), + ) -> Vec { + let mut prefixes = vec![]; + let mut date = start_date; + + while date <= end_date { + let prefix = date_to_prefix(date); + let is_start = date == start_date; + let is_end = date == end_date; + + if is_start || is_end { + let ((start_hour, start_minute), (end_hour, end_minute)) = ( + if is_start { start_time } else { (0, 0) }, + if is_end { end_time } else { (24, 60) }, + ); + let hour_prefixes = self.generate_hour_prefixes( + &prefix, + start_hour, + start_minute, + end_hour, + end_minute, + ); + prefixes.extend(hour_prefixes); + } else { + prefixes.push(prefix); + } + date = date.succ_opt().unwrap(); } - time_bounds + + prefixes } } From 321d08b2aabcf401790bb0a99c21a000fc0abbd9 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 30 Jan 2025 14:56:32 +0530 Subject: [PATCH 26/34] fix: ensure memory is also flushed --- src/staging/streams.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/staging/streams.rs b/src/staging/streams.rs index e77c71e04..afb6cf222 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -232,8 +232,16 @@ impl<'a> Stream<'a> { } fn flush(&self) { - let mut writer = self.writer.lock().unwrap(); - for (_, mut writer) in writer.disk.drain() { + let mut disk_writers = { + let mut writer = self.writer.lock().unwrap(); + // Flush memory + writer.mem.clear(); + // Take schema -> disk writer mapping + std::mem::take(&mut writer.disk) + }; + + // Flush disk + for writer in disk_writers.values_mut() { _ = writer.finish(); } } From 407d4f3a56306340e4804ed1e229a82ca8755224 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 30 Jan 2025 15:00:27 +0530 Subject: [PATCH 27/34] refactor: vec already knows the length --- src/staging/writer.rs | 11 +++-------- 
1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/staging/writer.rs b/src/staging/writer.rs index d295d6509..b3b01fdbe 100644 --- a/src/staging/writer.rs +++ b/src/staging/writer.rs @@ -79,12 +79,11 @@ impl MemWriter { self.schema_map.clear(); self.read_buffer.clear(); self.mutable_buffer.inner.clear(); - self.mutable_buffer.rows = 0; } pub fn recordbatch_cloned(&self, schema: &Arc) -> Vec { let mut read_buffer = self.read_buffer.clone(); - if self.mutable_buffer.rows > 0 { + if self.mutable_buffer.inner.len() > 0 { let rb = concat_records(schema, &self.mutable_buffer.inner); read_buffer.push(rb) } @@ -105,13 +104,12 @@ fn concat_records(schema: &Arc, record: &[RecordBatch]) -> RecordBatch { #[derive(Debug, Default)] pub struct MutableBuffer { pub inner: Vec, - pub rows: usize, } impl MutableBuffer { fn push(&mut self, rb: &RecordBatch) -> Option> { - if self.rows + rb.num_rows() >= N { - let left = N - self.rows; + if self.inner.len() + rb.num_rows() >= N { + let left = N - self.inner.len(); let right = rb.num_rows() - left; let left_slice = rb.slice(0, left); let right_slice = if left < rb.num_rows() { @@ -123,16 +121,13 @@ impl MutableBuffer { // take all records let src = Vec::with_capacity(self.inner.len()); let inner = std::mem::replace(&mut self.inner, src); - self.rows = 0; if let Some(right_slice) = right_slice { - self.rows = right_slice.num_rows(); self.inner.push(right_slice); } Some(inner) } else { - self.rows += rb.num_rows(); self.inner.push(rb.clone()); None } From d20b66b5e58034a70ebbd4d74dfed97465453b4a Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 30 Jan 2025 15:07:34 +0530 Subject: [PATCH 28/34] ci: lint fix --- src/staging/writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/staging/writer.rs b/src/staging/writer.rs index b3b01fdbe..c43252f14 100644 --- a/src/staging/writer.rs +++ b/src/staging/writer.rs @@ -83,7 +83,7 @@ impl MemWriter { pub fn recordbatch_cloned(&self, schema: &Arc) -> Vec { let mut read_buffer = self.read_buffer.clone(); - if self.mutable_buffer.inner.len() > 0 { + if !self.mutable_buffer.inner.is_empty() { let rb = concat_records(schema, &self.mutable_buffer.inner); read_buffer.push(rb) } From 42d2013216e816d4a583e84e26dfdc267f9bf8e6 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 30 Jan 2025 17:41:14 +0530 Subject: [PATCH 29/34] refactor: update schema from storage --- src/metadata.rs | 20 ++++---------------- src/staging/streams.rs | 17 ++++++++++++++++- src/utils/arrow/mod.rs | 1 - 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/metadata.rs b/src/metadata.rs index 1d17c9da6..b0722bf61 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -36,11 +36,9 @@ use crate::metrics::{ EVENTS_INGESTED_SIZE_DATE, EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_INGESTED, LIFETIME_EVENTS_INGESTED_SIZE, }; -use crate::option::CONFIG; -use crate::staging::Stream; +use crate::staging::STAGING; use crate::storage::retention::Retention; use crate::storage::{ObjectStorage, ObjectStoreFormat, StreamType}; -use crate::utils::arrow::MergedRecordReader; use derive_more::{Deref, DerefMut}; // TODO: make return type be of 'static lifetime instead of cloning @@ -403,18 +401,6 @@ impl StreamInfo { } } -fn update_schema_from_staging(stream_name: &str, current_schema: Schema) -> Schema { - let staging_files = Stream::new(&CONFIG.options, stream_name).arrow_files(); - let record_reader = MergedRecordReader::try_new(&staging_files).unwrap(); - if record_reader.readers.is_empty() { - return 
current_schema; - } - - let schema = record_reader.merged_schema(); - - Schema::try_merge(vec![schema, current_schema]).unwrap() -} - ///this function updates the data type of time partition field /// from utf-8 to timestamp if it is not already timestamp /// and updates the schema in the storage @@ -486,7 +472,9 @@ pub async fn load_stream_metadata_on_server_start( load_daily_metrics(&snapshot.manifest_list, stream_name); let alerts = storage.get_alerts(stream_name).await?; - let schema = update_schema_from_staging(stream_name, schema); + let schema = STAGING + .get_or_create_stream(stream_name) + .updated_schema(schema); let schema = HashMap::from_iter( schema .fields diff --git a/src/staging/streams.rs b/src/staging/streams.rs index afb6cf222..fb9094687 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -48,7 +48,10 @@ use crate::{ metrics, option::{Mode, CONFIG}, storage::{StreamType, OBJECT_STORE_DATA_GRANULARITY}, - utils::{arrow::merged_reader::MergedReverseRecordReader, minute_to_slot}, + utils::{ + arrow::merged_reader::{MergedRecordReader, MergedReverseRecordReader}, + minute_to_slot, + }, }; use super::{writer::Writer, StagingError}; @@ -337,6 +340,18 @@ impl<'a> Stream<'a> { Ok(Some(Schema::try_merge(schemas).unwrap())) } + + pub fn updated_schema(&self, current_schema: Schema) -> Schema { + let staging_files = self.arrow_files(); + let record_reader = MergedRecordReader::try_new(&staging_files).unwrap(); + if record_reader.readers.is_empty() { + return current_schema; + } + + let schema = record_reader.merged_schema(); + + Schema::try_merge(vec![schema, current_schema]).unwrap() + } } fn parquet_writer_props( diff --git a/src/utils/arrow/mod.rs b/src/utils/arrow/mod.rs index 2cbdbf0a5..908358fd9 100644 --- a/src/utils/arrow/mod.rs +++ b/src/utils/arrow/mod.rs @@ -54,7 +54,6 @@ pub mod reverse_reader; use anyhow::Result; pub use batch_adapter::adapt_batch; -pub use merged_reader::MergedRecordReader; use serde_json::{Map, Value}; /// Replaces columns in a record batch with new arrays. 
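
As a quick illustration of how the reworked staging API reads after this change, here is a minimal sketch of a caller that wants the stored schema merged with whatever is still staged on disk. The helper name, stream name and call site are assumptions for illustration only, not taken from the patches:

    use arrow_schema::Schema;
    use crate::staging::STAGING;

    /// Hypothetical helper: merge a stream's stored schema with the schema
    /// of any arrow files still sitting in its staging directory.
    fn schema_with_staged_fields(stream_name: &str, stored: Schema) -> Schema {
        // Creates the staging handle on first use, reuses it afterwards.
        let staging = STAGING.get_or_create_stream(stream_name);
        // `updated_schema` merges the staged arrow files' schema into `stored`.
        staging.updated_schema(stored)
    }

The same handle also exposes `push`, `flush` and `convert_disk_files_to_parquet`, so callers no longer need to pass the stream name around separately.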
From 8b874bcc95efb06b0aa998c78c2e60399482b5b8 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 30 Jan 2025 17:42:58 +0530 Subject: [PATCH 30/34] refactor: reader is not a util --- src/query/stream_schema_provider.rs | 2 +- src/staging/mod.rs | 1 + .../reverse_reader.rs => staging/reader.rs} | 145 ++++++++++++++-- src/staging/streams.rs | 11 +- src/utils/arrow/merged_reader.rs | 158 ------------------ src/utils/arrow/mod.rs | 15 +- 6 files changed, 153 insertions(+), 179 deletions(-) rename src/{utils/arrow/reverse_reader.rs => staging/reader.rs} (69%) delete mode 100644 src/utils/arrow/merged_reader.rs diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index 24b18f6c1..ae1593f16 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -581,7 +581,7 @@ fn reversed_mem_table( records[..].reverse(); records .iter_mut() - .for_each(|batch| *batch = crate::utils::arrow::reverse_reader::reverse(batch)); + .for_each(|batch| *batch = crate::utils::arrow::reverse(batch)); MemTable::try_new(schema, vec![records]) } diff --git a/src/staging/mod.rs b/src/staging/mod.rs index 12033c7e6..72d30640d 100644 --- a/src/staging/mod.rs +++ b/src/staging/mod.rs @@ -20,6 +20,7 @@ use once_cell::sync::Lazy; pub use streams::{Stream, Streams}; +mod reader; mod streams; mod writer; diff --git a/src/utils/arrow/reverse_reader.rs b/src/staging/reader.rs similarity index 69% rename from src/utils/arrow/reverse_reader.rs rename to src/staging/reader.rs index bce5cd695..6df0dc324 100644 --- a/src/utils/arrow/reverse_reader.rs +++ b/src/staging/reader.rs @@ -14,17 +14,148 @@ * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . * + * */ use std::{ + fs::{remove_file, File}, io::{self, BufReader, Read, Seek, SeekFrom}, + path::PathBuf, + sync::Arc, vec::IntoIter, }; -use arrow_array::{RecordBatch, UInt64Array}; +use arrow_array::{RecordBatch, TimestampMillisecondArray}; use arrow_ipc::{reader::StreamReader, root_as_message_unchecked, MessageHeader}; -use arrow_select::take::take; +use arrow_schema::Schema; use byteorder::{LittleEndian, ReadBytesExt}; +use itertools::kmerge_by; +use tracing::error; + +use crate::{ + event::DEFAULT_TIMESTAMP_KEY, + utils::arrow::{adapt_batch, reverse}, +}; + +#[derive(Debug)] +pub struct MergedRecordReader { + pub readers: Vec>>, +} + +impl MergedRecordReader { + pub fn try_new(files: &[PathBuf]) -> Result { + let mut readers = Vec::with_capacity(files.len()); + + for file in files { + //remove empty files before reading + if file.metadata().unwrap().len() == 0 { + error!("Invalid file detected, removing it: {:?}", file); + remove_file(file).unwrap(); + } else { + let Ok(reader) = + StreamReader::try_new(BufReader::new(File::open(file).unwrap()), None) + else { + error!("Invalid file detected, ignoring it: {:?}", file); + continue; + }; + + readers.push(reader); + } + } + + Ok(Self { readers }) + } + + pub fn merged_schema(&self) -> Schema { + Schema::try_merge( + self.readers + .iter() + .map(|reader| reader.schema().as_ref().clone()), + ) + .unwrap() + } +} + +#[derive(Debug)] +pub struct MergedReverseRecordReader { + pub readers: Vec>>>, +} + +impl MergedReverseRecordReader { + pub fn try_new(files: &[PathBuf]) -> Self { + let mut readers = Vec::with_capacity(files.len()); + for file in files { + let Ok(reader) = get_reverse_reader(File::open(file).unwrap()) else { + error!("Invalid file detected, ignoring it: {:?}", file); + continue; + }; + + 
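            // Unreadable arrow files are only skipped here (with an error log);
            // unlike MergedRecordReader::try_new above, nothing is deleted from
            // staging. The readers that survive are combined by `merged_iter`,
            // which k-merges batches across files by their first timestamp and
            // reverses each batch, producing the merged stream in reverse
            // chronological order before adapting it to the caller's schema.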
readers.push(reader); + } + + Self { readers } + } + + pub fn merged_iter( + self, + schema: Arc, + time_partition: Option, + ) -> impl Iterator { + let adapted_readers = self.readers.into_iter().map(|reader| reader.flatten()); + kmerge_by(adapted_readers, move |a: &RecordBatch, b: &RecordBatch| { + // Capture time_partition by value + let a_time = get_timestamp_millis(a, time_partition.clone()); + let b_time = get_timestamp_millis(b, time_partition.clone()); + a_time > b_time + }) + .map(|batch| reverse(&batch)) + .map(move |batch| adapt_batch(&schema, &batch)) + } + + pub fn merged_schema(&self) -> Schema { + Schema::try_merge( + self.readers + .iter() + .map(|reader| reader.schema().as_ref().clone()), + ) + .unwrap() + } +} + +fn get_timestamp_millis(batch: &RecordBatch, time_partition: Option) -> i64 { + match time_partition { + Some(time_partition) => { + let time_partition = time_partition.as_str(); + match batch.column_by_name(time_partition) { + Some(column) => column + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + None => get_default_timestamp_millis(batch), + } + } + None => get_default_timestamp_millis(batch), + } +} +fn get_default_timestamp_millis(batch: &RecordBatch) -> i64 { + match batch + .column(0) + .as_any() + .downcast_ref::() + { + // Ideally we expect the first column to be a timestamp (because we add the timestamp column first in the writer) + Some(array) => array.value(0), + // In case the first column is not a timestamp, we fallback to look for default timestamp column across all columns + None => batch + .column_by_name(DEFAULT_TIMESTAMP_KEY) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + } +} /// OffsetReader takes in a reader and list of offset and sizes and /// provides a reader over the file by reading only the offsets @@ -142,16 +273,6 @@ pub fn get_reverse_reader( Ok(StreamReader::try_new(BufReader::new(OffsetReader::new(reader, messages)), None).unwrap()) } -pub fn reverse(rb: &RecordBatch) -> RecordBatch { - let indices = UInt64Array::from_iter_values((0..rb.num_rows()).rev().map(|x| x as u64)); - let arrays = rb - .columns() - .iter() - .map(|col| take(&col, &indices, None).unwrap()) - .collect(); - RecordBatch::try_new(rb.schema(), arrays).unwrap() -} - // return limit for fn find_limit_and_type( reader: &mut (impl Read + Seek), diff --git a/src/staging/streams.rs b/src/staging/streams.rs index fb9094687..2714388cd 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -48,13 +48,14 @@ use crate::{ metrics, option::{Mode, CONFIG}, storage::{StreamType, OBJECT_STORE_DATA_GRANULARITY}, - utils::{ - arrow::merged_reader::{MergedRecordReader, MergedReverseRecordReader}, - minute_to_slot, - }, + utils::minute_to_slot, }; -use super::{writer::Writer, StagingError}; +use super::{ + reader::{MergedRecordReader, MergedReverseRecordReader}, + writer::Writer, + StagingError, +}; const ARROW_FILE_EXTENSION: &str = "data.arrows"; diff --git a/src/utils/arrow/merged_reader.rs b/src/utils/arrow/merged_reader.rs deleted file mode 100644 index 32a2edb40..000000000 --- a/src/utils/arrow/merged_reader.rs +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Parseable Server (C) 2022 - 2024 Parseable, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * - */ - -use arrow_array::{RecordBatch, TimestampMillisecondArray}; -use arrow_ipc::reader::StreamReader; -use arrow_schema::Schema; -use itertools::kmerge_by; -use std::{ - fs::{self, File}, - io::BufReader, - path::PathBuf, - sync::Arc, -}; -use tracing::error; - -use super::{ - adapt_batch, - reverse_reader::{reverse, OffsetReader}, -}; -use crate::{event::DEFAULT_TIMESTAMP_KEY, utils}; - -#[derive(Debug)] -pub struct MergedRecordReader { - pub readers: Vec>>, -} - -impl MergedRecordReader { - pub fn try_new(files: &[PathBuf]) -> Result { - let mut readers = Vec::with_capacity(files.len()); - - for file in files { - //remove empty files before reading - if file.metadata().unwrap().len() == 0 { - error!("Invalid file detected, removing it: {:?}", file); - fs::remove_file(file).unwrap(); - } else { - let Ok(reader) = - StreamReader::try_new(BufReader::new(File::open(file).unwrap()), None) - else { - error!("Invalid file detected, ignoring it: {:?}", file); - continue; - }; - - readers.push(reader); - } - } - - Ok(Self { readers }) - } - - pub fn merged_schema(&self) -> Schema { - Schema::try_merge( - self.readers - .iter() - .map(|reader| reader.schema().as_ref().clone()), - ) - .unwrap() - } -} - -#[derive(Debug)] -pub struct MergedReverseRecordReader { - pub readers: Vec>>>, -} - -impl MergedReverseRecordReader { - pub fn try_new(files: &[PathBuf]) -> Self { - let mut readers = Vec::with_capacity(files.len()); - for file in files { - let Ok(reader) = - utils::arrow::reverse_reader::get_reverse_reader(File::open(file).unwrap()) - else { - error!("Invalid file detected, ignoring it: {:?}", file); - continue; - }; - - readers.push(reader); - } - - Self { readers } - } - - pub fn merged_iter( - self, - schema: Arc, - time_partition: Option, - ) -> impl Iterator { - let adapted_readers = self.readers.into_iter().map(|reader| reader.flatten()); - kmerge_by(adapted_readers, move |a: &RecordBatch, b: &RecordBatch| { - // Capture time_partition by value - let a_time = get_timestamp_millis(a, time_partition.clone()); - let b_time = get_timestamp_millis(b, time_partition.clone()); - a_time > b_time - }) - .map(|batch| reverse(&batch)) - .map(move |batch| adapt_batch(&schema, &batch)) - } - - pub fn merged_schema(&self) -> Schema { - Schema::try_merge( - self.readers - .iter() - .map(|reader| reader.schema().as_ref().clone()), - ) - .unwrap() - } -} - -fn get_timestamp_millis(batch: &RecordBatch, time_partition: Option) -> i64 { - match time_partition { - Some(time_partition) => { - let time_partition = time_partition.as_str(); - match batch.column_by_name(time_partition) { - Some(column) => column - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - None => get_default_timestamp_millis(batch), - } - } - None => get_default_timestamp_millis(batch), - } -} -fn get_default_timestamp_millis(batch: &RecordBatch) -> i64 { - match batch - .column(0) - .as_any() - .downcast_ref::() - { - // Ideally we expect the first column to be a timestamp (because we add the timestamp column first in the writer) - Some(array) => array.value(0), - // In case the first column is not a timestamp, we fallback to look for default 
timestamp column across all columns - None => batch - .column_by_name(DEFAULT_TIMESTAMP_KEY) - .unwrap() - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - } -} diff --git a/src/utils/arrow/mod.rs b/src/utils/arrow/mod.rs index 908358fd9..3cdc5193c 100644 --- a/src/utils/arrow/mod.rs +++ b/src/utils/arrow/mod.rs @@ -42,15 +42,14 @@ use std::sync::Arc; -use arrow_array::{Array, RecordBatch, TimestampMillisecondArray}; +use arrow_array::{Array, RecordBatch, TimestampMillisecondArray, UInt64Array}; use arrow_schema::Schema; +use arrow_select::take::take; use chrono::Utc; use itertools::Itertools; pub mod batch_adapter; pub mod flight; -pub mod merged_reader; -pub mod reverse_reader; use anyhow::Result; pub use batch_adapter::adapt_batch; @@ -138,6 +137,16 @@ pub fn get_timestamp_array(size: usize) -> TimestampMillisecondArray { TimestampMillisecondArray::from_value(Utc::now().timestamp_millis(), size) } +pub fn reverse(rb: &RecordBatch) -> RecordBatch { + let indices = UInt64Array::from_iter_values((0..rb.num_rows()).rev().map(|x| x as u64)); + let arrays = rb + .columns() + .iter() + .map(|col| take(&col, &indices, None).unwrap()) + .collect(); + RecordBatch::try_new(rb.schema(), arrays).unwrap() +} + #[cfg(test)] mod tests { use std::sync::Arc; From 9be628136003d7d00ce0e4526f0e6ae0b1737890 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 30 Jan 2025 18:18:52 +0530 Subject: [PATCH 31/34] test: same minute ingestion --- src/staging/streams.rs | 68 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/src/staging/streams.rs b/src/staging/streams.rs index 2714388cd..7860ff0fd 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -578,7 +578,7 @@ mod tests { } #[test] - fn convert_multiple_arrow_files_to_parquet() { + fn different_minutes_multiple_arrow_files_to_parquet() { let temp_dir = TempDir::new().unwrap(); let stream_name = "test_stream"; let options = Options { @@ -642,4 +642,70 @@ mod tests { assert_eq!(staging.parquet_files().len(), 3); assert_eq!(staging.arrow_files().len(), 0); } + + #[test] + fn same_minute_multiple_arrow_files_to_parquet() { + let temp_dir = TempDir::new().unwrap(); + let stream_name = "test_stream"; + let options = Options { + local_staging_path: temp_dir.path().to_path_buf(), + row_group_size: 1048576, + ..Default::default() + }; + let staging: Arc> = Stream::new(&options, stream_name); + + // Create test arrow files + let schema = Schema::new(vec![ + Field::new( + DEFAULT_TIMESTAMP_KEY, + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new("id", DataType::Int32, false), + Field::new("value", DataType::Utf8, false), + ]); + + let past = Utc::now() + .checked_sub_signed(TimeDelta::minutes(10)) + .unwrap() + .naive_utc(); + for _ in 0..3 { + let batch = RecordBatch::try_new( + Arc::new(schema.clone()), + vec![ + Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3])), + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ], + ) + .unwrap(); + staging + .push( + "abc", + &batch, + past, + &HashMap::new(), + StreamType::UserDefined, + ) + .unwrap(); + staging.flush(); + } + // verify the arrow files exist in staging + assert_eq!(staging.arrow_files().len(), 1); + drop(staging); + + // Start with a fresh staging + let staging: Arc> = Stream::new(&options, stream_name); + let result = staging + .convert_disk_files_to_parquet(None, None, true) + .unwrap(); + + assert!(result.is_some()); + let result_schema = 
result.unwrap(); + assert_eq!(result_schema.fields().len(), 3); + + // Verify parquet files were created and the arrow files deleted + assert_eq!(staging.parquet_files().len(), 1); + assert_eq!(staging.arrow_files().len(), 0); + } } From 3c67d974b8b65626273d24506b8dbdc52bd697a7 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 30 Jan 2025 22:20:36 +0530 Subject: [PATCH 32/34] fix: empty custom partitioning --- src/staging/streams.rs | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/src/staging/streams.rs b/src/staging/streams.rs index 7860ff0fd..8c14aa648 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -137,7 +137,7 @@ impl<'a> Stream<'a> { hostname.push_str(&INGESTOR_META.get_ingestor_id()); } let filename = format!( - "{}{stream_hash}.date={}.hour={:02}.minute={}.{}.{hostname}.{ARROW_FILE_EXTENSION}", + "{}{stream_hash}.date={}.hour={:02}.minute={}.{}{hostname}.{ARROW_FILE_EXTENSION}", Utc::now().format("%Y%m%dT%H%M"), parsed_timestamp.date(), parsed_timestamp.hour(), @@ -145,8 +145,8 @@ impl<'a> Stream<'a> { custom_partition_values .iter() .sorted_by_key(|v| v.0) - .map(|(key, value)| format!("{key}={value}")) - .join(".") + .map(|(key, value)| format!("{key}={value}.")) + .join("") ); self.data_path.join(filename) } @@ -523,7 +523,35 @@ mod tests { } #[test] - fn generate_correct_path_with_current_time_and_valid_parameters() { + fn generate_correct_path_with_current_time_and_no_custom_partitioning() { + let stream_name = "test_stream"; + let stream_hash = "abc123"; + let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) + .unwrap() + .and_hms_opt(12, 30, 0) + .unwrap(); + let custom_partition_values = HashMap::new(); + + let options = Options::default(); + let staging = Stream::new(&options, stream_name); + + let expected_path = staging.data_path.join(format!( + "{}{stream_hash}.date={}.hour={:02}.minute={}.{}.{ARROW_FILE_EXTENSION}", + Utc::now().format("%Y%m%dT%H%M"), + parsed_timestamp.date(), + parsed_timestamp.hour(), + minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), + hostname::get().unwrap().into_string().unwrap() + )); + + let generated_path = + staging.path_by_current_time(stream_hash, parsed_timestamp, &custom_partition_values); + + assert_eq!(generated_path, expected_path); + } + + #[test] + fn generate_correct_path_with_current_time_and_custom_partitioning() { let stream_name = "test_stream"; let stream_hash = "abc123"; let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) @@ -586,7 +614,7 @@ mod tests { row_group_size: 1048576, ..Default::default() }; - let staging: Arc> = Stream::new(&options, stream_name); + let staging = Stream::new(&options, stream_name); // Create test arrow files let schema = Schema::new(vec![ @@ -629,7 +657,7 @@ mod tests { drop(staging); // Start with a fresh staging - let staging: Arc> = Stream::new(&options, stream_name); + let staging = Stream::new(&options, stream_name); let result = staging .convert_disk_files_to_parquet(None, None, true) .unwrap(); @@ -695,7 +723,7 @@ mod tests { drop(staging); // Start with a fresh staging - let staging: Arc> = Stream::new(&options, stream_name); + let staging = Stream::new(&options, stream_name); let result = staging .convert_disk_files_to_parquet(None, None, true) .unwrap(); From d7158c3b1bdffc756246ca58cb2a630978466149 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 30 Jan 2025 23:22:21 +0530 Subject: [PATCH 33/34] test: miss the current minute --- 
src/staging/streams.rs | 126 ++++++++++++++++++++++++++--------------- 1 file changed, 80 insertions(+), 46 deletions(-) diff --git a/src/staging/streams.rs b/src/staging/streams.rs index 8c14aa648..240d728f6 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -448,10 +448,13 @@ impl Streams { #[cfg(test)] mod tests { + use std::time::Duration; + use arrow_array::{Int32Array, StringArray, TimestampMillisecondArray}; use arrow_schema::{DataType, Field, TimeUnit}; use chrono::{NaiveDate, TimeDelta}; use temp_dir::TempDir; + use tokio::time::sleep; use super::*; @@ -605,6 +608,32 @@ mod tests { Ok(()) } + fn write_log(staging: &StreamRef, schema: &Schema, mins: i64) { + let time: NaiveDateTime = Utc::now() + .checked_sub_signed(TimeDelta::minutes(mins)) + .unwrap() + .naive_utc(); + let batch = RecordBatch::try_new( + Arc::new(schema.clone()), + vec![ + Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3])), + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ], + ) + .unwrap(); + staging + .push( + "abc", + &batch, + time, + &HashMap::new(), + StreamType::UserDefined, + ) + .unwrap(); + staging.flush(); + } + #[test] fn different_minutes_multiple_arrow_files_to_parquet() { let temp_dir = TempDir::new().unwrap(); @@ -628,29 +657,7 @@ mod tests { ]); for i in 0..3 { - let past = Utc::now() - .checked_sub_signed(TimeDelta::minutes(10 - i)) - .unwrap() - .naive_utc(); - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![ - Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3])), - Arc::new(Int32Array::from(vec![1, 2, 3])), - Arc::new(StringArray::from(vec!["a", "b", "c"])), - ], - ) - .unwrap(); - staging - .push( - "abc", - &batch, - past, - &HashMap::new(), - StreamType::UserDefined, - ) - .unwrap(); - staging.flush(); + write_log(&staging, &schema, i); } // verify the arrow files exist in staging assert_eq!(staging.arrow_files().len(), 3); @@ -693,30 +700,8 @@ mod tests { Field::new("value", DataType::Utf8, false), ]); - let past = Utc::now() - .checked_sub_signed(TimeDelta::minutes(10)) - .unwrap() - .naive_utc(); for _ in 0..3 { - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![ - Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3])), - Arc::new(Int32Array::from(vec![1, 2, 3])), - Arc::new(StringArray::from(vec!["a", "b", "c"])), - ], - ) - .unwrap(); - staging - .push( - "abc", - &batch, - past, - &HashMap::new(), - StreamType::UserDefined, - ) - .unwrap(); - staging.flush(); + write_log(&staging, &schema, 0); } // verify the arrow files exist in staging assert_eq!(staging.arrow_files().len(), 1); @@ -736,4 +721,53 @@ mod tests { assert_eq!(staging.parquet_files().len(), 1); assert_eq!(staging.arrow_files().len(), 0); } + + #[tokio::test] + async fn miss_current_arrow_file_when_converting_to_parquet() { + let temp_dir = TempDir::new().unwrap(); + let stream_name = "test_stream"; + let options = Options { + local_staging_path: temp_dir.path().to_path_buf(), + row_group_size: 1048576, + ..Default::default() + }; + let staging = Stream::new(&options, stream_name); + + // Create test arrow files + let schema = Schema::new(vec![ + Field::new( + DEFAULT_TIMESTAMP_KEY, + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new("id", DataType::Int32, false), + Field::new("value", DataType::Utf8, false), + ]); + + // 2 logs in the previous minutes + for i in 0..2 { + write_log(&staging, &schema, i); + } + sleep(Duration::from_secs(60)).await; + + write_log(&staging, 
&schema, 0); + + // verify the arrow files exist in staging + assert_eq!(staging.arrow_files().len(), 2); + drop(staging); + + // Start with a fresh staging + let staging = Stream::new(&options, stream_name); + let result = staging + .convert_disk_files_to_parquet(None, None, false) + .unwrap(); + + assert!(result.is_some()); + let result_schema = result.unwrap(); + assert_eq!(result_schema.fields().len(), 3); + + // Verify parquet files were created and the arrow file left + assert_eq!(staging.parquet_files().len(), 2); + assert_eq!(staging.arrow_files().len(), 1); + } } From e2d1ca3bcb5a6b5c49a6e759bf64342ceb413a23 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Thu, 30 Jan 2025 23:38:04 +0530 Subject: [PATCH 34/34] fixes --- src/staging/streams.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/staging/streams.rs b/src/staging/streams.rs index 240d728f6..967f6d682 100644 --- a/src/staging/streams.rs +++ b/src/staging/streams.rs @@ -118,11 +118,10 @@ impl<'a> Stream<'a> { guard.disk.insert(schema_key.to_owned(), writer); } }; - guard.mem.push(schema_key, record); - } else { - guard.mem.push(schema_key, record); } + guard.mem.push(schema_key, record); + Ok(()) } @@ -753,7 +752,7 @@ mod tests { write_log(&staging, &schema, 0); // verify the arrow files exist in staging - assert_eq!(staging.arrow_files().len(), 2); + assert_eq!(staging.arrow_files().len(), 3); drop(staging); // Start with a fresh staging
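        // Re-opening the stream from the same staging directory mimics a process
        // restart: the parquet conversion that follows must work purely from the
        // arrow files left on disk, without the in-memory writer state that was
        // just dropped.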