71 changes: 62 additions & 9 deletions server/src/event/writer.rs
@@ -18,21 +18,52 @@
*/

mod file_writer;
mod mem_writer;

use std::{
collections::HashMap,
sync::{Mutex, RwLock},
sync::{Arc, Mutex, RwLock},
};

use self::{errors::StreamWriterError, file_writer::FileWriter};
use arrow_array::RecordBatch;
use crate::utils;

use self::{errors::StreamWriterError, file_writer::FileWriter, mem_writer::MemWriter};
use arrow_array::{RecordBatch, TimestampMillisecondArray};
use arrow_schema::Schema;
use chrono::Utc;
use derive_more::{Deref, DerefMut};
use once_cell::sync::Lazy;

pub static STREAM_WRITERS: Lazy<WriterTable> = Lazy::new(WriterTable::default);

#[derive(Default)]
pub struct Writer {
pub mem: MemWriter<16384>,
pub disk: FileWriter,
}

impl Writer {
fn push(
&mut self,
stream_name: &str,
schema_key: &str,
rb: RecordBatch,
) -> Result<(), StreamWriterError> {
let rb = utils::arrow::replace_columns(
rb.schema(),
&rb,
&[0],
&[Arc::new(get_timestamp_array(rb.num_rows()))],
);

self.disk.push(stream_name, schema_key, &rb)?;
self.mem.push(schema_key, rb);
Ok(())
}
}

#[derive(Deref, DerefMut, Default)]
pub struct WriterTable(RwLock<HashMap<String, Mutex<FileWriter>>>);
pub struct WriterTable(RwLock<HashMap<String, Mutex<Writer>>>);

impl WriterTable {
// append to an existing stream
@@ -49,7 +80,7 @@ impl WriterTable {
stream_writer
.lock()
.unwrap()
.push(stream_name, schema_key, &record)?;
.push(stream_name, schema_key, record)?;
}
None => {
drop(hashmap_guard);
@@ -60,10 +91,10 @@
writer
.lock()
.unwrap()
.push(stream_name, schema_key, &record)?;
.push(stream_name, schema_key, record)?;
} else {
let mut writer = FileWriter::default();
writer.push(stream_name, schema_key, &record)?;
let mut writer = Writer::default();
writer.push(stream_name, schema_key, record)?;
map.insert(stream_name.to_owned(), Mutex::new(writer));
}
}
@@ -81,9 +112,31 @@
drop(table);
for writer in map.into_values() {
let writer = writer.into_inner().unwrap();
writer.close_all();
writer.disk.close_all();
}
}

pub fn recordbatches_cloned(
&self,
stream_name: &str,
schema: &Arc<Schema>,
) -> Option<Vec<RecordBatch>> {
let records = self
.0
.read()
.unwrap()
.get(stream_name)?
.lock()
.unwrap()
.mem
.recordbatch_cloned(schema);

Some(records)
}
}

fn get_timestamp_array(size: usize) -> TimestampMillisecondArray {
TimestampMillisecondArray::from_value(Utc::now().timestamp_millis(), size)
}

pub mod errors {
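Aside (not part of the diff): with this change, Writer::push stamps column 0 of every incoming batch with the current ingestion time before handing the batch to both the disk writer and the new in-memory writer. Below is a minimal standalone sketch of that column swap using plain arrow APIs rather than the crate-internal utils::arrow::replace_columns helper; the field name p_timestamp and the schema are illustrative assumptions, not taken from the diff.

use std::sync::Arc;

use arrow_array::{ArrayRef, RecordBatch, TimestampMillisecondArray};
use arrow_schema::{DataType, Field, Schema, TimeUnit};
use chrono::Utc;

/// Overwrite column 0 with one ingestion timestamp per row,
/// roughly what get_timestamp_array + replace_columns achieve in the diff.
fn stamp_first_column(rb: &RecordBatch) -> RecordBatch {
    let ts: ArrayRef = Arc::new(TimestampMillisecondArray::from_value(
        Utc::now().timestamp_millis(),
        rb.num_rows(),
    ));
    let mut columns = rb.columns().to_vec();
    columns[0] = ts;
    // Column 0 of the schema must already be Timestamp(Millisecond, None) for this to succeed.
    RecordBatch::try_new(rb.schema(), columns).expect("column 0 is not a millisecond timestamp")
}

fn main() {
    // Illustrative schema: the stream's first column is the server-side timestamp.
    let schema = Arc::new(Schema::new(vec![Field::new(
        "p_timestamp",
        DataType::Timestamp(TimeUnit::Millisecond, None),
        true,
    )]));
    let placeholder: ArrayRef = Arc::new(TimestampMillisecondArray::from_value(0, 3));
    let rb = RecordBatch::try_new(schema, vec![placeholder]).unwrap();
    let stamped = stamp_first_column(&rb);
    assert_eq!(stamped.num_rows(), 3);
}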
25 changes: 6 additions & 19 deletions server/src/event/writer/file_writer.rs
@@ -17,17 +17,15 @@
*
*/

use arrow_array::{RecordBatch, TimestampMillisecondArray};
use arrow_ipc::writer::StreamWriter;
use chrono::Utc;
use derive_more::{Deref, DerefMut};
use std::collections::HashMap;
use std::fs::{File, OpenOptions};
use std::path::PathBuf;
use std::sync::Arc;

use arrow_array::RecordBatch;
use arrow_ipc::writer::StreamWriter;
use derive_more::{Deref, DerefMut};

use crate::storage::staging::StorageDir;
use crate::utils;

use super::errors::StreamWriterError;

@@ -47,24 +45,17 @@ impl FileWriter {
schema_key: &str,
record: &RecordBatch,
) -> Result<(), StreamWriterError> {
let record = utils::arrow::replace_columns(
record.schema(),
record,
&[0],
&[Arc::new(get_timestamp_array(record.num_rows()))],
);

match self.get_mut(schema_key) {
Some(writer) => {
writer
.writer
.write(&record)
.write(record)
.map_err(StreamWriterError::Writer)?;
}
// entry is not present thus we create it
None => {
// this requires mutable borrow of the map so we drop this read lock and wait for write lock
let (path, writer) = init_new_stream_writer_file(stream_name, schema_key, &record)?;
let (path, writer) = init_new_stream_writer_file(stream_name, schema_key, record)?;
self.insert(
schema_key.to_owned(),
ArrowWriter {
Expand All @@ -85,10 +76,6 @@ impl FileWriter {
}
}

fn get_timestamp_array(size: usize) -> TimestampMillisecondArray {
TimestampMillisecondArray::from_value(Utc::now().timestamp_millis(), size)
}

fn init_new_stream_writer_file(
stream_name: &str,
schema_key: &str,
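Aside (not part of the diff): FileWriter still appends each batch to a per-schema arrow IPC stream file, creating the file on the first push; only the timestamp handling moved up into Writer::push. A minimal sketch of that file-writing pattern with arrow_ipc follows — the path, schema, and open options are illustrative assumptions, the real path comes from StorageDir in the staging module.

use std::fs::OpenOptions;
use std::sync::Arc;

use arrow_array::{ArrayRef, Int64Array, RecordBatch};
use arrow_ipc::writer::StreamWriter;
use arrow_schema::{DataType, Field, Schema};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let schema = Arc::new(Schema::new(vec![Field::new("value", DataType::Int64, false)]));
    let column: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3]));
    let batch = RecordBatch::try_new(schema.clone(), vec![column])?;

    // Create the staging file and wrap it in an IPC stream writer (illustrative path).
    let file = OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true)
        .open("/tmp/stream.schema_key.arrows")?;
    let mut writer = StreamWriter::try_new(file, &schema)?;

    // FileWriter::push keeps the writer open and appends batches as they arrive;
    // close_all later calls finish() on every open writer.
    writer.write(&batch)?;
    writer.finish()?;
    Ok(())
}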
120 changes: 120 additions & 0 deletions server/src/event/writer/mem_writer.rs
@@ -0,0 +1,120 @@
/*
* Parseable Server (C) 2022 - 2023 Parseable, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/

use std::{collections::HashSet, sync::Arc};

use arrow_array::RecordBatch;
use arrow_schema::Schema;
use arrow_select::concat::concat_batches;
use itertools::Itertools;

use crate::utils::arrow::adapt_batch;

/// Structure to keep recordbatches in memory.
///
/// Any new schema is updated in the schema map.
/// Recordbatches are pushed to the mutable buffer first and then concatenated together and pushed to the read buffer
#[derive(Debug)]
pub struct MemWriter<const N: usize> {
schema: Schema,
// for checking uniqueness of schema
schema_map: HashSet<String>,
read_buffer: Vec<RecordBatch>,
mutable_buffer: MutableBuffer<N>,
}

impl<const N: usize> Default for MemWriter<N> {
fn default() -> Self {
Self {
schema: Schema::empty(),
schema_map: HashSet::default(),
read_buffer: Vec::default(),
mutable_buffer: MutableBuffer::default(),
}
}
}

impl<const N: usize> MemWriter<N> {
pub fn push(&mut self, schema_key: &str, rb: RecordBatch) {
if !self.schema_map.contains(schema_key) {
self.schema_map.insert(schema_key.to_owned());
self.schema = Schema::try_merge([self.schema.clone(), (*rb.schema()).clone()]).unwrap();
}

if let Some(record) = self.mutable_buffer.push(rb) {
let record = concat_records(&Arc::new(self.schema.clone()), &record);
self.read_buffer.push(record);
}
}

pub fn recordbatch_cloned(&self, schema: &Arc<Schema>) -> Vec<RecordBatch> {
let mut read_buffer = self.read_buffer.clone();
if self.mutable_buffer.rows > 0 {
let rb = concat_records(schema, &self.mutable_buffer.inner);
read_buffer.push(rb)
}

read_buffer
.into_iter()
.map(|rb| adapt_batch(schema, &rb))
.collect()
}
}

fn concat_records(schema: &Arc<Schema>, record: &[RecordBatch]) -> RecordBatch {
let records = record.iter().map(|x| adapt_batch(schema, x)).collect_vec();
let record = concat_batches(schema, records.iter()).unwrap();
record
}

#[derive(Debug, Default)]
struct MutableBuffer<const N: usize> {
pub inner: Vec<RecordBatch>,
pub rows: usize,
}

impl<const N: usize> MutableBuffer<N> {
fn push(&mut self, rb: RecordBatch) -> Option<Vec<RecordBatch>> {
if self.rows + rb.num_rows() >= N {
let left = N - self.rows;
let right = rb.num_rows() - left;
let left_slice = rb.slice(0, left);
let right_slice = if left < rb.num_rows() {
Some(rb.slice(left, right))
} else {
None
};
self.inner.push(left_slice);
// take all records
let src = Vec::with_capacity(self.inner.len());
let inner = std::mem::replace(&mut self.inner, src);
self.rows = 0;

if let Some(right_slice) = right_slice {
self.rows = right_slice.num_rows();
self.inner.push(right_slice);
}

Some(inner)
} else {
self.rows += rb.num_rows();
self.inner.push(rb);
None
}
}
}
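Aside (not part of the diff): a minimal sketch of the rollover behaviour described in MemWriter's doc comment, assuming it runs inside the server crate where mem_writer is visible. The capacity N = 3, the schema, and the test name are illustrative. The second push crosses the row limit, so the filled mutable buffer is concatenated into the read buffer and the overflow row starts a new mutable buffer.

use std::sync::Arc;

use arrow_array::{ArrayRef, Int64Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};

#[test]
fn mem_writer_rolls_over_at_capacity() {
    let schema = Arc::new(Schema::new(vec![Field::new("value", DataType::Int64, false)]));
    let batch = |vals: Vec<i64>| {
        let col: ArrayRef = Arc::new(Int64Array::from(vals));
        RecordBatch::try_new(schema.clone(), vec![col]).unwrap()
    };

    let mut writer: MemWriter<3> = MemWriter::default();
    writer.push("schema_key", batch(vec![1, 2])); // 2 rows stay in the mutable buffer
    writer.push("schema_key", batch(vec![3, 4])); // crosses N = 3: rows 1..=3 are flushed to the read buffer

    // recordbatch_cloned returns the concatenated read buffer plus the leftover mutable row.
    let batches = writer.recordbatch_cloned(&schema);
    let total_rows: usize = batches.iter().map(|rb| rb.num_rows()).sum();
    assert_eq!(total_rows, 4);
}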