Skip to content

Commit 0db764c

Browse files
author
Dai Dao
authored
1099 cli ingest cleanup cache (#1211)
* Clean up cache after indexing
* Add unit test; make const variables
* Test changes
1 parent 5f9a71e commit 0db764c

File tree

9 files changed

+105
-11
lines changed

9 files changed

+105
-11
lines changed

quickwit-cli/src/index.rs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,9 @@ use quickwit_actors::{ActorHandle, ObservationType, Universe};
3333
use quickwit_common::uri::Uri;
3434
use quickwit_common::{run_checklist, GREEN_COLOR};
3535
use quickwit_config::{IndexConfig, IndexerConfig, SourceConfig, SourceParams};
36-
use quickwit_core::{create_index, delete_index, garbage_collect_index, reset_index};
36+
use quickwit_core::{
37+
clean_split_cache, create_index, delete_index, garbage_collect_index, reset_index,
38+
};
3739
use quickwit_doc_mapper::tag_pruning::match_tag_field_name;
3840
use quickwit_indexing::actors::{IndexingPipeline, IndexingServer};
3941
use quickwit_indexing::models::{
@@ -82,6 +84,8 @@ pub fn build_index_command<'a>() -> Command<'a> {
8284
.required(false),
8385
arg!(--overwrite "Overwrites pre-existing index.")
8486
.required(false),
87+
arg!(--clean_cache "Clean up local cache splits after indexing.")
88+
.required(false),
8589
])
8690
)
8791
.subcommand(
@@ -198,6 +202,7 @@ pub struct IngestDocsArgs {
198202
pub config_uri: Uri,
199203
pub data_dir: Option<PathBuf>,
200204
pub overwrite: bool,
205+
pub clean_cache: bool,
201206
}
202207

203208
#[derive(Debug, PartialEq, Eq)]
@@ -330,13 +335,15 @@ impl IndexCliCommand {
330335
.expect("`config` is a required arg.")?;
331336
let data_dir = matches.value_of("data-dir").map(PathBuf::from);
332337
let overwrite = matches.is_present("overwrite");
338+
let clean_cache = matches.is_present("clean_cache");
333339

334340
Ok(Self::Ingest(IngestDocsArgs {
335341
index_id,
336342
input_path_opt,
337343
overwrite,
338344
config_uri,
339345
data_dir,
346+
clean_cache,
340347
}))
341348
}
342349

@@ -782,7 +789,7 @@ pub async fn ingest_docs_cli(args: IngestDocsArgs) -> anyhow::Result<()> {
782789
};
783790
let universe = Universe::new();
784791
let indexing_server = IndexingServer::new(
785-
config.data_dir_path,
792+
config.clone().data_dir_path,
786793
indexer_config,
787794
metastore,
788795
storage_resolver,
@@ -818,6 +825,17 @@ pub async fn ingest_docs_cli(args: IngestDocsArgs) -> anyhow::Result<()> {
818825
args.index_id
819826
);
820827
}
828+
829+
if args.clean_cache {
830+
println!("Cleaning up split cache ...");
831+
clean_split_cache(
832+
&config.data_dir_path,
833+
index_metadata.index_id.clone(),
834+
INGEST_SOURCE_ID.to_string(),
835+
)
836+
.await?;
837+
}
838+
821839
Ok(())
822840
}
823841

quickwit-cli/src/main.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ mod tests {
223223
input_path_opt: None,
224224
overwrite: false,
225225
data_dir: None,
226+
clean_cache: false,
226227
})) if &index_id == "wikipedia"
227228
&& config_uri == Uri::try_new("file:///config.yaml").unwrap()
228229
));
@@ -247,6 +248,7 @@ mod tests {
247248
input_path_opt: None,
248249
overwrite: true,
249250
data_dir: None,
251+
clean_cache: false
250252
})) if &index_id == "wikipedia"
251253
&& config_uri == Uri::try_new("file:///config.yaml").unwrap()
252254
));

quickwit-cli/tests/cli.rs

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ use predicates::prelude::*;
3030
use quickwit_cli::index::{create_index_cli, search_index, CreateIndexArgs, SearchIndexArgs};
3131
use quickwit_common::rand::append_random_suffix;
3232
use quickwit_common::uri::Uri;
33+
use quickwit_core::get_cache_path;
34+
use quickwit_indexing::source::INGEST_SOURCE_ID;
3335
use quickwit_metastore::{quickwit_metastore_uri_resolver, Metastore};
3436
use serde_json::{json, Number, Value};
3537
use serial_test::serial;
@@ -50,13 +52,14 @@ fn create_logs_index(test_env: &TestEnv) {
5052
.success();
5153
}
5254

53-
fn ingest_docs(input_path: &Path, test_env: &TestEnv) {
55+
fn ingest_docs_with_options(input_path: &Path, test_env: &TestEnv, options: &str) {
5456
make_command(
5557
format!(
56-
"index ingest --index {} --input-path {} --config {}",
58+
"index ingest --index {} --input-path {} --config {} {}",
5759
test_env.index_id,
5860
input_path.display(),
5961
test_env.resource_files["config"].display(),
62+
options
6063
)
6164
.as_str(),
6265
)
@@ -69,6 +72,10 @@ fn ingest_docs(input_path: &Path, test_env: &TestEnv) {
6972
));
7073
}
7174

75+
/// Ingests `input_path` into the test index without any extra CLI option.
///
/// Convenience wrapper around `ingest_docs_with_options` that passes an empty option string.
fn ingest_docs(input_path: &Path, test_env: &TestEnv) {
    let no_extra_options = "";
    ingest_docs_with_options(input_path, test_env, no_extra_options);
}
78+
7279
#[test]
7380
fn test_cmd_help() -> anyhow::Result<()> {
7481
let mut cmd = make_command("--help");
@@ -187,14 +194,44 @@ fn test_cmd_ingest_on_non_existing_file() -> Result<()> {
187194
Ok(())
188195
}
189196

197+
/// Ingesting with `--clean_cache` must leave no split cache directory behind.
#[test]
fn test_cmd_ingest_clean_cache() -> Result<()> {
    let index_id = append_random_suffix("test-index-clean-cache");
    let test_env = create_test_env(index_id, TestStorageType::LocalFileSystem)?;
    create_logs_index(&test_env);

    ingest_docs_with_options(
        test_env.resource_files["logs"].as_path(),
        &test_env,
        "--clean_cache",
    );

    // The cache directory of the ingest source must be gone once the command has returned.
    let cache_path = get_cache_path(
        &test_env.data_dir_path,
        &test_env.index_id,
        INGEST_SOURCE_ID,
    );
    // `assert!(!..)` instead of `assert_eq!(false, ..)`: same check, clearer failure output.
    assert!(
        !cache_path.exists(),
        "split cache directory `{}` should have been removed",
        cache_path.display()
    );

    Ok(())
}
219+
190220
#[test]
191221
fn test_cmd_ingest_simple() -> Result<()> {
192222
let index_id = append_random_suffix("test-index-simple");
193223
let test_env = create_test_env(index_id, TestStorageType::LocalFileSystem)?;
194224
create_logs_index(&test_env);
195-
196225
ingest_docs(test_env.resource_files["logs"].as_path(), &test_env);
197226

227+
// check cache path
228+
let cache_path = get_cache_path(
229+
&test_env.data_dir_path,
230+
&test_env.index_id,
231+
INGEST_SOURCE_ID,
232+
);
233+
assert_eq!(true, cache_path.exists());
234+
198235
// Using piped input
199236
let log_path = test_env.resource_files["logs"].clone();
200237
make_command(

quickwit-core/src/index.rs

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,21 @@
1717
// You should have received a copy of the GNU Affero General Public License
1818
// along with this program. If not, see <http://www.gnu.org/licenses/>.
1919

20+
use std::path::{Path, PathBuf};
2021
use std::sync::Arc;
2122
use std::time::Duration;
2223

24+
use quickwit_indexing::actors::INDEXING;
25+
use quickwit_indexing::models::CACHE;
2326
use quickwit_indexing::{
2427
delete_splits_with_files, run_garbage_collect, FileEntry, IndexingSplitStore,
2528
};
2629
use quickwit_metastore::{
2730
quickwit_metastore_uri_resolver, IndexMetadata, Metastore, SplitMetadata, SplitState,
2831
};
2932
use quickwit_storage::{quickwit_storage_uri_resolver, Storage};
30-
use tracing::error;
33+
use tokio::fs;
34+
use tracing::{error, info};
3135

3236
/// Creates an index at `index-path` extracted from `metastore_uri`. The command fails if an index
3337
/// already exists at `index-path`.
@@ -114,6 +118,32 @@ pub async fn delete_index(
114118
Ok(deleted_entries)
115119
}
116120

121+
/// Helper function to get the cache path.
122+
pub fn get_cache_path(data_dir_path: &Path, index_id: &str, source_id: &str) -> PathBuf {
123+
data_dir_path
124+
.join(INDEXING)
125+
.join(index_id)
126+
.join(source_id)
127+
.join(CACHE)
128+
}
129+
130+
/// Cleans up split cache in local split store.
131+
///
132+
/// * `data_dir_path` - Path to directory where data (tmp data, splits kept for caching purpose) is
133+
/// persisted.
134+
/// * `index_id` - The target index Id.
135+
/// * `source_id` - The source Id.
136+
pub async fn clean_split_cache(
137+
data_dir_path: &Path,
138+
index_id: String,
139+
source_id: String,
140+
) -> anyhow::Result<()> {
141+
let cache_path = get_cache_path(data_dir_path, &index_id, &source_id);
142+
info!(cache_path = %cache_path.as_path().display(), "cache_path");
143+
fs::remove_dir_all(cache_path.as_path()).await?;
144+
Ok(())
145+
}
146+
117147
/// Detect all dangling splits and associated files from the index and removes them.
118148
///
119149
/// * `metastore_uri` - The metastore URI for accessing the metastore.

quickwit-core/src/lib.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@
2828
2929
mod index;
3030

31-
pub use index::{create_index, delete_index, garbage_collect_index, reset_index};
31+
pub use index::{
32+
clean_split_cache, create_index, delete_index, garbage_collect_index, get_cache_path,
33+
reset_index,
34+
};
3235

3336
#[cfg(test)]
3437
mod tests {

quickwit-indexing/src/actors/indexing_server.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ use crate::models::{
3838
};
3939
use crate::{IndexingPipeline, IndexingPipelineParams, IndexingStatistics};
4040

41+
/// Name of the directory, joined onto the data directory path, that `IndexingServer` uses as
/// its indexing directory.
pub const INDEXING: &str = "indexing";
42+
4143
#[derive(Error, Debug)]
4244
pub enum IndexingServerError {
4345
#[error("Indexing pipeline `{index_id}` for source `{source_id}` does not exist.")]
@@ -82,7 +84,7 @@ impl IndexingServer {
8284
storage_resolver: StorageUriResolver,
8385
) -> IndexingServer {
8486
Self {
85-
indexing_dir_path: data_dir_path.join("indexing"),
87+
indexing_dir_path: data_dir_path.join(INDEXING),
8688
split_store_max_num_bytes: indexer_config.split_store_max_num_bytes.get_bytes()
8789
as usize,
8890
split_store_max_num_splits: indexer_config.split_store_max_num_splits,

quickwit-indexing/src/actors/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ mod publisher;
2727
mod uploader;
2828

2929
pub use indexing_pipeline::{IndexingPipeline, IndexingPipelineHandler, IndexingPipelineParams};
30-
pub use indexing_server::IndexingServer;
30+
pub use indexing_server::{IndexingServer, INDEXING};
3131
use tantivy::schema::{Field, FieldType};
3232
mod merge_executor;
3333
mod merge_planner;

quickwit-indexing/src/models/indexing_directory.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ use tokio::fs;
2727

2828
use super::ScratchDirectory;
2929

30+
/// Name of the cache sub-directory that `IndexingDirectory::create_in_dir` creates inside the
/// directory it is given.
pub const CACHE: &str = "cache";
31+
3032
/// Root of an [`IndexingDirectory`].
3133
#[derive(Clone)]
3234
enum Root {
@@ -54,7 +56,7 @@ pub struct IndexingDirectory {
5456
impl IndexingDirectory {
5557
pub async fn create_in_dir<P: AsRef<Path>>(dir_path: P) -> anyhow::Result<IndexingDirectory> {
5658
// Create cache directory if does not exist.
57-
let cache_directory_path = dir_path.as_ref().join("cache");
59+
let cache_directory_path = dir_path.as_ref().join(CACHE);
5860
fs::create_dir_all(&cache_directory_path)
5961
.await
6062
.with_context(|| {

quickwit-indexing/src/models/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ mod raw_doc_batch;
2929
mod scratch_directory;
3030

3131
pub use indexed_split::{IndexedSplit, IndexedSplitBatch};
32-
pub use indexing_directory::IndexingDirectory;
32+
pub use indexing_directory::{IndexingDirectory, CACHE};
3333
pub use indexing_server_message::{
3434
DetachPipeline, IndexingPipelineId, ObservePipeline, SpawnMergePipeline, SpawnPipeline,
3535
SpawnPipelinesForIndex,

0 commit comments

Comments
 (0)