From b3a183f6bd8afc250dd2aa85488f837b87b90d40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Po=C5=BAniak?= Date: Wed, 10 Sep 2025 01:44:45 -0700 Subject: [PATCH] Add possibility to specify config_file instead of configuration name pattern, fix --describe --- README.md | 65 ++++++++++++++++++++++++++++ run.py | 126 +++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 166 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index fce96fb4..d6cba1de 100644 --- a/README.md +++ b/README.md @@ -225,6 +225,15 @@ python run.py --engines redis-default-simple --datasets random-100 python run.py --engines redis-default-simple --datasets glove-25-angular python run.py --engines "*-m-16-*" --datasets "glove-*" +# Using custom engine configurations from a JSON file +python run.py --engines-file custom_engines.json --datasets glove-25-angular + +# Get information about available engines (with pattern matching) +python run.py --engines "*redis*" --describe engines --verbose + +# Get information about engines from a custom file +python run.py --engines-file custom_engines.json --describe engines --verbose + # Docker usage (recommended) docker run --rm -v $(pwd)/results:/app/results --network=host \ redis/vector-db-benchmark:latest \ @@ -237,6 +246,62 @@ python run.py --help Command allows you to specify wildcards for engines and datasets. Results of the benchmarks are stored in the `./results/` directory. +## Using Custom Engine Configurations + +The benchmark tool supports two ways to specify which engine configurations to use: + +### 1. Pattern Matching (Default) +Use the `--engines` flag with wildcard patterns to select configurations from the `experiments/configurations/` directory: + +```bash +python run.py --engines "*redis*" --datasets glove-25-angular +python run.py --engines "qdrant-m-*" --datasets random-100 +``` + +### 2. Custom Configuration File +Use the `--engines-file` flag to specify a JSON file containing custom engine configurations: + +```bash +python run.py --engines-file my_engines.json --datasets glove-25-angular +``` + +The JSON file should contain an array of engine configuration objects. Each configuration must have a `name` field and follow the same structure as configurations in `experiments/configurations/`: + +```json +[ + { + "name": "my-custom-redis-config", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "hnsw", + "data_type": "FLOAT32", + "hnsw_config": { + "M": 16, + "DISTANCE_METRIC": "L2", + "EF_CONSTRUCTION": 200 + } + }, + "search_params": [ + { + "parallel": 1, + "top": 10, + "search_params": { + "ef": 100, + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 16, + "data_type": "FLOAT32" + } + } +] +``` + +**Note:** You cannot use both `--engines` and `--engines-file` at the same time. + ## How to update benchmark parameters? Each engine has a configuration file, which is used to define the parameters for the benchmark. diff --git a/run.py b/run.py index 0a1bd876..b155a31d 100644 --- a/run.py +++ b/run.py @@ -1,4 +1,6 @@ import fnmatch +import json +import os import traceback import warnings from typing import List @@ -17,9 +19,54 @@ app = typer.Typer() +def load_engines(engines: List[str], engines_file: str = None) -> dict: + """Load engine configurations from file or pattern matching.""" + # Check if both engines and engines_file are provided + if engines != ["*"] and engines_file is not None: + typer.echo("Error: Cannot use both --engines and --engines-file at the same time.", err=True) + raise typer.Exit(1) + + # Load engine configurations + if engines_file is not None: + # Load engines from specified file + if not os.path.exists(engines_file): + typer.echo(f"Error: Engines file '{engines_file}' not found.", err=True) + raise typer.Exit(1) + + try: + with open(engines_file, 'r') as f: + engines_from_file = json.load(f) + + # Convert list of engine configs to dictionary with name as key + selected_engines = {} + for config in engines_from_file: + if 'name' not in config: + typer.echo(f"Error: Engine configuration missing 'name' field in {engines_file}", err=True) + raise typer.Exit(1) + selected_engines[config['name']] = config + + except json.JSONDecodeError as e: + typer.echo(f"Error: Invalid JSON in engines file '{engines_file}': {e}", err=True) + raise typer.Exit(1) + except Exception as e: + typer.echo(f"Error reading engines file '{engines_file}': {e}", err=True) + raise typer.Exit(1) + else: + # Load engines using pattern matching (original behavior) + all_engines = read_engine_configs() + selected_engines = { + name: config + for name, config in all_engines.items() + if any(fnmatch.fnmatch(name, engine) for engine in engines) + } + + return selected_engines + + @app.command() def run( engines: List[str] = typer.Option(["*"]), + engines_file: str = typer.Option(None, help="Path to JSON file containing engine configurations to use instead of searching by pattern"), datasets: List[str] = typer.Option(["*"]), parallels: List[int] = typer.Option([]), host: str = "localhost", @@ -36,8 +83,14 @@ def run( verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed information when using --describe"), ): """ - Example: + Examples: + # Use pattern matching to select engines (original behavior) python3 run.py --engines *-m-16-* --engines qdrant-* --datasets glove-* + + # Use engines from a specific JSON file + python3 run.py --engines-file my_engines.json --datasets glove-* + + # Describe available options python3 run.py --describe datasets python3 run.py --describe engines --verbose """ @@ -47,20 +100,22 @@ def run( describe_datasets(datasets[0] if datasets != ["*"] else "*", verbose) return elif describe.lower() == "engines": - describe_engines(engines[0] if engines != ["*"] else "*", verbose) + # Load engines using same logic as main function + selected_engines = load_engines(engines, engines_file) + # For describe engines, we'll pass all loaded engines or filter by pattern + if engines_file is not None: + # When using engines_file, show all engines from the file + describe_engines_with_configs(selected_engines, "*", verbose) + else: + # When using pattern matching, use the pattern + describe_engines_with_configs(selected_engines, engines[0] if engines != ["*"] else "*", verbose) return else: typer.echo(f"Error: Unknown describe target '{describe}'. Use 'datasets' or 'engines'.", err=True) raise typer.Exit(1) - all_engines = read_engine_configs() all_datasets = read_dataset_config() - - selected_engines = { - name: config - for name, config in all_engines.items() - if any(fnmatch.fnmatch(name, engine) for engine in engines) - } + selected_engines = load_engines(engines, engines_file) selected_datasets = { name: config @@ -263,18 +318,12 @@ def get_sort_key(item): typer.echo("\nUse --verbose for detailed information") -def describe_engines(filter_pattern: str = "*", verbose: bool = False): - """Display information about available engines.""" - try: - all_engines = read_engine_configs() - except Exception as e: - typer.echo(f"Error reading engine configuration: {e}", err=True) - raise typer.Exit(1) - +def describe_engines_with_configs(engines_dict: dict, filter_pattern: str = "*", verbose: bool = False): + """Display information about engines from provided configurations.""" # Filter engines filtered_engines = { name: config - for name, config in all_engines.items() + for name, config in engines_dict.items() if fnmatch.fnmatch(name, filter_pattern) } @@ -296,11 +345,23 @@ def describe_engines(filter_pattern: str = "*", verbose: bool = False): if 'search_params' in config: search_params = config['search_params'] typer.echo(f" Search Params:") - for param, values in search_params.items(): - if isinstance(values, list): - typer.echo(f" {param}: {values}") - else: - typer.echo(f" {param}: {values}") + if isinstance(search_params, list): + for i, param_config in enumerate(search_params): + typer.echo(f" Config {i+1}:") + for param, value in param_config.items(): + if isinstance(value, dict): + typer.echo(f" {param}:") + for subparam, subvalue in value.items(): + typer.echo(f" {subparam}: {subvalue}") + else: + typer.echo(f" {param}: {value}") + else: + # Legacy format - dict + for param, values in search_params.items(): + if isinstance(values, list): + typer.echo(f" {param}: {values}") + else: + typer.echo(f" {param}: {values}") if 'upload_params' in config: upload_params = config['upload_params'] typer.echo(f" Upload Params:") @@ -313,12 +374,27 @@ def describe_engines(filter_pattern: str = "*", verbose: bool = False): for name, config in sorted(filtered_engines.items()): engine_type = config.get('engine', 'N/A') module = config.get('module', 'N/A') - typer.echo(f"{name:<40} {engine_type:<15} {module:<25}") + display_name = name[:37] + "..." if len(name) > 40 else name + display_engine = engine_type[:12] + "..." if len(engine_type) > 15 else engine_type + display_module = module[:22] + "..." if len(module) > 25 else module + typer.echo(f"{display_name:<40} {display_engine:<15} {display_module:<25}") typer.echo(f"\nTotal: {len(filtered_engines)} engines") if filter_pattern != "*": typer.echo(f"Filter: '{filter_pattern}'") - typer.echo("\nUse --verbose for detailed information") + if not verbose: + typer.echo("\nUse --verbose for detailed information") + + +def describe_engines(filter_pattern: str = "*", verbose: bool = False): + """Display information about available engines using default configuration loading.""" + try: + all_engines = read_engine_configs() + except Exception as e: + typer.echo(f"Error reading engine configuration: {e}", err=True) + raise typer.Exit(1) + + describe_engines_with_configs(all_engines, filter_pattern, verbose) if __name__ == "__main__":