diff --git a/dataset_reader/ann_h5_reader.py b/dataset_reader/ann_h5_reader.py index 1bc984ac4..950270c3c 100644 --- a/dataset_reader/ann_h5_reader.py +++ b/dataset_reader/ann_h5_reader.py @@ -1,3 +1,4 @@ +import itertools from typing import Iterator import h5py @@ -14,9 +15,10 @@ def __init__(self, path, normalize=False): def read_queries(self) -> Iterator[Query]: data = h5py.File(self.path) + distances = data["distances"] if "distances" in data else itertools.repeat(None) for vector, expected_result, expected_scores in zip( - data["test"], data["neighbors"], data["distances"] + data["test"], data["neighbors"], distances ): if self.normalize: vector /= np.linalg.norm(vector) @@ -24,7 +26,7 @@ def read_queries(self) -> Iterator[Query]: vector=vector.tolist(), meta_conditions=None, expected_result=expected_result.tolist(), - expected_scores=expected_scores.tolist(), + expected_scores=expected_scores.tolist() if expected_scores is not None else None, ) def read_data(self, *args, **kwargs) -> Iterator[Record]: diff --git a/datasets/datasets.json b/datasets/datasets.json index 5493502ea..c615fcef1 100644 --- a/datasets/datasets.json +++ b/datasets/datasets.json @@ -1296,5 +1296,25 @@ "path": "random-100-match-kw-small-vocab/random_keywords_1m_vocab_10_no_filters", "vector_count": 100, "description": "Synthetic data" + }, + { + "name": "cohere-768-1M", + "vector_size": 768, + "distance": "dot", + "type": "h5", + "path": "cohere-768-1M/cohere-768-1M.hdf5", + "link": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings/documents-1m.hdf5.bz2", + "vector_count": 1000000, + "description": "Wikipedia embeddings" + }, + { + "name": "cohere-768-10M", + "vector_size": 768, + "distance": "dot", + "type": "h5", + "path": "cohere-768-10M/cohere-768-10M.hdf5", + "link": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings/documents-10m.hdf5.bz2", + "vector_count": 10000000, + "description": "Wikipedia embeddings" } -] \ No newline at end of file +] diff --git a/engine/base_client/search.py b/engine/base_client/search.py index 53287e31e..bff908ccb 100644 --- a/engine/base_client/search.py +++ b/engine/base_client/search.py @@ -20,7 +20,7 @@ class BaseSearcher: - _doc_id_counter = itertools.count(100000000) + _doc_id_counter = None # Will be initialized per process MP_CONTEXT = None def __init__(self, host, connection_params, search_params): @@ -67,15 +67,22 @@ def _search_one(cls, query, top: Optional[int] = None): precision = len(ids.intersection(query.expected_result[:top])) / top return precision, end - start + @classmethod + def _get_doc_id_counter(cls): + if cls._doc_id_counter is None: + # Use process ID to create unique starting point for each worker + process_id = os.getpid() + # Each process gets a unique range: 1000000000 + (pid * 1000000) + start_offset = 1000000000 + (process_id % 1000) * 1000000 + cls._doc_id_counter = itertools.count(start_offset) + return cls._doc_id_counter + @classmethod def _insert_one(cls, query): start = time.perf_counter() - # Generate unique doc_id here - doc_id = next(cls._doc_id_counter) - - # Debug logging to verify inserts are happening - #print(f"DEBUG: Inserting vector with doc_id={doc_id}") + # Generate unique doc_id with process-safe counter + doc_id = next(cls._get_doc_id_counter()) cls.insert_one(str(doc_id), query.vector, query.meta_conditions) end = time.perf_counter() diff --git a/experiments/configurations/cohere-calibration.json b/experiments/configurations/cohere-calibration.json new file mode 100644 index 000000000..c742daacc --- /dev/null +++ b/experiments/configurations/cohere-calibration.json @@ -0,0 +1,380 @@ +[ + { + "name": "cohere-cal-hnsw-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "hnsw", + "data_type": "FLOAT16", + "hnsw_config": { + "M": 32, + "DISTANCE_METRIC": "IP", + "EF_CONSTRUCTION": 200 + } + }, + "search_params": [ + { + "parallel": 100, + "top": 100, + "calibration_param": "ef", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "algorithm": "hnsw", + "data_type": "FLOAT16" + } + }, + { + "name": "cohere-cal-hnsw-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "hnsw", + "data_type": "FLOAT32", + "hnsw_config": { + "M": 32, + "DISTANCE_METRIC": "IP", + "EF_CONSTRUCTION": 200 + } + }, + "search_params": [ + { + "parallel": 100, + "top": 100, + "calibration_param": "ef", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "algorithm": "hnsw", + "data_type": "FLOAT32" + } + }, +{ + "name": "cohere-cal-svs-noquant-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT16", + "svs-vamana_config": { + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200 + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "SEARCH_WINDOW_SIZE", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT16", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-noquant-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT32", + "svs-vamana_config": { + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200 + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "SEARCH_WINDOW_SIZE", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT32", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LVQ8-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT16", + "svs-vamana_config": { + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LVQ8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "SEARCH_WINDOW_SIZE", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT16", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LVQ8-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT32", + "svs-vamana_config": { + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LVQ8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "SEARCH_WINDOW_SIZE", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT32", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LVQ4X8-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT16", + "svs-vamana_config": { + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LVQ4X8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "SEARCH_WINDOW_SIZE", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT16", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LVQ4X8-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT32", + "svs-vamana_config": { + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LVQ4X8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "SEARCH_WINDOW_SIZE", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT32", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LVQ4X4-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT32", + "svs-vamana_config": { + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LVQ4X4" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "SEARCH_WINDOW_SIZE", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT32", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LVQ4X4-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT16", + "svs-vamana_config": { + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LVQ4X4" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "SEARCH_WINDOW_SIZE", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT16", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LeanVec4x8-float16", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT16", + "svs-vamana_config": { + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LeanVec4x8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "SEARCH_WINDOW_SIZE", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT16" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT16", + "algorithm": "svs-vamana" + } + }, + { + "name": "cohere-cal-svs-LeanVec4x8-float32", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "algorithm": "svs-vamana", + "data_type": "FLOAT32", + "svs-vamana_config": { + "DISTANCE_METRIC": "IP", + "GRAPH_MAX_DEGREE": 64, + "CONSTRUCTION_WINDOW_SIZE": 200, + "compression": "LeanVec4x8" + } + }, + "search_params": [ + { + "algorithm": "svs-vamana", + "parallel": 100, + "top": 100, + "calibration_param": "SEARCH_WINDOW_SIZE", + "calibration_precision": 0.95, + "search_params": { + "data_type": "FLOAT32" + } + } + ], + "upload_params": { + "parallel": 100, + "data_type": "FLOAT32", + "algorithm": "svs-vamana" + } + } +]