From 55a3dfbce82437ef0d649281a2083ac04057e867 Mon Sep 17 00:00:00 2001 From: Kumar Shivendu Date: Wed, 17 Apr 2024 15:14:16 +0530 Subject: [PATCH 1/5] fix: Manual benchmarks --- .github/workflows/manual-benchmark.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/manual-benchmark.yaml b/.github/workflows/manual-benchmark.yaml index 4749d9fa..cc708a9c 100644 --- a/.github/workflows/manual-benchmark.yaml +++ b/.github/workflows/manual-benchmark.yaml @@ -23,11 +23,10 @@ jobs: - name: Benches run: | export HCLOUD_TOKEN=${{ secrets.HCLOUD_TOKEN }} - export GCS_KEY=${{ secrets.GCS_KEY }} - export GCS_SECRET=${{ secrets.GCS_SECRET }} export POSTGRES_PASSWORD=${{ secrets.POSTGRES_PASSWORD }} export POSTGRES_HOST=${{ secrets.POSTGRES_HOST }} export QDRANT_VERSION=${{ inputs.qdrant_version }} export DATASETS=${{ inputs.dataset }} export POSTGRES_TABLE=benchmark_manual + bash -x tools/setup_ci.sh bash -x tools/run_ci.sh From 9faf2687de4ccdaa54b2eeb01fbd52d3b8dc765e Mon Sep 17 00:00:00 2001 From: KShivendu Date: Wed, 17 Apr 2024 15:15:35 +0530 Subject: [PATCH 2/5] fix: Remove gcs secrets --- .github/workflows/continuous-benchmark.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/continuous-benchmark.yaml b/.github/workflows/continuous-benchmark.yaml index 989c619b..8594017b 100644 --- a/.github/workflows/continuous-benchmark.yaml +++ b/.github/workflows/continuous-benchmark.yaml @@ -18,8 +18,6 @@ jobs: - name: Benches run: | export HCLOUD_TOKEN=${{ secrets.HCLOUD_TOKEN }} - export GCS_KEY=${{ secrets.GCS_KEY }} - export GCS_SECRET=${{ secrets.GCS_SECRET }} export POSTGRES_PASSWORD=${{ secrets.POSTGRES_PASSWORD }} export POSTGRES_HOST=${{ secrets.POSTGRES_HOST }} bash -x tools/setup_ci.sh From 869024b9a8cf3b7a8b48059bf1e5c454857db764 Mon Sep 17 00:00:00 2001 From: KShivendu Date: Wed, 17 Apr 2024 16:39:04 +0530 Subject: [PATCH 3/5] feat: Use mmap to read sparse vectors --- dataset_reader/sparse_reader.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/dataset_reader/sparse_reader.py b/dataset_reader/sparse_reader.py index fb2af5d9..b7e8b1b0 100644 --- a/dataset_reader/sparse_reader.py +++ b/dataset_reader/sparse_reader.py @@ -23,9 +23,24 @@ def read_sparse_matrix_fields( return values, columns, index_pointer +def mmap_sparse_matrix_fields(fname): + """mmap the fields of a CSR matrix without instantiating it""" + with open(fname, "rb") as f: + sizes = np.fromfile(f, dtype='int64', count=3) + n_row, n_col, n_non_zero = sizes + offset = sizes.nbytes + index_pointer = np.memmap(fname, dtype='int64', mode='r', offset=offset, shape=n_row + 1) + offset += index_pointer.nbytes + columns = np.memmap(fname, dtype='int32', mode='r', offset=offset, shape=n_non_zero) + offset += columns.nbytes + values = np.memmap(fname, dtype='float32', mode='r', offset=offset, shape=n_non_zero) + return values, columns, index_pointer + + def csr_to_sparse_vectors( values: List[float], columns: List[int], index_pointer: List[int] ) -> Iterator[SparseVector]: + """Convert a CSR matrix to a list of SparseVectors""" num_rows = len(index_pointer) - 1 for i in range(num_rows): @@ -38,9 +53,12 @@ def csr_to_sparse_vectors( yield SparseVector(indices=row_indices, values=row_values) -def read_csr_matrix(filename: Union[Path, str]) -> Iterator[SparseVector]: +def read_csr_matrix(filename: Union[Path, str], do_mmap=True) -> Iterator[SparseVector]: """Read a CSR matrix in spmat format""" - values, columns, index_pointer = read_sparse_matrix_fields(filename) + if do_mmap: + values, columns, index_pointer = mmap_sparse_matrix_fields(filename) + else: + values, columns, index_pointer = read_sparse_matrix_fields(filename) values = values.tolist() columns = columns.tolist() index_pointer = index_pointer.tolist() From 8e2fd436a7502e6800e7c3fe9ee6edc41d223cd2 Mon Sep 17 00:00:00 2001 From: KShivendu Date: Wed, 17 Apr 2024 16:39:34 +0530 Subject: [PATCH 4/5] fix: Format --- dataset_reader/sparse_reader.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dataset_reader/sparse_reader.py b/dataset_reader/sparse_reader.py index b7e8b1b0..b5e453c8 100644 --- a/dataset_reader/sparse_reader.py +++ b/dataset_reader/sparse_reader.py @@ -26,14 +26,18 @@ def read_sparse_matrix_fields( def mmap_sparse_matrix_fields(fname): """mmap the fields of a CSR matrix without instantiating it""" with open(fname, "rb") as f: - sizes = np.fromfile(f, dtype='int64', count=3) + sizes = np.fromfile(f, dtype="int64", count=3) n_row, n_col, n_non_zero = sizes offset = sizes.nbytes - index_pointer = np.memmap(fname, dtype='int64', mode='r', offset=offset, shape=n_row + 1) + index_pointer = np.memmap( + fname, dtype="int64", mode="r", offset=offset, shape=n_row + 1 + ) offset += index_pointer.nbytes - columns = np.memmap(fname, dtype='int32', mode='r', offset=offset, shape=n_non_zero) + columns = np.memmap(fname, dtype="int32", mode="r", offset=offset, shape=n_non_zero) offset += columns.nbytes - values = np.memmap(fname, dtype='float32', mode='r', offset=offset, shape=n_non_zero) + values = np.memmap( + fname, dtype="float32", mode="r", offset=offset, shape=n_non_zero + ) return values, columns, index_pointer From 63d332c8a27b15266bccd24e47544070a840ebcd Mon Sep 17 00:00:00 2001 From: KShivendu Date: Wed, 17 Apr 2024 16:40:25 +0530 Subject: [PATCH 5/5] fix: Make unused var private --- dataset_reader/sparse_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_reader/sparse_reader.py b/dataset_reader/sparse_reader.py index b5e453c8..94ee4167 100644 --- a/dataset_reader/sparse_reader.py +++ b/dataset_reader/sparse_reader.py @@ -27,7 +27,7 @@ def mmap_sparse_matrix_fields(fname): """mmap the fields of a CSR matrix without instantiating it""" with open(fname, "rb") as f: sizes = np.fromfile(f, dtype="int64", count=3) - n_row, n_col, n_non_zero = sizes + n_row, _n_col, n_non_zero = sizes offset = sizes.nbytes index_pointer = np.memmap( fname, dtype="int64", mode="r", offset=offset, shape=n_row + 1