fix: remove scipy, read csr matrix manually (#117)

joein · web-flow · commit 36bcfaaaf667 · 2024-04-09T15:17:20.000+02:00
* fix: remove scipy, read csr matrix manually
diff --git a/dataset_reader/base_reader.py b/dataset_reader/base_reader.py
@@ -1,13 +1,11 @@
 from dataclasses import dataclass
 from typing import Iterator, List, Optional
 
-import numpy as np
-
 
 @dataclass
 class SparseVector:
-    indices: np.array
-    values: np.array
+    indices: List[int]
+    values: List[float]
 
 
 @dataclass
diff --git a/dataset_reader/sparse_reader.py b/dataset_reader/sparse_reader.py
@@ -1,48 +1,62 @@
 import os
-from typing import Iterator
+from pathlib import Path
+from typing import Iterator, List, Tuple, Union
 
 import numpy as np
-from scipy.sparse import csr_matrix
 
 from dataset_reader.base_reader import BaseReader, Query, Record, SparseVector
 
-# credit: code extracted from neuIPS 2023 benchmarks
 
+def read_sparse_matrix_fields(
+    filename: Union[Path, str]
+) -> Tuple[np.array, np.array, np.array]:
+    """Read the fields of a CSR matrix without instantiating it"""
 
-def read_sparse_matrix_fields(fname):
-    """read the fields of a CSR matrix without instantiating it"""
-    with open(fname, "rb") as f:
+    with open(filename, "rb") as f:
         sizes = np.fromfile(f, dtype="int64", count=3)
-        nrow, ncol, nnz = sizes
-        indptr = np.fromfile(f, dtype="int64", count=nrow + 1)
-        assert nnz == indptr[-1]
-        indices = np.fromfile(f, dtype="int32", count=nnz)
-        assert np.all(indices >= 0) and np.all(indices < ncol)
-        data = np.fromfile(f, dtype="float32", count=nnz)
-        return data, indices, indptr, ncol
-
-
-def read_sparse_matrix(fname) -> Iterator[SparseVector]:
-    """read a CSR matrix in spmat format"""
-    data, indices, indptr, ncol = read_sparse_matrix_fields(fname)
-    # Need scipy csr_matrix to parse spmat format and easily take out rows
-    csr_mat = csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, ncol))
-    num_vectors = csr_mat.shape[0]
-
-    for i in range(num_vectors):
-        indices = csr_mat[i].indices.tolist()
-        values = csr_mat[i].data.tolist()
-        yield SparseVector(indices=indices, values=values)
-
-
-def knn_result_read(fname):
-    n, d = map(int, np.fromfile(fname, dtype="uint32", count=2))
-    assert os.stat(fname).st_size == 8 + n * d * (4 + 4)
-    f = open(fname, "rb")
-    f.seek(4 + 4)
-    ids = np.fromfile(f, dtype="int32", count=n * d).reshape(n, d)
-    scores = np.fromfile(f, dtype="float32", count=n * d).reshape(n, d)
-    f.close()
+        n_row, n_col, n_non_zero = sizes
+        index_pointer = np.fromfile(f, dtype="int64", count=n_row + 1)
+        assert n_non_zero == index_pointer[-1]
+        columns = np.fromfile(f, dtype="int32", count=n_non_zero)
+        assert np.all(columns >= 0) and np.all(columns < n_col)
+        values = np.fromfile(f, dtype="float32", count=n_non_zero)
+        return values, columns, index_pointer
+
+
+def csr_to_sparse_vectors(
+    values: List[float], columns: List[int], index_pointer: List[int]
+) -> Iterator[SparseVector]:
+    """Split flat CSR arrays into one SparseVector per matrix row
+
+    Row i owns positions index_pointer[i] up to, not including,
+    index_pointer[i + 1] of both values and columns.
+    """
+    for start, end in zip(index_pointer, index_pointer[1:]):
+        span = range(start, end)
+        yield SparseVector(
+            indices=[columns[j] for j in span], values=[values[j] for j in span]
+        )
+
+
+def read_csr_matrix(filename: Union[Path, str]) -> Iterator[SparseVector]:
+    """Stream the rows of an spmat-format CSR file as sparse vectors"""
+    # Convert the numpy arrays to plain Python lists once, up front, so
+    # the per-row splitting below works on native ints/floats
+    fields = read_sparse_matrix_fields(filename)
+    values, columns, index_pointer = (field.tolist() for field in fields)
+
+    yield from csr_to_sparse_vectors(values, columns, index_pointer)
+
+
+def knn_result_read(
+    filename: Union[Path, str]
+) -> Tuple[List[List[int]], List[List[float]]]:
+    """Read an (n, d) block of int32 ids, then one of float32 scores"""
+    with open(filename, "rb") as f:
+        n, d = map(int, np.fromfile(f, dtype="uint32", count=2))
+        assert os.stat(filename).st_size == 8 + n * d * (4 + 4)
+        ids = np.fromfile(f, dtype="int32", count=n * d).reshape(n, d).tolist()
+        scores = np.fromfile(f, dtype="float32", count=n * d).reshape(n, d).tolist()
     return ids, scores
 
 
@@ -53,7 +67,7 @@ def __init__(self, path, normalize=False):
 
     def read_queries(self) -> Iterator[Query]:
         queries_path = self.path / "queries.csr"
-        X = read_sparse_matrix(queries_path)
+        X = read_csr_matrix(queries_path)
 
         gt_path = self.path / "results.gt"
         gt_indices, _ = knn_result_read(gt_path)
@@ -63,12 +77,24 @@ def read_queries(self) -> Iterator[Query]:
                 vector=None,
                 sparse_vector=sparse_vector,
                 meta_conditions=None,
-                expected_result=gt_indices[i].tolist(),
+                expected_result=gt_indices[i],
             )
 
     def read_data(self) -> Iterator[Record]:
         data_path = self.path / "data.csr"
-        X = read_sparse_matrix(data_path)
+        X = read_csr_matrix(data_path)
 
         for i, sparse_vector in enumerate(X):
             yield Record(id=i, vector=None, sparse_vector=sparse_vector, metadata=None)
+
+
+if __name__ == "__main__":
+    # Smoke test: a 4-row CSR matrix split back into per-row sparse vectors
+    vals, cols = [1, 3, 2, 3, 6, 4, 5], [0, 2, 2, 1, 3, 0, 2]
+    pointers = [0, 2, 3, 5, 7]
+    vecs = list(csr_to_sparse_vectors(vals, cols, pointers))
+
+    assert vecs[0] == SparseVector(indices=[0, 2], values=[1, 3])
+    assert vecs[1] == SparseVector(indices=[2], values=[2])
+    assert vecs[2] == SparseVector(indices=[1, 3], values=[3, 6])
+    assert vecs[3] == SparseVector(indices=[0, 2], values=[4, 5])
diff --git a/pyproject.toml b/pyproject.toml
@@ -2,7 +2,7 @@
 name = "vector-db-benchmark"
 version = "0.1.0"
 description = ""
-authors = ["Kacper Łukawski <kacper.lukawski@qdrant.com>"]
+authors = ["Qdrant Team <info@qdrant.tech>"]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.12"
@@ -20,8 +20,6 @@ opensearch-py = "^2.3.2"
 tqdm = "^4.66.1"
 psycopg = {extras = ["binary"], version = "^3.1.17"}
 pgvector = "^0.2.4"
-scipy = "^1.12.0"
-
 
 [tool.poetry.dev-dependencies]
 pre-commit = "^2.20.0"