Skip to content

Commit 79d5d74

Browse files
authored
Merge pull request #332 from RedisAI/add_bindings_to_bf_index
Add bindings to brute force index
2 parents 342257e + 9c2dc7c commit 79d5d74

File tree

4 files changed

+372
-0
lines changed

4 files changed

+372
-0
lines changed

TESTING_RECALL.md

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# Testing recall
2+
3+
Selecting HNSW parameters for a specific use case highly impacts the search quality. One way to test the quality of the constructed index is to compare the HNSW search results to the actual results (i.e., the actual `k` nearest neighbors).
4+
For that purpose, the API enables creating a simple "brute-force" index in which vectors are stored as-is, and searching for the `k` nearest neighbors of a query vector requires going over the entire index.
5+
Comparing between HNSW and brute-force results may help with finding the desired HNSW parameters for achieving a satisfying recall, based on the index size and data dimension.
6+
7+
### Brute force index API
8+
`hnswlib.BFIndex(space, dim)` creates a non-initialized index in space `space` with integer dimension `dim`.
9+
10+
`hnswlib.BFIndex` methods:
11+
12+
`init_index(max_elements)` initializes the index with no elements.
13+
14+
`max_elements` defines the maximum number of elements that can be stored in the structure.
15+
16+
`add_items(data, ids)` inserts the data (numpy array of vectors, shape:`N*dim`) into the structure.
17+
`ids` is an optional N-size numpy array of integer labels, one for each element in `data`.
18+
19+
`delete_vector(label)` deletes the element associated with the given `label` so it will be omitted from search results.
20+
21+
`knn_query(data, k = 1)` makes a batch query for the `k` closest elements for each element of the
22+
`data` (shape:`N*dim`). Returns a tuple of two numpy arrays, the labels and the distances, each of shape `N*k`.
23+
24+
`load_index(path_to_index, max_elements = 0)` loads the index from persistence to the uninitialized index.
25+
26+
`save_index(path_to_index)` saves the index to persistence.
27+
28+
### Measuring recall example
29+
30+
```
31+
import hnswlib
import numpy as np


def compute_recall(found_labels, true_labels):
    """Return the fraction of labels in `found_labels` that also appear in the
    matching row of `true_labels`.

    Both arguments are row-aligned 2D collections of labels (one row per query).
    Each row of `true_labels` is turned into a set so that every membership test
    is O(1) instead of a nested scan over the row.
    """
    hits = 0
    total = 0
    for found_row, true_row in zip(found_labels, true_labels):
        true_set = set(int(label) for label in true_row)
        hits += sum(1 for label in found_row if int(label) in true_set)
        total += len(found_row)
    return float(hits) / total if total else 0.0


dim = 32
num_elements = 100000
k = 10
num_queries = 10

# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))

# Declaring index
hnsw_index = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
bf_index = hnswlib.BFIndex(space='l2', dim=dim)

# Initing both hnsw and brute force indices
# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
# during insertion of an element.
#
# hnsw construction params:
# ef_construction - controls index search speed/build speed tradeoff
#
# M - is tightly connected with internal dimensionality of the data. Strongly affects the memory consumption (~M)
# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction

hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16)
bf_index.init_index(max_elements=num_elements)

# Controlling the recall for hnsw by setting ef:
# higher ef leads to better accuracy, but slower search
hnsw_index.set_ef(200)

# Set number of threads used during batch search/construction in hnsw
# By default using all available cores
hnsw_index.set_num_threads(1)

print("Adding batch of %d elements" % (len(data)))
hnsw_index.add_items(data)
bf_index.add_items(data)

print("Indices built")

# Generating query data
query_data = np.float32(np.random.random((num_queries, dim)))

# Query the elements and measure recall:
labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k)
labels_bf, distances_bf = bf_index.knn_query(query_data, k)

# Measure recall: the brute force results are the ground truth
print("recall is :", compute_recall(labels_hnsw, labels_bf))
91+
```

python_bindings/__init__.py

Whitespace-only changes.

python_bindings/bindings.cpp

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,188 @@ class Index {
652652

653653
};
654654

655+
template<typename dist_t, typename data_t=float>
656+
class BFIndex {
657+
public:
658+
BFIndex(const std::string &space_name, const int dim) :
659+
space_name(space_name), dim(dim) {
660+
normalize=false;
661+
if(space_name=="l2") {
662+
space = new hnswlib::L2Space(dim);
663+
}
664+
else if(space_name=="ip") {
665+
space = new hnswlib::InnerProductSpace(dim);
666+
}
667+
else if(space_name=="cosine") {
668+
space = new hnswlib::InnerProductSpace(dim);
669+
normalize=true;
670+
} else {
671+
throw new std::runtime_error("Space name must be one of l2, ip, or cosine.");
672+
}
673+
alg = NULL;
674+
index_inited = false;
675+
}
676+
677+
static const int ser_version = 1; // serialization version
678+
679+
std::string space_name;
680+
int dim;
681+
bool index_inited;
682+
bool normalize;
683+
684+
hnswlib::labeltype cur_l;
685+
hnswlib::BruteforceSearch<dist_t> *alg;
686+
hnswlib::SpaceInterface<float> *space;
687+
688+
~BFIndex() {
689+
delete space;
690+
if (alg)
691+
delete alg;
692+
}
693+
694+
void init_new_index(const size_t maxElements) {
695+
if (alg) {
696+
throw new std::runtime_error("The index is already initiated.");
697+
}
698+
cur_l = 0;
699+
alg = new hnswlib::BruteforceSearch<dist_t>(space, maxElements);
700+
index_inited = true;
701+
}
702+
703+
void normalize_vector(float *data, float *norm_array){
704+
float norm=0.0f;
705+
for(int i=0;i<dim;i++)
706+
norm+=data[i]*data[i];
707+
norm= 1.0f / (sqrtf(norm) + 1e-30f);
708+
for(int i=0;i<dim;i++)
709+
norm_array[i]=data[i]*norm;
710+
}
711+
712+
void addItems(py::object input, py::object ids_ = py::none()) {
713+
py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input);
714+
auto buffer = items.request();
715+
size_t rows, features;
716+
717+
if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array");
718+
if (buffer.ndim == 2) {
719+
rows = buffer.shape[0];
720+
features = buffer.shape[1];
721+
} else {
722+
rows = 1;
723+
features = buffer.shape[0];
724+
}
725+
726+
if (features != dim)
727+
throw std::runtime_error("wrong dimensionality of the vectors");
728+
729+
std::vector<size_t> ids;
730+
731+
if (!ids_.is_none()) {
732+
py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_);
733+
auto ids_numpy = items.request();
734+
if (ids_numpy.ndim == 1 && ids_numpy.shape[0] == rows) {
735+
std::vector<size_t> ids1(ids_numpy.shape[0]);
736+
for (size_t i = 0; i < ids1.size(); i++) {
737+
ids1[i] = items.data()[i];
738+
}
739+
ids.swap(ids1);
740+
} else if (ids_numpy.ndim == 0 && rows == 1) {
741+
ids.push_back(*items.data());
742+
} else
743+
throw std::runtime_error("wrong dimensionality of the labels");
744+
}
745+
{
746+
747+
for (size_t row = 0; row < rows; row++) {
748+
size_t id = ids.size() ? ids.at(row) : cur_l + row;
749+
if (!normalize) {
750+
alg->addPoint((void *) items.data(row), (size_t) id);
751+
} else {
752+
float normalized_vector[dim];
753+
normalize_vector((float *)items.data(row), normalized_vector);
754+
alg->addPoint((void *) normalized_vector, (size_t) id);
755+
}
756+
}
757+
cur_l+=rows;
758+
}
759+
}
760+
761+
void deleteVector(size_t label) {
762+
alg->removePoint(label);
763+
}
764+
765+
void saveIndex(const std::string &path_to_index) {
766+
alg->saveIndex(path_to_index);
767+
}
768+
769+
void loadIndex(const std::string &path_to_index, size_t max_elements) {
770+
if (alg) {
771+
std::cerr<<"Warning: Calling load_index for an already inited index. Old index is being deallocated.";
772+
delete alg;
773+
}
774+
alg = new hnswlib::BruteforceSearch<dist_t>(space, path_to_index);
775+
cur_l = alg->cur_element_count;
776+
index_inited = true;
777+
}
778+
779+
py::object knnQuery_return_numpy(py::object input, size_t k = 1) {
780+
781+
py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input);
782+
auto buffer = items.request();
783+
hnswlib::labeltype *data_numpy_l;
784+
dist_t *data_numpy_d;
785+
size_t rows, features;
786+
{
787+
py::gil_scoped_release l;
788+
789+
if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array");
790+
if (buffer.ndim == 2) {
791+
rows = buffer.shape[0];
792+
features = buffer.shape[1];
793+
} else {
794+
rows = 1;
795+
features = buffer.shape[0];
796+
}
797+
798+
data_numpy_l = new hnswlib::labeltype[rows * k];
799+
data_numpy_d = new dist_t[rows * k];
800+
801+
for (size_t row = 0; row < rows; row++) {
802+
std::priority_queue<std::pair<dist_t, hnswlib::labeltype >> result = alg->searchKnn(
803+
(void *) items.data(row), k);
804+
for (int i = k - 1; i >= 0; i--) {
805+
auto &result_tuple = result.top();
806+
data_numpy_d[row * k + i] = result_tuple.first;
807+
data_numpy_l[row * k + i] = result_tuple.second;
808+
result.pop();
809+
}
810+
}
811+
}
812+
813+
py::capsule free_when_done_l(data_numpy_l, [](void *f) {
814+
delete[] f;
815+
});
816+
py::capsule free_when_done_d(data_numpy_d, [](void *f) {
817+
delete[] f;
818+
});
819+
820+
821+
return py::make_tuple(
822+
py::array_t<hnswlib::labeltype>(
823+
{rows, k}, // shape
824+
{k * sizeof(hnswlib::labeltype),
825+
sizeof(hnswlib::labeltype)}, // C-style contiguous strides for double
826+
data_numpy_l, // the data pointer
827+
free_when_done_l),
828+
py::array_t<dist_t>(
829+
{rows, k}, // shape
830+
{k * sizeof(dist_t), sizeof(dist_t)}, // C-style contiguous strides for double
831+
data_numpy_d, // the data pointer
832+
free_when_done_d));
833+
834+
}
835+
836+
};
655837

656838

657839
PYBIND11_PLUGIN(hnswlib) {
@@ -716,5 +898,16 @@ PYBIND11_PLUGIN(hnswlib) {
716898
return "<hnswlib.Index(space='" + a.space_name + "', dim="+std::to_string(a.dim)+")>";
717899
});
718900

901+
py::class_<BFIndex<float>>(m, "BFIndex")
902+
.def(py::init<const std::string &, const int>(), py::arg("space"), py::arg("dim"))
903+
.def("init_index", &BFIndex<float>::init_new_index, py::arg("max_elements"))
904+
.def("knn_query", &BFIndex<float>::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1)
905+
.def("add_items", &BFIndex<float>::addItems, py::arg("data"), py::arg("ids") = py::none())
906+
.def("delete_vector", &BFIndex<float>::deleteVector, py::arg("label"))
907+
.def("save_index", &BFIndex<float>::saveIndex, py::arg("path_to_index"))
908+
.def("load_index", &BFIndex<float>::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0)
909+
.def("__repr__", [](const BFIndex<float> &a) {
910+
return "<hnswlib.BFIndex(space='" + a.space_name + "', dim="+std::to_string(a.dim)+")>";
911+
});
719912
return m.ptr();
720913
}
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import hnswlib
import numpy as np


def compute_recall(found_labels, true_labels):
    """Return the fraction of labels in `found_labels` that also appear in the
    matching row of `true_labels`.

    Both arguments are row-aligned 2D collections of labels (one row per query).
    Each row of `true_labels` is turned into a set so that every membership test
    is O(1) instead of a nested scan over the row.
    """
    hits = 0
    total = 0
    for found_row, true_row in zip(found_labels, true_labels):
        true_set = set(int(label) for label in true_row)
        hits += sum(1 for label in found_row if int(label) in true_set)
        total += len(found_row)
    return float(hits) / total if total else 0.0


dim = 32
num_elements = 100000
k = 10
num_queries = 10

# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))

# Declaring index
hnsw_index = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
bf_index = hnswlib.BFIndex(space='l2', dim=dim)

# Initing both hnsw and brute force indices
# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
# during insertion of an element.
# The capacity can be increased by saving/loading the index, see below.
#
# hnsw construction params:
# ef_construction - controls index search speed/build speed tradeoff
#
# M - is tightly connected with internal dimensionality of the data. Strongly affects the memory consumption (~M)
# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction

hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16)
bf_index.init_index(max_elements=num_elements)

# Controlling the recall for hnsw by setting ef:
# higher ef leads to better accuracy, but slower search
hnsw_index.set_ef(200)

# Set number of threads used during batch search/construction in hnsw
# By default using all available cores
hnsw_index.set_num_threads(1)

print("Adding batch of %d elements" % (len(data)))
hnsw_index.add_items(data)
bf_index.add_items(data)

print("Indices built")

# Generating query data
query_data = np.float32(np.random.random((num_queries, dim)))

# Query the elements and measure recall:
labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k)
labels_bf, distances_bf = bf_index.knn_query(query_data, k)

# Measure recall: the brute force results are the ground truth
print("recall is :", compute_recall(labels_hnsw, labels_bf))

# test serializing the brute force index
index_path = 'bf_index.bin'
print("Saving index to '%s'" % index_path)
bf_index.save_index(index_path)
del bf_index

# Re-initiating, loading the index
bf_index = hnswlib.BFIndex(space='l2', dim=dim)

print("\nLoading index from '%s'\n" % index_path)
bf_index.load_index(index_path)

# Query the brute force index again to verify that we get the same results
labels_bf, distances_bf = bf_index.knn_query(query_data, k)

# Measure recall against the reloaded brute force index
print("recall after reloading is :", compute_recall(labels_hnsw, labels_bf))
87+
88+

0 commit comments

Comments
 (0)