From 4833abe5538882707ec6ecfe21f8e6a9b91b5e0f Mon Sep 17 00:00:00 2001 From: alon Date: Tue, 20 Jul 2021 19:14:28 +0300 Subject: [PATCH 1/5] Parallel indexing --- examples/searchKnnCloserFirst_test.cpp | 71 ++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 3 deletions(-) diff --git a/examples/searchKnnCloserFirst_test.cpp b/examples/searchKnnCloserFirst_test.cpp index cc1392c8..4e50b66f 100644 --- a/examples/searchKnnCloserFirst_test.cpp +++ b/examples/searchKnnCloserFirst_test.cpp @@ -9,12 +9,69 @@ #include #include +#include namespace { using idx_t = hnswlib::labeltype; + template + inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) { + if (numThreads <= 0) { + numThreads = std::thread::hardware_concurrency(); + } + + if (numThreads == 1) { + for (size_t id = start; id < end; id++) { + fn(id, 0); + } + } else { + std::vector threads; + std::atomic current(start); + + // keep track of exceptions in threads + // https://stackoverflow.com/a/32428427/1713196 + std::exception_ptr lastException = nullptr; + std::mutex lastExceptMutex; + + for (size_t threadId = 0; threadId < numThreads; ++threadId) { + threads.push_back(std::thread([&, threadId] { + while (true) { + size_t id = current.fetch_add(1); + + if ((id >= end)) { + break; + } + + try { + fn(id, threadId); + } catch (...) { + std::unique_lock lastExcepLock(lastExceptMutex); + lastException = std::current_exception(); + /* + * This will work even when current is the largest value that + * size_t can fit, because fetch_add returns the previous value + * before the increment (what will result in overflow + * and produce 0 instead of current + 1). 
+ */ + current = end; + break; + } + } + })); + } + for (auto &thread : threads) { + thread.join(); + } + if (lastException) { + std::rethrow_exception(lastException); + } + } + + + } + void test() { int d = 4; idx_t n = 100; @@ -40,10 +97,18 @@ void test() { hnswlib::AlgorithmInterface* alg_brute = new hnswlib::BruteforceSearch(&space, 2 * n); hnswlib::AlgorithmInterface* alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n); - for (size_t i = 0; i < n; ++i) { - alg_brute->addPoint(data.data() + d * i, i); +// for (size_t i = 0; i < n; ++i) { +// alg_brute->addPoint(data.data() + d * i, i); +// alg_hnsw->addPoint(data.data() + d * i, i); +// } + + ParallelFor(0, n, 4, [&](size_t i, size_t threadId) { alg_hnsw->addPoint(data.data() + d * i, i); - } + }); + + ParallelFor(0, n, 4, [&](size_t i, size_t threadId) { + alg_brute->addPoint(data.data() + d * i, i); + }); // test searchKnnCloserFirst of BruteforceSearch for (size_t j = 0; j < nq; ++j) { From a6af73d98d78659a46a6d3fec4c78af1e94be079 Mon Sep 17 00:00:00 2001 From: alon Date: Wed, 21 Jul 2021 17:54:50 +0300 Subject: [PATCH 2/5] Added python bindings for BF index for recall testing --- hnswlib/bruteforce.h | 3 +- python_bindings/bindings.cpp | 172 +++++++++++++++++++++++++++++++++++ 2 files changed, 173 insertions(+), 2 deletions(-) diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 24260400..7fbdee9a 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -68,8 +68,6 @@ namespace hnswlib { memcpy(data_ + size_per_element_ * idx, datapoint, data_size_); - - }; void removePoint(labeltype cur_external) { @@ -99,6 +97,7 @@ namespace hnswlib { dist_t lastdist = topResults.top().first; for (int i = k; i < cur_element_count; i++) { dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_); + if (dist <= lastdist) { topResults.push(std::pair(dist, *((labeltype *) (data_ + size_per_element_ * i + data_size_)))); diff --git a/python_bindings/bindings.cpp 
b/python_bindings/bindings.cpp index 285b5185..22029019 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -652,6 +652,170 @@ class Index { }; +template +class BFIndex { +public: + BFIndex(const std::string &space_name, const int dim) : + space_name(space_name), dim(dim) { + normalize=false; + if(space_name=="l2") { + space = new hnswlib::L2Space(dim); + } + else if(space_name=="ip") { + space = new hnswlib::InnerProductSpace(dim); + } + else if(space_name=="cosine") { + space = new hnswlib::InnerProductSpace(dim); + normalize=true; + } else { + throw new std::runtime_error("Space name must be one of l2, ip, or cosine."); + } + alg = NULL; + index_inited = false; + } + + static const int ser_version = 1; // serialization version + + std::string space_name; + int dim; + bool index_inited; + bool normalize; + + hnswlib::labeltype cur_l; + hnswlib::BruteforceSearch *alg; + hnswlib::SpaceInterface *space; + + ~BFIndex() { + delete space; + if (alg) + delete alg; + } + + void init_new_index(const size_t maxElements) { + if (alg) { + throw new std::runtime_error("The index is already initiated."); + } + cur_l = 0; + alg = new hnswlib::BruteforceSearch(space, maxElements); + index_inited = true; + } + + void normalize_vector(float *data, float *norm_array){ + float norm=0.0f; + for(int i=0;i items(input); + auto buffer = items.request(); + size_t rows, features; + + if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array"); + if (buffer.ndim == 2) { + rows = buffer.shape[0]; + features = buffer.shape[1]; + } else { + rows = 1; + features = buffer.shape[0]; + } + + if (features != dim) + throw std::runtime_error("wrong dimensionality of the vectors"); + + std::vector ids; + + if (!ids_.is_none()) { + py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_); + auto ids_numpy = items.request(); + if (ids_numpy.ndim == 1 && ids_numpy.shape[0] == rows) { + std::vector 
ids1(ids_numpy.shape[0]); + for (size_t i = 0; i < ids1.size(); i++) { + ids1[i] = items.data()[i]; + } + ids.swap(ids1); + } else if (ids_numpy.ndim == 0 && rows == 1) { + ids.push_back(*items.data()); + } else + throw std::runtime_error("wrong dimensionality of the labels"); + } + { + int start = 0; + py::gil_scoped_release l; + + std::vector norm_array(dim); + for (size_t i = start; i < rows; i++) { + alg->addPoint((void *) items.data(i), (size_t) i); + } + cur_l+=rows; + } + } + + void deletedVector(size_t label) { + alg->removePoint(label); + } + + py::object knnQuery_return_numpy(py::object input, size_t k = 1) { + + py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); + auto buffer = items.request(); + hnswlib::labeltype *data_numpy_l; + dist_t *data_numpy_d; + size_t rows, features; + { + py::gil_scoped_release l; + + if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array"); + if (buffer.ndim == 2) { + rows = buffer.shape[0]; + features = buffer.shape[1]; + } else { + rows = 1; + features = buffer.shape[0]; + } + + data_numpy_l = new hnswlib::labeltype[rows * k]; + data_numpy_d = new dist_t[rows * k]; + + for (size_t row = 0; row < rows; row++) { + std::priority_queue> result = alg->searchKnn( + (void *) items.data(row), k); + for (int i = k - 1; i >= 0; i--) { + auto &result_tuple = result.top(); + data_numpy_d[row * k + i] = result_tuple.first; + data_numpy_l[row * k + i] = result_tuple.second; + result.pop(); + } + } + } + + py::capsule free_when_done_l(data_numpy_l, [](void *f) { + delete[] f; + }); + py::capsule free_when_done_d(data_numpy_d, [](void *f) { + delete[] f; + }); + + + return py::make_tuple( + py::array_t( + {rows, k}, // shape + {k * sizeof(hnswlib::labeltype), + sizeof(hnswlib::labeltype)}, // C-style contiguous strides for double + data_numpy_l, // the data pointer + free_when_done_l), + py::array_t( + {rows, k}, // shape + {k * sizeof(dist_t), sizeof(dist_t)}, // 
C-style contiguous strides for double + data_numpy_d, // the data pointer + free_when_done_d)); + + } + +}; PYBIND11_PLUGIN(hnswlib) { @@ -716,5 +880,13 @@ PYBIND11_PLUGIN(hnswlib) { return ""; }); + py::class_>(m, "BFIndex") + .def(py::init(), py::arg("space"), py::arg("dim")) + .def("init_index", &BFIndex::init_new_index, py::arg("max_elements")) + .def("knn_query", &BFIndex::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1) + .def("add_items", &BFIndex::addItems, py::arg("data"), py::arg("ids") = py::none()) + .def("__repr__", [](const BFIndex &a) { + return ""; + }); return m.ptr(); } From d4c881da19589f58b318f043bbde9a01b397b6b7 Mon Sep 17 00:00:00 2001 From: alon Date: Thu, 22 Jul 2021 10:05:47 +0300 Subject: [PATCH 3/5] Add recall test for hnsw via python bindings --- examples/recall_test.py | 60 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 examples/recall_test.py diff --git a/examples/recall_test.py b/examples/recall_test.py new file mode 100644 index 00000000..feba3477 --- /dev/null +++ b/examples/recall_test.py @@ -0,0 +1,60 @@ +import hnswlib +import numpy as np + +dim = 128 +num_elements = 100000 +k = 10 +nun_queries = 10 + +# Generating sample data +data = np.float32(np.random.random((num_elements, dim))) + +# Declaring index +hnsw_index = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip +bf_index = hnswlib.BFIndex(space='l2', dim=dim) + +# Initing both hnsw and brute force indices +# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded +# during insertion of an element. +# The capacity can be increased by saving/loading the index, see below. +# +# hnsw construction params: +# ef_construction - controls index search speed/build speed tradeoff +# +# M - is tightly connected with internal dimensionality of the data. 
Strongly affects the memory consumption (~M) +# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction + +hnsw_index.init_index(max_elements=num_elements, ef_construction=10, M=6) +bf_index.init_index(max_elements=num_elements) + +# Controlling the recall for hnsw by setting ef: +# higher ef leads to better accuracy, but slower search +hnsw_index.set_ef(10) + +# Set number of threads used during batch search/construction in hnsw +# By default using all available cores +hnsw_index.set_num_threads(1) + +print("Adding batch of %d elements" % (len(data))) +hnsw_index.add_items(data) +bf_index.add_items(data) + +print("Indices built") + +# Generating query data +query_data = np.float32(np.random.random((10, dim))) + +# Query the elements and measure recall: +labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k) +labels_bf, distances_bf = bf_index.knn_query(query_data, k) + +# Measure recall +correct = 0 +for i in range(nun_queries): + for label in labels_hnsw[i]: + for correct_label in labels_bf[i]: + if label == correct_label: + correct += 1 + break + +print("recall is :", float(correct)/(k*nun_queries)) From 079c71e2d33d0ad48d16bee80dac444eb7b3d3c1 Mon Sep 17 00:00:00 2001 From: alon Date: Sun, 25 Jul 2021 16:02:03 +0300 Subject: [PATCH 4/5] Add load and store index to the bindings, update test recall --- examples/searchKnnCloserFirst_test.cpp | 71 +------------------ hnswlib/bruteforce.h | 3 +- python_bindings/__init__.py | 0 python_bindings/bindings.cpp | 33 +++++++-- .../tests/bindings_test_recall.py | 36 ++++++++-- 5 files changed, 64 insertions(+), 79 deletions(-) create mode 100644 python_bindings/__init__.py rename examples/recall_test.py => python_bindings/tests/bindings_test_recall.py (67%) diff --git a/examples/searchKnnCloserFirst_test.cpp b/examples/searchKnnCloserFirst_test.cpp index 4e50b66f..cc1392c8 100644 --- a/examples/searchKnnCloserFirst_test.cpp +++ b/examples/searchKnnCloserFirst_test.cpp @@ -9,69 +9,12 @@ #include 
#include -#include namespace { using idx_t = hnswlib::labeltype; - template - inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) { - if (numThreads <= 0) { - numThreads = std::thread::hardware_concurrency(); - } - - if (numThreads == 1) { - for (size_t id = start; id < end; id++) { - fn(id, 0); - } - } else { - std::vector threads; - std::atomic current(start); - - // keep track of exceptions in threads - // https://stackoverflow.com/a/32428427/1713196 - std::exception_ptr lastException = nullptr; - std::mutex lastExceptMutex; - - for (size_t threadId = 0; threadId < numThreads; ++threadId) { - threads.push_back(std::thread([&, threadId] { - while (true) { - size_t id = current.fetch_add(1); - - if ((id >= end)) { - break; - } - - try { - fn(id, threadId); - } catch (...) { - std::unique_lock lastExcepLock(lastExceptMutex); - lastException = std::current_exception(); - /* - * This will work even when current is the largest value that - * size_t can fit, because fetch_add returns the previous value - * before the increment (what will result in overflow - * and produce 0 instead of current + 1). 
- */ - current = end; - break; - } - } - })); - } - for (auto &thread : threads) { - thread.join(); - } - if (lastException) { - std::rethrow_exception(lastException); - } - } - - - } - void test() { int d = 4; idx_t n = 100; @@ -97,18 +40,10 @@ void test() { hnswlib::AlgorithmInterface* alg_brute = new hnswlib::BruteforceSearch(&space, 2 * n); hnswlib::AlgorithmInterface* alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n); -// for (size_t i = 0; i < n; ++i) { -// alg_brute->addPoint(data.data() + d * i, i); -// alg_hnsw->addPoint(data.data() + d * i, i); -// } - - ParallelFor(0, n, 4, [&](size_t i, size_t threadId) { - alg_hnsw->addPoint(data.data() + d * i, i); - }); - - ParallelFor(0, n, 4, [&](size_t i, size_t threadId) { + for (size_t i = 0; i < n; ++i) { alg_brute->addPoint(data.data() + d * i, i); - }); + alg_hnsw->addPoint(data.data() + d * i, i); + } // test searchKnnCloserFirst of BruteforceSearch for (size_t j = 0; j < nq; ++j) { diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 7fbdee9a..24260400 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -68,6 +68,8 @@ namespace hnswlib { memcpy(data_ + size_per_element_ * idx, datapoint, data_size_); + + }; void removePoint(labeltype cur_external) { @@ -97,7 +99,6 @@ namespace hnswlib { dist_t lastdist = topResults.top().first; for (int i = k; i < cur_element_count; i++) { dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_); - if (dist <= lastdist) { topResults.push(std::pair(dist, *((labeltype *) (data_ + size_per_element_ * i + data_size_)))); diff --git a/python_bindings/__init__.py b/python_bindings/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 22029019..f0761640 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -743,21 +743,39 @@ class BFIndex { throw std::runtime_error("wrong dimensionality of the labels"); } { - int 
start = 0; - py::gil_scoped_release l; - std::vector norm_array(dim); - for (size_t i = start; i < rows; i++) { - alg->addPoint((void *) items.data(i), (size_t) i); + for (size_t row = 0; row < rows; row++) { + size_t id = ids.size() ? ids.at(row) : cur_l + row; + if (!normalize) { + alg->addPoint((void *) items.data(row), (size_t) id); + } else { + float normalized_vector[dim]; + normalize_vector((float *)items.data(row), normalized_vector); + alg->addPoint((void *) normalized_vector, (size_t) id); + } } cur_l+=rows; } } - void deletedVector(size_t label) { + void deleteVector(size_t label) { alg->removePoint(label); } + void saveIndex(const std::string &path_to_index) { + alg->saveIndex(path_to_index); + } + + void loadIndex(const std::string &path_to_index, size_t max_elements) { + if (alg) { + std::cerr<<"Warning: Calling load_index for an already inited index. Old index is being deallocated."; + delete alg; + } + alg = new hnswlib::BruteforceSearch(space, path_to_index); + cur_l = alg->cur_element_count; + index_inited = true; + } + py::object knnQuery_return_numpy(py::object input, size_t k = 1) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); @@ -885,6 +903,9 @@ PYBIND11_PLUGIN(hnswlib) { .def("init_index", &BFIndex::init_new_index, py::arg("max_elements")) .def("knn_query", &BFIndex::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1) .def("add_items", &BFIndex::addItems, py::arg("data"), py::arg("ids") = py::none()) + .def("delete_vector", &BFIndex::deleteVector, py::arg("label")) + .def("save_index", &BFIndex::saveIndex, py::arg("path_to_index")) + .def("load_index", &BFIndex::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0) .def("__repr__", [](const BFIndex &a) { return ""; }); diff --git a/examples/recall_test.py b/python_bindings/tests/bindings_test_recall.py similarity index 67% rename from examples/recall_test.py rename to python_bindings/tests/bindings_test_recall.py index feba3477..3742fcdd 100644 
--- a/examples/recall_test.py +++ b/python_bindings/tests/bindings_test_recall.py @@ -1,7 +1,7 @@ import hnswlib import numpy as np -dim = 128 +dim = 32 num_elements = 100000 k = 10 nun_queries = 10 @@ -24,12 +24,12 @@ # M - is tightly connected with internal dimensionality of the data. Strongly affects the memory consumption (~M) # Higher M leads to higher accuracy/run_time at fixed ef/efConstruction -hnsw_index.init_index(max_elements=num_elements, ef_construction=10, M=6) +hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16) bf_index.init_index(max_elements=num_elements) # Controlling the recall for hnsw by setting ef: # higher ef leads to better accuracy, but slower search -hnsw_index.set_ef(10) +hnsw_index.set_ef(200) # Set number of threads used during batch search/construction in hnsw # By default using all available cores @@ -42,7 +42,7 @@ print("Indices built") # Generating query data -query_data = np.float32(np.random.random((10, dim))) +query_data = np.float32(np.random.random((nun_queries, dim))) # Query the elements and measure recall: labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k) @@ -58,3 +58,31 @@ break print("recall is :", float(correct)/(k*nun_queries)) + +# test serializing the brute force index +index_path = 'bf_index.bin' +print("Saving index to '%s'" % index_path) +bf_index.save_index(index_path) +del bf_index + +# Re-initiating, loading the index +bf_index = hnswlib.BFIndex(space='l2', dim=dim) + +print("\nLoading index from '%s'\n" % index_path) +bf_index.load_index(index_path) + +# Query the brute force index again to verify that we get the same results +labels_bf, distances_bf = bf_index.knn_query(query_data, k) + +# Measure recall +correct = 0 +for i in range(nun_queries): + for label in labels_hnsw[i]: + for correct_label in labels_bf[i]: + if label == correct_label: + correct += 1 + break + +print("recall after reloading is :", float(correct)/(k*nun_queries)) + + From 
9c2dc7cea7dcab1cdfe3312508787ebb249fc5ac Mon Sep 17 00:00:00 2001 From: alon Date: Sun, 25 Jul 2021 16:44:05 +0300 Subject: [PATCH 5/5] Adding documentation --- TESTING_RECALL.md | 91 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 TESTING_RECALL.md diff --git a/TESTING_RECALL.md b/TESTING_RECALL.md new file mode 100644 index 00000000..23a6f654 --- /dev/null +++ b/TESTING_RECALL.md @@ -0,0 +1,91 @@ +# Testing recall + +Selecting HNSW parameters for a specific use case highly impacts the search quality. One way to test the quality of the constructed index is to compare the HNSW search results to the actual results (i.e., the actual `k` nearest neighbors). +For that purpose, the API enables creating a simple "brute-force" index in which vectors are stored as is, and searching for the `k` nearest neighbors to a query vector requires going over the entire index. +Comparing between HNSW and brute-force results may help with finding the desired HNSW parameters for achieving a satisfying recall, based on the index size and data dimension. + +### Brute force index API +`hnswlib.BFIndex(space, dim)` creates a non-initialized index in space `space` with integer dimension `dim`. + +`hnswlib.BFIndex` methods: + +`init_index(max_elements)` initializes the index with no elements. + +`max_elements` defines the maximum number of elements that can be stored in the structure. + +`add_items(data, ids)` inserts the data (numpy array of vectors, shape:`N*dim`) into the structure. +`ids` is an optional N-size numpy array of integer labels for all elements in data. + +`delete_vector(label)` deletes the element associated with the given `label` so it will be omitted from search results. + +`knn_query(data, k = 1)` makes a batch query for the `k` closest elements for each element of the +`data` (shape:`N*dim`). Returns a tuple of two numpy arrays, labels and distances, each of shape `N*k`. 
+ +`load_index(path_to_index, max_elements = 0)` loads the index from persistence to the uninitialized index. + +`save_index(path_to_index)` saves the index to persistence. + +### Measuring recall example + +``` +import hnswlib +import numpy as np + +dim = 32 +num_elements = 100000 +k = 10 +nun_queries = 10 + +# Generating sample data +data = np.float32(np.random.random((num_elements, dim))) + +# Declaring index +hnsw_index = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip +bf_index = hnswlib.BFIndex(space='l2', dim=dim) + +# Initing both hnsw and brute force indices +# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded +# during insertion of an element. +# The capacity can be increased by saving/loading the index, see below. +# +# hnsw construction params: +# ef_construction - controls index search speed/build speed tradeoff +# +# M - is tightly connected with internal dimensionality of the data. Strongly affects the memory consumption (~M) +# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction + +hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16) +bf_index.init_index(max_elements=num_elements) + +# Controlling the recall for hnsw by setting ef: +# higher ef leads to better accuracy, but slower search +hnsw_index.set_ef(200) + +# Set number of threads used during batch search/construction in hnsw +# By default using all available cores +hnsw_index.set_num_threads(1) + +print("Adding batch of %d elements" % (len(data))) +hnsw_index.add_items(data) +bf_index.add_items(data) + +print("Indices built") + +# Generating query data +query_data = np.float32(np.random.random((nun_queries, dim))) + +# Query the elements and measure recall: +labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k) +labels_bf, distances_bf = bf_index.knn_query(query_data, k) + +# Measure recall +correct = 0 +for i in range(nun_queries): + for label in labels_hnsw[i]: + 
for correct_label in labels_bf[i]: + if label == correct_label: + correct += 1 + break + +print("recall is :", float(correct)/(k*nun_queries)) +```