From 225b519e9054cddff7f6f9ce1bca08bb225693d8 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Sat, 14 Jan 2023 19:09:33 +0400 Subject: [PATCH 1/2] Add warning that python filter works slow in multi-threaded mode --- python_bindings/bindings.cpp | 4 ++++ tests/python/bindings_test_filter.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 3196a228..3f228832 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -611,6 +611,10 @@ class Index { if (num_threads <= 0) num_threads = num_threads_default; + if ((filter != nullptr) && (num_threads != 1)) { + std::cout << "Warning: search with python filter works slow in multi-threaded mode. For best performance set num_threads=1\n"; + } + { py::gil_scoped_release l; get_input_array_shapes(buffer, &rows, &features); diff --git a/tests/python/bindings_test_filter.py b/tests/python/bindings_test_filter.py index a798e02f..ecb79ab9 100644 --- a/tests/python/bindings_test_filter.py +++ b/tests/python/bindings_test_filter.py @@ -47,7 +47,8 @@ def testRandomSelf(self): print("Querying only even elements") # Query the even elements for themselves and measure recall: filter_function = lambda id: id%2 == 0 - labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function) + # Search with python filter works slow in multi-threaded mode, therefore we set num_threads=1 + labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function, num_threads=1) self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), .5, 3) # Verify that there are only even elements: self.assertTrue(np.max(np.mod(labels, 2)) == 0) From 32f4b02def881a48565a9d51a4e7332a1e24b778 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Sun, 15 Jan 2023 12:07:08 +0400 Subject: [PATCH 2/2] Add comments with warnings that filter works slow in python in multithreaded mode. Add example files to CI test. 
--- .github/workflows/build.yml | 4 +++- README.md | 2 +- examples/EXAMPLES.md | 3 ++- examples/example_filter.py | 3 ++- python_bindings/bindings.cpp | 5 +---- tests/python/bindings_test_filter.py | 4 ++-- 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d45b8b33..f2662c15 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,7 +20,9 @@ jobs: - name: Test timeout-minutes: 15 - run: python -m unittest discover -v --start-directory tests/python --pattern "bindings_test*.py" + run: | + python -m unittest discover -v --start-directory examples --pattern "example*.py" + python -m unittest discover -v --start-directory tests/python --pattern "bindings_test*.py" test_cpp: runs-on: ${{matrix.os}} diff --git a/README.md b/README.md index 04d84d66..2b027216 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib. * `knn_query(data, k = 1, num_threads = -1, filter = None)` make a batch query for `k` closest elements for each element of the * `data` (shape:`N*dim`). Returns a numpy array of (shape:`N*k`). * `num_threads` sets the number of cpu threads to use (-1 means use default). - * `filter` filters elements by its labels, returns elements with allowed ids + * `filter` filters elements by their labels, returns elements with allowed ids. Note that search with a filter is slow in Python in multithreaded mode. It is recommended to set `num_threads=1`. * Thread-safe with other `knn_query` calls, but not with `add_items`. * `load_index(path_to_index, max_elements = 0, allow_replace_deleted = False)` loads the index from persistence to the uninitialized index. 
diff --git a/examples/EXAMPLES.md b/examples/EXAMPLES.md index 71f69ff4..a92f3626 100644 --- a/examples/EXAMPLES.md +++ b/examples/EXAMPLES.md @@ -147,7 +147,8 @@ print("Querying only even elements") # Define filter function that allows only even ids filter_function = lambda idx: idx%2 == 0 # Query the elements for themselves and search only for even elements: -labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function) +# Warning: search with a Python filter is slow in multithreaded mode, therefore we set num_threads=1 +labels, distances = hnsw_index.knn_query(data, k=1, num_threads=1, filter=filter_function) # labels contain only elements with even id ``` diff --git a/examples/example_filter.py b/examples/example_filter.py index 10a059a8..add22a3d 100644 --- a/examples/example_filter.py +++ b/examples/example_filter.py @@ -41,5 +41,6 @@ # Define filter function that allows only even ids filter_function = lambda idx: idx%2 == 0 # Query the elements for themselves and search only for even elements: -labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function) +# Warning: search with a filter is slow in Python in multithreaded mode, therefore we set num_threads=1 +labels, distances = hnsw_index.knn_query(data, k=1, num_threads=1, filter=filter_function) # labels contain only elements with even id diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 3f228832..5153bb58 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -611,10 +611,6 @@ class Index { if (num_threads <= 0) num_threads = num_threads_default; - if ((filter != nullptr) && (num_threads != 1)) { - std::cout << "Warning: search with python filter works slow in multi-threaded mode. 
For best performance set num_threads=1\n"; - } - { py::gil_scoped_release l; get_input_array_shapes(buffer, &rows, &features); @@ -627,6 +623,7 @@ class Index { data_numpy_l = new hnswlib::labeltype[rows * k]; data_numpy_d = new dist_t[rows * k]; + // Warning: search with a filter is slow in Python in multithreaded mode. For best performance set num_threads=1 CustomFilterFunctor idFilter(filter); CustomFilterFunctor* p_idFilter = filter ? &idFilter : nullptr; diff --git a/tests/python/bindings_test_filter.py b/tests/python/bindings_test_filter.py index ecb79ab9..480c8dcd 100644 --- a/tests/python/bindings_test_filter.py +++ b/tests/python/bindings_test_filter.py @@ -47,8 +47,8 @@ def testRandomSelf(self): print("Querying only even elements") # Query the even elements for themselves and measure recall: filter_function = lambda id: id%2 == 0 - # Search with python filter works slow in multi-threaded mode, therefore we set num_threads=1 - labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function, num_threads=1) + # Warning: search with a filter is slow in Python in multithreaded mode, therefore we set num_threads=1 + labels, distances = hnsw_index.knn_query(data, k=1, num_threads=1, filter=filter_function) self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), .5, 3) # Verify that there are only even elements: self.assertTrue(np.max(np.mod(labels, 2)) == 0)