diff --git a/examples/pyw_hnswlib.py b/examples/pyw_hnswlib.py index dc300173..2d1e70bc 100644 --- a/examples/pyw_hnswlib.py +++ b/examples/pyw_hnswlib.py @@ -11,8 +11,8 @@ def __init__(self, space, dim): self.dict_labels = {} self.cur_ind = 0 - def init_index(self, max_elements, ef_construction = 200, M = 16): - self.index.init_index(max_elements = max_elements, ef_construction = ef_construction, M = M) + def init_index(self, max_elements, ef_construction=200, M=16): + self.index.init_index(max_elements=max_elements, ef_construction=ef_construction, M=M) def add_items(self, data, ids=None): if ids is not None: diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index d718bc3b..f9b3092f 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -18,15 +18,15 @@ def testRandomSelf(self): # Declaring index p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip - # Initing index + # Initiating index # max_elements - the maximum number of elements, should be known beforehand # (probably will be made optional in the future) # # ef_construction - controls index search speed/build speed tradeoff # M - is tightly connected with internal dimensionality of the data - # stronlgy affects the memory consumption + # strongly affects the memory consumption - p.init_index(max_elements = num_elements, ef_construction = 100, M = 16) + p.init_index(max_elements=num_elements, ef_construction=100, M=16) # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search @@ -51,7 +51,7 @@ def testRandomSelf(self): p.save_index(index_path) del p - # Reiniting, loading the index + # Re-initiating, loading the index p = hnswlib.Index(space='l2', dim=dim) # you can change the sa print("\nLoading index from '%s'\n" % index_path) diff --git a/python_bindings/tests/bindings_test_getdata.py b/python_bindings/tests/bindings_test_getdata.py index 8655d7f8..2985c1dd 
100644 --- a/python_bindings/tests/bindings_test_getdata.py +++ b/python_bindings/tests/bindings_test_getdata.py @@ -19,13 +19,13 @@ def testGettingItems(self): # Declaring index p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip - # Initing index + # Initiating index # max_elements - the maximum number of elements, should be known beforehand # (probably will be made optional in the future) # # ef_construction - controls index search speed/build speed tradeoff # M - is tightly connected with internal dimensionality of the data - # stronlgy affects the memory consumption + # strongly affects the memory consumption p.init_index(max_elements=num_elements, ef_construction=100, M=16) diff --git a/python_bindings/tests/bindings_test_labels.py b/python_bindings/tests/bindings_test_labels.py index 5c13e198..668d7694 100644 --- a/python_bindings/tests/bindings_test_labels.py +++ b/python_bindings/tests/bindings_test_labels.py @@ -21,13 +21,13 @@ def testRandomSelf(self): # Declaring index p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip - # Initing index + # Initiating index # max_elements - the maximum number of elements, should be known beforehand # (probably will be made optional in the future) # # ef_construction - controls index search speed/build speed tradeoff # M - is tightly connected with internal dimensionality of the data - # stronlgy affects the memory consumption + # strongly affects the memory consumption p.init_index(max_elements=num_elements, ef_construction=100, M=16) @@ -47,7 +47,7 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data1, k=1) - items=p.get_items(labels) + items = p.get_items(labels) # Check the recall: self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3) @@ -67,8 +67,8 @@ def testRandomSelf(self): print("Deleted") print("\n**** Mark delete test ****\n") - # Reiniting, loading the 
index - print("Reiniting") + # Re-initiating, loading the index + print("Re-initiating") p = hnswlib.Index(space='l2', dim=dim) print("\nLoading index from '%s'\n" % index_path) @@ -80,17 +80,17 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data, k=1) - items=p.get_items(labels) + items = p.get_items(labels) # Check the recall: self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3) # Check that the returned element data is correct: - diff_with_gt_labels=np.mean(np.abs(data-items)) + diff_with_gt_labels = np.mean(np.abs(data-items)) self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) # deleting index. # Checking that all labels are returned correctly: - sorted_labels=sorted(p.get_ids_list()) + sorted_labels = sorted(p.get_ids_list()) self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0) # Delete data1 diff --git a/python_bindings/tests/bindings_test_pickle.py b/python_bindings/tests/bindings_test_pickle.py index 3a42df2e..07820b1d 100644 --- a/python_bindings/tests/bindings_test_pickle.py +++ b/python_bindings/tests/bindings_test_pickle.py @@ -60,38 +60,38 @@ def test_space_main(self, space, dim): p.num_threads = self.num_threads # by default using all available cores - p0 = pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index + p0 = pickle.loads(pickle.dumps(p)) # pickle un-initialized Index p.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M) p0.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M) p.ef = self.ef p0.ef = self.ef - p1 = pickle.loads(pickle.dumps(p)) ### pickle Index before adding items + p1 = pickle.loads(pickle.dumps(p)) # pickle Index before adding items - ### add items to ann index p,p0,p1 + # add items to ann index p,p0,p1 p.add_items(data) p1.add_items(data) p0.add_items(data) - p2=pickle.loads(pickle.dumps(p)) ### 
pickle Index before adding items + p2 = pickle.loads(pickle.dumps(p)) # pickle Index after adding items self.assertTrue(np.allclose(p.get_items(), p0.get_items()), "items for p and p0 must be same") self.assertTrue(np.allclose(p0.get_items(), p1.get_items()), "items for p0 and p1 must be same") self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same") - ### Test if returned distances are same + # Test if returned distances are same l, d = p.knn_query(test_data, k=self.k) l0, d0 = p0.knn_query(test_data, k=self.k) l1, d1 = p1.knn_query(test_data, k=self.k) l2, d2 = p2.knn_query(test_data, k=self.k) - self.assertLessEqual(np.sum(((d-d0)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p and p0 must match") - self.assertLessEqual(np.sum(((d0-d1)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match") - self.assertLessEqual(np.sum(((d1-d2)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match") + self.assertLessEqual(np.sum(((d-d0)**2.) > 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p and p0 must match") + self.assertLessEqual(np.sum(((d0-d1)**2.) > 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match") + self.assertLessEqual(np.sum(((d1-d2)**2.) 
> 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match") - ### check if ann results match brute-force search - ### allow for 2 labels to be missing from ann results + # check if ann results match brute-force search + # allow for 2 labels to be missing from ann results check_ann_results(self, space, data, test_data, self.k, l, d, err_thresh=self.label_err_thresh, total_thresh=self.item_err_thresh, @@ -102,19 +102,19 @@ def test_space_main(self, space, dim): total_thresh=self.item_err_thresh, dists_thresh=self.dists_err_thresh) - ### Check ef parameter value + # Check ef parameter value self.assertEqual(p.ef, self.ef, "incorrect value of p.ef") self.assertEqual(p0.ef, self.ef, "incorrect value of p0.ef") self.assertEqual(p2.ef, self.ef, "incorrect value of p2.ef") self.assertEqual(p1.ef, self.ef, "incorrect value of p1.ef") - ### Check M parameter value + # Check M parameter value self.assertEqual(p.M, self.M, "incorrect value of p.M") self.assertEqual(p0.M, self.M, "incorrect value of p0.M") self.assertEqual(p1.M, self.M, "incorrect value of p1.M") self.assertEqual(p2.M, self.M, "incorrect value of p2.M") - ### Check ef_construction parameter value + # Check ef_construction parameter value self.assertEqual(p.ef_construction, self.ef_construction, "incorrect value of p.ef_construction") self.assertEqual(p0.ef_construction, self.ef_construction, "incorrect value of p0.ef_construction") self.assertEqual(p1.ef_construction, self.ef_construction, "incorrect value of p1.ef_construction") @@ -135,12 +135,12 @@ def setUp(self): self.num_threads = 4 self.k = 25 - self.label_err_thresh = 5 ### max number of missing labels allowed per test item - self.item_err_thresh = 5 ### max number of items allowed with incorrect labels + self.label_err_thresh = 5 # max number of missing labels allowed per test item + self.item_err_thresh = 5 # max number of items allowed with incorrect labels - self.dists_err_thresh = 50 ### for two matrices, d1 and d2, 
dists_err_thresh controls max - ### number of value pairs that are allowed to be different in d1 and d2 - ### i.e., number of values that are (d1-d2)**2>1e-3 + self.dists_err_thresh = 50 # for two matrices, d1 and d2, dists_err_thresh controls max + # number of value pairs that are allowed to be different in d1 and d2 + # i.e., number of values that are (d1-d2)**2>1e-3 def test_inner_product_space(self): test_space_main(self, 'ip', 48) diff --git a/python_bindings/tests/bindings_test_resize.py b/python_bindings/tests/bindings_test_resize.py index 3c4e3e4f..b5bceeb1 100644 --- a/python_bindings/tests/bindings_test_resize.py +++ b/python_bindings/tests/bindings_test_resize.py @@ -7,71 +7,71 @@ class RandomSelfTestCase(unittest.TestCase): def testRandomSelf(self): - for idx in range(16): - print("\n**** Index resize test ****\n") + for idx in range(16): + print("\n**** Index resize test ****\n") - np.random.seed(idx) - dim = 16 - num_elements = 10000 + np.random.seed(idx) + dim = 16 + num_elements = 10000 - # Generating sample data - data = np.float32(np.random.random((num_elements, dim))) + # Generating sample data + data = np.float32(np.random.random((num_elements, dim))) - # Declaring index - p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip + # Declaring index + p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip - # Initing index - # max_elements - the maximum number of elements, should be known beforehand - # (probably will be made optional in the future) - # - # ef_construction - controls index search speed/build speed tradeoff - # M - is tightly connected with internal dimensionality of the data - # stronlgy affects the memory consumption + # Initiating index + # max_elements - the maximum number of elements, should be known beforehand + # (probably will be made optional in the future) + # + # ef_construction - controls index search speed/build speed tradeoff + # M - is tightly connected with internal 
dimensionality of the data + # strongly affects the memory consumption - p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) + p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) - # Controlling the recall by setting ef: - # higher ef leads to better accuracy, but slower search - p.set_ef(20) + # Controlling the recall by setting ef: + # higher ef leads to better accuracy, but slower search + p.set_ef(20) - p.set_num_threads(idx%8) # by default using all available cores + p.set_num_threads(idx % 8) # by default using all available cores - # We split the data in two batches: - data1 = data[:num_elements // 2] - data2 = data[num_elements // 2:] + # We split the data in two batches: + data1 = data[:num_elements // 2] + data2 = data[num_elements // 2:] - print("Adding first batch of %d elements" % (len(data1))) - p.add_items(data1) + print("Adding first batch of %d elements" % (len(data1))) + p.add_items(data1) - # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data1, k=1) + # Query the elements for themselves and measure recall: + labels, distances = p.knn_query(data1, k=1) - items = p.get_items(list(range(len(data1)))) + items = p.get_items(list(range(len(data1)))) - # Check the recall: - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3) + # Check the recall: + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3) - # Check that the returned element data is correct: - diff_with_gt_labels = np.max(np.abs(data1-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) + # Check that the returned element data is correct: + diff_with_gt_labels = np.max(np.abs(data1-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) - print("Resizing the index") - p.resize_index(num_elements) + print("Resizing the index") + p.resize_index(num_elements) - print("Adding the second batch of %d elements" % (len(data2))) - 
p.add_items(data2) + print("Adding the second batch of %d elements" % (len(data2))) + p.add_items(data2) - # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data, k=1) - items=p.get_items(list(range(num_elements))) + # Query the elements for themselves and measure recall: + labels, distances = p.knn_query(data, k=1) + items = p.get_items(list(range(num_elements))) - # Check the recall: - self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3) + # Check the recall: + self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3) - # Check that the returned element data is correct: - diff_with_gt_labels=np.max(np.abs(data-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) + # Check that the returned element data is correct: + diff_with_gt_labels = np.max(np.abs(data-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) - # Checking that all labels are returned correcly: - sorted_labels=sorted(p.get_ids_list()) - self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0) + # Checking that all labels are returned correctly: + sorted_labels = sorted(p.get_ids_list()) + self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0)