# diff --git a/nebula3/utils/__init__.py b/nebula3/utils/__init__.py
# new file mode 100644
# The package initializer consists of a single re-export:
#     from .hash import hash

# diff --git a/nebula3/utils/hash.py b/nebula3/utils/hash.py
# new file mode 100644
# nebula3/utils/hash.py
from __future__ import annotations

# Multiplier applied at every mixing step of the hash.
_M: int = 0xC6A4A7935BD1E995
# Right-shift distance used when folding the high bits back into the low bits.
_R: int = 47
# Python ints are unbounded; this mask truncates products to 64 bits.
_MASK64: int = (1 << 64) - 1


def _read_u64_le(buf: bytes) -> int:
    """Decode up to 8 little-endian bytes as an unsigned integer."""
    return int.from_bytes(buf, byteorder="little", signed=False)


def hash(data: bytes | bytearray | str, seed: int = 0xC70F6907) -> int:
    """Return the MurmurHash2 64-bit variant hash of *data*.

    :param data: str (hashed as its UTF-8 encoding), bytes, or bytearray
    :param seed: initial hash state; defaults to 0xC70F6907
    :return: the hash as a Python int in the signed 64-bit range
    :raises TypeError: if *data* is not str, bytes, or bytearray
    """
    if isinstance(data, str):
        payload = data.encode("utf-8")
    elif isinstance(data, (bytes, bytearray)):
        payload = bytes(data)
    else:
        raise TypeError("Input must be str, bytes, or bytearray")

    length = len(payload)
    h = (seed ^ ((_M * length) & _MASK64)) & _MASK64

    # Consume the input in whole 8-byte little-endian words first.
    aligned = length // 8 * 8
    for start in range(0, aligned, 8):
        k = _read_u64_le(payload[start : start + 8])
        k = (k * _M) & _MASK64
        k ^= k >> _R
        k = (k * _M) & _MASK64
        h ^= k
        h = (h * _M) & _MASK64

    # The remaining 1-7 bytes (if any) are folded in as one short word.
    tail = payload[aligned:]
    if tail:
        h ^= _read_u64_le(tail)
        h = (h * _M) & _MASK64

    # Final avalanche.
    h ^= h >> _R
    h = (h * _M) & _MASK64
    h ^= h >> _R

    # Reinterpret the unsigned 64-bit value as a signed 64-bit integer.
    if h & (1 << 63):
        h -= 1 << 64
    return h


# diff --git a/tests/test_hash.py b/tests/test_hash.py
# new file mode 100644
#!/usr/bin/env python
# --coding:utf-8--

# Copyright (c) 2020 vesoft inc. All rights reserved.
#
# This source code is licensed under Apache 2.0 License.

import pytest
from nebula3.utils.hash import hash as murmur_hash

# (input, expected signed 64-bit hash) pairs using the default seed.
TEST_VECTORS = [
    (b"", 6142509188972423790),
    (b"a", 4993892634952068459),
    (b"abcdefgh", 8664279048047335611),  # exactly one 8-byte word, no tail
    (bytearray(b"abcdefgh"), 8664279048047335611),  # bytearray == bytes
    (b"abcdefghi", -5409788147785758033),  # one word plus a 1-byte tail
    ("to_be_hashed", -1098333533029391540),
    ("中文", -8591787916246384322),
]


@pytest.mark.parametrize("data, expected", TEST_VECTORS)
def test_known_vectors(data, expected):
    """Each known input must hash to its recorded value."""
    assert murmur_hash(data) == expected


def test_str_bytes_equiv():
    """
    Ensure str and bytes inputs produce the same hash.
    """
    s = "pytest"
    assert murmur_hash(s) == murmur_hash(s.encode("utf-8"))


@pytest.mark.parametrize("bad_input", [12345, None, 1.5, ["a"]])
def test_type_error(bad_input):
    """Inputs other than str, bytes, or bytearray must raise TypeError."""
    with pytest.raises(TypeError):
        murmur_hash(bad_input)


def test_seed_variation():
    """Different seed values should produce different hashes."""
    data = b"seed_test"
    hash1 = murmur_hash(data, seed=0)
    hash2 = murmur_hash(data, seed=1)
    assert hash1 != hash2


def test_idempotent():
    """Repeated calls with same input must yield the same result."""
    data = b"consistent"
    assert murmur_hash(data) == murmur_hash(data)


def test_large_input_performance():
    """Large inputs should be processed without error and stay in range."""
    data = b"x" * 10_000
    result = murmur_hash(data)
    assert isinstance(result, int)
    # The hash is documented to fit in a signed 64-bit integer.
    assert -(1 << 63) <= result < (1 << 63)


# diff --git a/tests/test_hash_integration.py b/tests/test_hash_integration.py
# new file mode 100644
#!/usr/bin/env python
# --coding:utf-8--

# Copyright (c) 2020 vesoft inc. All rights reserved.
#
# This source code is licensed under Apache 2.0 License.

import pytest
from nebula3.Config import Config
from nebula3.gclient.net import ConnectionPool
from nebula3.utils.hash import hash as murmur_hash


@pytest.fixture(scope="module")
def nebula_session():
    """Yield a session on the graph service at 127.0.0.1:9669.

    The session is released and the pool closed on teardown, including
    when setup fails part-way through.
    """
    config = Config()
    config.max_connection_pool_size = 10
    pool = ConnectionPool()
    try:
        # NOTE(review): assumes ConnectionPool.init returns a truthy value
        # on success, as in the client's documented examples - confirm.
        assert pool.init([("127.0.0.1", 9669)], config), "pool init failed"
        session = pool.get_session("root", "nebula")
        try:
            yield session
        finally:
            session.release()
    finally:
        pool.close()


@pytest.mark.parametrize(
    "data", ["", "a", "abcdefgh", "abcdefghi", "to_be_hashed", "中文"]
)
def test_hash_against_server(nebula_session, data):
    """The local hash must equal the value the server computes."""
    # Local Computing
    expected = murmur_hash(data)
    # The parametrized inputs contain no quotes or backslashes, so they are
    # interpolated into the query without escaping.
    result = nebula_session.execute(f'YIELD hash("{data}")')
    assert result.is_succeeded(), result.error_msg()
    actual = result.row_values(0)[0].as_int()
    assert actual == expected