Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions nebula3/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .hash import hash
49 changes: 49 additions & 0 deletions nebula3/utils/hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# nebula3/utils/hash.py
from __future__ import annotations

# All hash state is kept within 64 bits by AND-ing with this mask.
_MASK64: int = 2**64 - 1
# Multiplier applied in every mixing step of the MurmurHash2 64-bit variant.
_M: int = 0xC6A4_A793_5BD1_E995
# Right-shift distance used when folding the high bits back into the state.
_R: int = 47


def _read_u64_le(buf: bytes) -> int:
    """Interpret *buf* as an unsigned little-endian integer.

    Callers in this module pass at most 8 bytes; an empty buffer yields 0.
    """
    # ``signed`` defaults to False, so the result is always non-negative.
    value = int.from_bytes(buf, "little")
    return value


def hash(
    data: bytes | bytearray | memoryview | str, seed: int = 0xC70F6907
) -> int:
    """Compute the MurmurHash2 64-bit variant of *data*.

    :param data: input to hash; ``str`` is encoded as UTF-8, while ``bytes``,
        ``bytearray`` and ``memoryview`` are hashed as their raw bytes
    :param seed: initial hash state, defaults to 0xC70F6907
    :return: the hash as a Python int in the signed 64-bit range
    :raises TypeError: if *data* is not one of the supported types
    """
    if isinstance(data, str):
        data_as_bytes = data.encode("utf-8")
    elif isinstance(data, (bytes, bytearray, memoryview)):
        # Take an immutable snapshot so slicing below always yields bytes.
        data_as_bytes = bytes(data)
    else:
        raise TypeError("Input must be str, bytes, bytearray, or memoryview")

    length = len(data_as_bytes)

    # The initial state mixes the seed with the input length. Every product
    # is masked so the state never grows beyond 64 bits.
    h = (seed ^ ((_M * length) & _MASK64)) & _MASK64

    # Body: consume the input in complete 8-byte little-endian words.
    off = length - (length % 8)
    for i in range(0, off, 8):
        k = _read_u64_le(data_as_bytes[i : i + 8])
        k = (k * _M) & _MASK64
        k ^= k >> _R
        k = (k * _M) & _MASK64
        h ^= k
        h = (h * _M) & _MASK64

    # Tail: the remaining 1-7 bytes are folded in as a single little-endian
    # integer; inputs whose length is a multiple of 8 skip this step.
    tail = data_as_bytes[off:]
    if tail:
        h ^= _read_u64_le(tail)
        h = (h * _M) & _MASK64

    # Final mixing rounds.
    h ^= h >> _R
    h = (h * _M) & _MASK64
    h ^= h >> _R

    # Reinterpret the unsigned 64-bit state as a signed (two's complement)
    # value so the result stays in the signed 64-bit range.
    if h & (1 << 63):
        h -= 1 << 64
    return h
60 changes: 60 additions & 0 deletions tests/test_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python
# --coding:utf-8--

# Copyright (c) 2020 vesoft inc. All rights reserved.
#
# This source code is licensed under Apache 2.0 License.

import pytest
from nebula3.utils.hash import hash as murmur_hash

# (input, expected hash) pairs checked with the default seed.
# The same inputs are compared against a live server in
# tests/test_hash_integration.py.
TEST_VECTORS = [
    # Empty input: no 8-byte block and no tail.
    (b"", 6142509188972423790),
    # A single byte: tail only.
    (b"a", 4993892634952068459),
    (b"abcdefgh", 8664279048047335611),  # exactly one 8-byte block, no tail
    # One 8-byte block followed by a 1-byte tail; result is negative.
    (b"abcdefghi", -5409788147785758033),
    # str input (encoded as UTF-8 by the hash function): 12 ASCII bytes.
    ("to_be_hashed", -1098333533029391540),
    # Non-ASCII str input: two CJK characters, 6 bytes once UTF-8 encoded.
    ("中文", -8591787916246384322),
]


@pytest.mark.parametrize("data, expected", TEST_VECTORS)
def test_known_vectors(data, expected):
    """Each reference input must hash to its recorded expected value."""
    actual = murmur_hash(data)
    assert actual == expected


def test_str_bytes_equiv():
    """A str must hash to the same value as its UTF-8 encoded bytes."""
    text = "pytest"
    encoded = text.encode("utf-8")
    from_str = murmur_hash(text)
    from_bytes = murmur_hash(encoded)
    assert from_str == from_bytes


def test_type_error():
    """An int input (neither str, bytes nor bytearray) must raise TypeError."""
    unsupported = 12345
    with pytest.raises(TypeError):
        murmur_hash(unsupported)


def test_seed_variation():
    """Hashing one payload under seeds 0 and 1 must give distinct values."""
    payload = b"seed_test"
    with_seed_zero = murmur_hash(payload, seed=0)
    with_seed_one = murmur_hash(payload, seed=1)
    assert with_seed_zero != with_seed_one


def test_idempotent():
    """Two calls on the same input must return the same hash."""
    payload = b"consistent"
    first = murmur_hash(payload)
    second = murmur_hash(payload)
    assert first == second


def test_large_input_performance():
    """A 10,000-byte input must hash without error and yield an int."""
    payload = b"x" * 10_000
    assert isinstance(murmur_hash(payload), int)
34 changes: 34 additions & 0 deletions tests/test_hash_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env python
# --coding:utf-8--

# Copyright (c) 2020 vesoft inc. All rights reserved.
#
# This source code is licensed under Apache 2.0 License.

import pytest
from nebula3.Config import Config
from nebula3.gclient.net import ConnectionPool
from nebula3.utils.hash import hash as murmur_hash


@pytest.fixture(scope="module")
def nebula_session():
    """Yield a session on the server at 127.0.0.1:9669, shared per module.

    The session is opened as user ``root`` with password ``nebula``. On
    teardown the session is released before the pool is closed, and the pool
    is closed even if opening the session fails.
    """
    config = Config()
    config.max_connection_pool_size = 10
    pool = ConnectionPool()
    try:
        pool.init([("127.0.0.1", 9669)], config)
        session = pool.get_session("root", "nebula")
        try:
            yield session
        finally:
            # Hand the connection back to the pool before tearing it down.
            session.release()
    finally:
        pool.close()


@pytest.mark.parametrize(
    "data", ["", "a", "abcdefgh", "abcdefghi", "to_be_hashed", "中文"]
)
def test_hash_against_server(nebula_session, data):
    """The client-side hash must equal what the server's hash() returns."""
    # Compute the value locally first, then ask the server for the same input.
    local_value = murmur_hash(data)
    query = f'YIELD hash("{data}")'
    response = nebula_session.execute(query)
    assert response.is_succeeded(), response.error_msg()
    first_row = response.row_values(0)
    server_value = first_row[0].as_int()
    assert server_value == local_value
Loading