@@ -365,7 +365,7 @@ cdef class {{name}}HashTable(HashTable):
365365 def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
366366 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
367367 object na_value=None, bint ignore_na=False,
368- bint return_inverse=False):
368+ object mask=None, bint return_inverse=False):
369369 """
370370 Calculate unique values and labels (no sorting!)
371371
@@ -388,6 +388,10 @@ cdef class {{name}}HashTable(HashTable):
388388 Whether NA-values should be ignored for calculating the uniques. If
389389 True, the labels corresponding to missing values will be set to
390390 na_sentinel.
391+ mask : ndarray[bool], optional
392+ If not None, the mask is used as indicator for missing values
393+ (True = missing, False = valid) instead of `na_value` or
394+ condition "val != val".
391395 return_inverse : boolean, default False
392396 Whether the mapping of the original array values to their location
393397 in the vector of uniques should be returned.
@@ -406,12 +410,17 @@ cdef class {{name}}HashTable(HashTable):
406410 {{dtype}}_t val, na_value2
407411 khiter_t k
408412 {{name}}VectorData *ud
409- bint use_na_value
413+ bint use_na_value, use_mask
414+ uint8_t[:] mask_values
410415
411416 if return_inverse:
412417 labels = np.empty(n, dtype=np.int64)
413418 ud = uniques.data
414419 use_na_value = na_value is not None
420+ use_mask = mask is not None
421+
422+ if use_mask:
423+ mask_values = mask.view("uint8")
415424
416425 if use_na_value:
417426 # We need this na_value2 because we want to allow users
@@ -427,7 +436,11 @@ cdef class {{name}}HashTable(HashTable):
427436 for i in range(n):
428437 val = values[i]
429438
430- if ignore_na and (
439+ if ignore_na and use_mask:
440+ if mask_values[i]:
441+ labels[i] = na_sentinel
442+ continue
443+ elif ignore_na and (
431444 {{if not name.lower().startswith(("uint", "int"))}}
432445 val != val or
433446 {{endif}}
@@ -491,7 +504,7 @@ cdef class {{name}}HashTable(HashTable):
491504 return_inverse=return_inverse)
492505
493506 def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
494- object na_value=None):
507+ object na_value=None, object mask=None ):
495508 """
496509 Calculate unique values and labels (no sorting!)
497510
@@ -509,6 +522,10 @@ cdef class {{name}}HashTable(HashTable):
509522 any value "val" satisfying val != val is considered missing.
510523 If na_value is not None, then _additionally_, any value "val"
511524 satisfying val == na_value is considered missing.
525+ mask : ndarray[bool], optional
526+ If not None, the mask is used as indicator for missing values
527+ (True = missing, False = valid) instead of `na_value` or
528+ condition "val != val".
512529
513530 Returns
514531 -------
@@ -519,7 +536,7 @@ cdef class {{name}}HashTable(HashTable):
519536 """
520537 uniques_vector = {{name}}Vector()
521538 return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
522- na_value=na_value, ignore_na=True,
539+ na_value=na_value, ignore_na=True, mask=mask,
523540 return_inverse=True)
524541
525542 def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -852,7 +869,7 @@ cdef class StringHashTable(HashTable):
852869 return_inverse=return_inverse)
853870
854871 def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
855- object na_value=None):
872+ object na_value=None, object mask=None ):
856873 """
857874 Calculate unique values and labels (no sorting!)
858875
@@ -870,6 +887,8 @@ cdef class StringHashTable(HashTable):
870887 that is not a string is considered missing. If na_value is
871888 not None, then _additionally_ any value "val" satisfying
872889 val == na_value is considered missing.
890+ mask : ndarray[bool], optional
891+ Not yet implemented for StringHashTable.
873892
874893 Returns
875894 -------
@@ -1091,7 +1110,7 @@ cdef class PyObjectHashTable(HashTable):
10911110 return_inverse=return_inverse)
10921111
10931112 def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1094- object na_value=None):
1113+ object na_value=None, object mask=None ):
10951114 """
10961115 Calculate unique values and labels (no sorting!)
10971116
@@ -1109,6 +1128,8 @@ cdef class PyObjectHashTable(HashTable):
11091128 any value "val" satisfying val != val is considered missing.
11101129 If na_value is not None, then _additionally_, any value "val"
11111130 satisfying val == na_value is considered missing.
1131+ mask : ndarray[bool], optional
1132+ Not yet implemented for PyObjectHashTable.
11121133
11131134 Returns
11141135 -------
0 commit comments