Updated snippets docstring, refactored _mpdist_vect

seanlaw · seanlaw · commit 60bb08d5836e · 2022-04-05T10:40:50.000-04:00
diff --git a/stumpy/aampdist.py b/stumpy/aampdist.py
@@ -2,10 +2,11 @@
 # Copyright 2019 TD Ameritrade. Released under the terms of the 3-Clause BSD license.
 # STUMPY is a trademark of TD Ameritrade IP Company, Inc. All rights reserved.
 
+import numpy as np
+import math
 import functools
 
-from . import aamp, aamped, mpdist
-from .core import _mass_absolute_distance_matrix
+from . import core, aamp, aamped, mpdist
 
 
 def _aampdist_vect(
@@ -52,18 +53,29 @@ def _aampdist_vect(
     p : float, default 2.0
         The p-norm to apply for computing the Minkowski distance.
     """
-    partial_distance_matrix_func = functools.partial(
-        _mass_absolute_distance_matrix, p=p
-    )
-    return mpdist._mpdist_vect(
-        Q,
-        T,
-        m,
-        percentage=percentage,
-        k=k,
-        custom_func=custom_func,
-        distance_matrix_func=partial_distance_matrix_func,
-    )
+    j = Q.shape[0] - m + 1  # `k` is reserved for `P_ABBA` selection
+    l = T.shape[0] - m + 1
+    MPdist_vect = np.empty(T.shape[0] - Q.shape[0] + 1, dtype=np.float64)
+    distance_matrix = np.full((j, l), np.inf, dtype=np.float64)
+    P_ABBA = np.empty(2 * j, dtype=np.float64)
+
+    if k is None:
+        percentage = np.clip(percentage, 0.0, 1.0)
+        k = min(math.ceil(percentage * (2 * Q.shape[0])), 2 * j - 1)
+
+    k = min(int(k), P_ABBA.shape[0] - 1)
+
+    core._mass_absolute_distance_matrix(Q, T, m, distance_matrix, p=p)
+
+    rolling_row_min = core.rolling_nanmin(distance_matrix, j)
+    col_min = np.nanmin(distance_matrix, axis=0)
+
+    for i in range(MPdist_vect.shape[0]):
+        P_ABBA[:j] = rolling_row_min[:, i]
+        P_ABBA[j:] = col_min[i : i + j]
+        MPdist_vect[i] = core._select_P_ABBA_value(P_ABBA, k, custom_func)
+
+    return MPdist_vect
 
 
 def aampdist(T_A, T_B, m, percentage=0.05, k=None, p=2.0):
diff --git a/stumpy/core.py b/stumpy/core.py
@@ -1437,7 +1437,7 @@ def mass(Q, T, M_T=None, Σ_T=None, normalize=True, p=2.0):
     return distance_profile
 
 
-def _mass_distance_matrix(Q, T, m, distance_matrix):
+def _mass_distance_matrix(Q, T, m, distance_matrix, μ_Q, σ_Q, M_T, Σ_T):
     """
     Compute the full distance matrix between all of the subsequences of `Q` and `T`
     using the MASS algorithm
@@ -1456,15 +1456,67 @@ def _mass_distance_matrix(Q, T, m, distance_matrix):
     distance_matrix : numpy.ndarray
         The full output distance matrix. This is mandatory since it may be reused.
 
+    μ_Q : numpy.ndarray
+        Sliding mean of `Q`
+
+    σ_Q : numpy.ndarray
+        Sliding standard deviation of `Q`
+
+    M_T : numpy.ndarray
+        Sliding mean of `T`
+
+    Σ_T : numpy.ndarray
+        Sliding standard deviation of `T`
+
     Returns
     -------
         None
     """
-    k, l = distance_matrix.shape
-    T, M_T, Σ_T = preprocess(T, m)
+    for i in range(distance_matrix.shape[0]):
+        if np.any(~np.isfinite(Q[i : i + m])):  # pragma: no cover
+            distance_matrix[i, :] = np.inf
+        else:
+            QT = _sliding_dot_product(Q[i : i + m], T)
+            distance_matrix[i, :] = _mass(Q[i : i + m], T, QT, μ_Q[i], σ_Q[i], M_T, Σ_T)
 
-    for i in range(k):
-        distance_matrix[i, :] = mass(Q[i : i + m], T, M_T, Σ_T)
+
+def mass_distance_matrix(Q, T, m, distance_matrix, M_T=None, Σ_T=None):
+    """
+    Compute the full distance matrix between all of the subsequences of `Q` and `T`
+    using the MASS algorithm
+
+    Parameters
+    ----------
+    Q : numpy.ndarray
+        Query array
+
+    T : numpy.ndarray
+        Time series or sequence
+
+    m : int
+        Window size
+
+    distance_matrix : numpy.ndarray
+        The full output distance matrix. This is mandatory since it may be reused.
+
+    M_T : numpy.ndarray, default None
+        Sliding mean of `T`
+
+    Σ_T : numpy.ndarray, default None
+        Sliding standard deviation of `T`
+
+    Returns
+    -------
+        None
+    """
+    Q, μ_Q, σ_Q = preprocess(Q, m)
+
+    if M_T is None or Σ_T is None:
+        T, M_T, Σ_T = preprocess(T, m)
+
+    check_window_size(m, max_size=min(Q.shape[-1], T.shape[-1]))
+
+    return _mass_distance_matrix(Q, T, m, distance_matrix, μ_Q, σ_Q, M_T, Σ_T)
 
 
 def _get_QT(start, T_A, T_B, m):
@@ -2394,3 +2446,51 @@ def _binarize_pan(pan, threshold, bfs_indices, n_processed):
     """
     idx = bfs_indices[:n_processed]
     pan[idx] = np.where(pan[idx] <= threshold, 0.0, 1.0)
+
+
+def _select_P_ABBA_value(P_ABBA, k, custom_func=None):
+    """
+    A convenience function for returning the `k`th smallest value from the `P_ABBA`
+    array or use a custom function to specify what `P_ABBA` value to return.
+
+    The MPdist distance measure considers two time series to be similar if they share
+    many subsequences, regardless of the order of matching subsequences. MPdist
+    concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
+    value as the reported distance. Note that MPdist is a measure and not a metric.
+    Therefore, it does not obey the triangular inequality but the method is highly
+    scalable.
+
+    Parameters
+    ----------
+    P_ABBA : numpy.ndarray
+        An unsorted array resulting from the concatenation of the outputs from an
+        AB-join and BA-join for two time series, `T_A` and `T_B`
+
+    k : int
+        Specify the `k`th value in the concatenated matrix profiles to return. This
+        parameter is ignored when `custom_func` is not None.
+
+    custom_func : object, default None
+        A custom user defined function for selecting the desired value from the
+        unsorted `P_ABBA` array. This function may need to leverage `functools.partial`
+        and should take `P_ABBA` as its only input parameter and return a single
+        `MPdist` value. The `percentage` and `k` parameters are ignored when
+        `custom_func` is not None.
+
+    Returns
+    -------
+    MPdist : float
+        The matrix profile distance
+    """
+    k = min(int(k), P_ABBA.shape[0] - 1)
+    if custom_func is not None:
+        MPdist = custom_func(P_ABBA)
+    else:
+        partition = np.partition(P_ABBA, k)
+        MPdist = partition[k]
+        if ~np.isfinite(MPdist):
+            partition[:k].sort()
+            k = max(0, np.count_nonzero(np.isfinite(partition[:k])) - 1)
+            MPdist = partition[k]
+
+    return MPdist
diff --git a/stumpy/mpdist.py b/stumpy/mpdist.py
@@ -5,8 +5,7 @@
 import numpy as np
 import math
 
-from . import stump, stumped, core
-from .core import _mass_distance_matrix
+from . import core, stump, stumped
 from .aampdist import aampdist, aampdisted
 
 
@@ -74,54 +73,6 @@ def _compute_P_ABBA(
     P_ABBA[n_A - m + 1 :] = partial_mp_func(T_B, m, T_A, ignore_trivial=False)[:, 0]
 
 
-def _select_P_ABBA_value(P_ABBA, k, custom_func=None):
-    """
-    A convenience function for returning the `k`th smallest value from the `P_ABBA`
-    array or use a custom function to specify what `P_ABBA` value to return.
-
-    The MPdist distance measure considers two time series to be similar if they share
-    many subsequences, regardless of the order of matching subsequences. MPdist
-    concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
-    value as the reported distance. Note that MPdist is a measure and not a metric.
-    Therefore, it does not obey the triangular inequality but the method is highly
-    scalable.
-
-    Parameters
-    ----------
-    P_ABBA : numpy.ndarray
-        An unsorted array resulting from the concatenation of the outputs from an
-        AB-joinand BA-join for two time series, `T_A` and `T_B`
-
-    k : int
-        Specify the `k`th value in the concatenated matrix profiles to return. This
-        parameter is ignored when `k_func` is not None.
-
-    custom_func : object, default None
-        A custom user defined function for selecting the desired value from the
-        unsorted `P_ABBA` array. This function may need to leverage `functools.partial`
-        and should take `P_ABBA` as its only input parameter and return a single
-        `MPdist` value. The `percentage` and `k` parameters are ignored when
-        `custom_func` is not None.
-
-    Returns
-    -------
-    MPdist : float
-        The matrix profile distance
-    """
-    k = min(int(k), P_ABBA.shape[0] - 1)
-    if custom_func is not None:
-        MPdist = custom_func(P_ABBA)
-    else:
-        partition = np.partition(P_ABBA, k)
-        MPdist = partition[k]
-        if ~np.isfinite(MPdist):
-            partition[:k].sort()
-            k = max(0, np.count_nonzero(np.isfinite(partition[:k])) - 1)
-            MPdist = partition[k]
-
-    return MPdist
-
-
 def _mpdist(
     T_A,
     T_B,
@@ -211,7 +162,7 @@ def _mpdist(
         percentage = np.clip(percentage, 0.0, 1.0)
         k = min(math.ceil(percentage * (n_A + n_B)), n_A - m + 1 + n_B - m + 1 - 1)
 
-    MPdist = _select_P_ABBA_value(P_ABBA, k, custom_func)
+    MPdist = core._select_P_ABBA_value(P_ABBA, k, custom_func)
 
     return MPdist
 
@@ -220,10 +171,13 @@ def _mpdist_vect(
     Q,
     T,
     m,
+    μ_Q,
+    σ_Q,
+    M_T,
+    Σ_T,
     percentage=0.05,
     k=None,
     custom_func=None,
-    distance_matrix_func=_mass_distance_matrix,
 ):
     """
     Compute the matrix profile distance measure vector between `Q` and each subsequence,
@@ -240,6 +194,18 @@ def _mpdist_vect(
     m : int
         Window size
 
+    μ_Q : numpy.ndarray
+        Sliding mean of `Q`
+
+    σ_Q : numpy.ndarray
+        Sliding standard deviation of `Q`
+
+    M_T : numpy.ndarray
+        Sliding mean of `T`
+
+    Σ_T : numpy.ndarray
+        Sliding standard deviation of `T`
+
     percentage : float, 0.05
         The percentage of distances that will be used to report `mpdist`. The value
         is between 0.0 and 1.0. This parameter is ignored when `k` is not `None` or when
@@ -256,9 +222,6 @@ def _mpdist_vect(
         and should take `P_ABBA` as its only input parameter and return a single
         `MPdist` value. The `percentage` and `k` parameters are ignored when
         `custom_func` is not None.
-
-    distance_matrix_func : object, default _mass_distance_matrix
-        The function to use to compute the distance matrix between `Q` and `T`
     """
     j = Q.shape[0] - m + 1  # `k` is reserved for `P_ABBA` selection
     l = T.shape[0] - m + 1
@@ -272,15 +235,15 @@ def _mpdist_vect(
 
     k = min(int(k), P_ABBA.shape[0] - 1)
 
-    distance_matrix_func(Q, T, m, distance_matrix)
+    core._mass_distance_matrix(Q, T, m, distance_matrix, μ_Q, σ_Q, M_T, Σ_T)
 
     rolling_row_min = core.rolling_nanmin(distance_matrix, j)
     col_min = np.nanmin(distance_matrix, axis=0)
 
     for i in range(MPdist_vect.shape[0]):
         P_ABBA[:j] = rolling_row_min[:, i]
         P_ABBA[j:] = col_min[i : i + j]
-        MPdist_vect[i] = _select_P_ABBA_value(P_ABBA, k, custom_func)
+        MPdist_vect[i] = core._select_P_ABBA_value(P_ABBA, k, custom_func)
 
     return MPdist_vect
 
diff --git a/stumpy/snippets.py b/stumpy/snippets.py
@@ -96,6 +96,8 @@ def _get_all_profiles(
         percentage = np.clip(percentage, 0.0, 1.0)
         s = min(math.ceil(percentage * m), m)
 
+    M_T, Σ_T = core.compute_mean_std(T, s)
+
     # Iterate over non-overlapping subsequences, see Definition 3
     for i in range((n_padded // m) - 1):
         start = i * m
@@ -105,6 +107,10 @@ def _get_all_profiles(
             S_i,
             T,
             s,
+            M_T[start : start + s],
+            Σ_T[start : start + s],
+            M_T,
+            Σ_T,
             percentage=mpdist_percentage,
             k=mpdist_k,
             custom_func=mpdist_custom_func,
@@ -144,10 +150,12 @@ def snippets(
 
     percentage : float, default 1.0
         With the length of each non-overlapping subsequence, `S[i]`, set to `m`, this
-        is the percentage of `S[i]` (i.e., `percentage * m`) to set the `s` to. When
-        `percentage == 1.0`, then the full length of `S[i]` is used to compute the
-        `mpdist_vect`. When `percentage < 1.0`, then shorter subsequences from `S[i]`
-        is used to compute `mpdist_vect`.
+        is the percentage of `S[i]` (i.e., `percentage * m`) to set `s` (the
+        sub-subsequence length) to. When `percentage == 1.0`, then the full length of
+        `S[i]` is used to compute the `mpdist_vect`. When `percentage < 1.0`, then
+        a shorter sub-subsequence length of `s = min(math.ceil(percentage * m), m)`
+        from each `S[i]` is used to compute `mpdist_vect`. When `s` is not `None`, then
+        the `percentage` parameter is ignored.
 
     s : int, default None
         With the length of each non-overlapping subsequence, `S[i]`, set to `m`, this
@@ -221,6 +229,8 @@ def snippets(
            [1, 1, 2],
            [1, 3, 4]]))
     """
+    T = core._preprocess(T)
+
     if m > T.shape[0] // 2:  # pragma: no cover
         raise ValueError(
             f"The snippet window size of {m} is too large for a time series with "
diff --git a/tests/naive.py b/tests/naive.py
@@ -17,6 +17,24 @@ def distance(a, b, axis=0, p=2.0):
     return np.linalg.norm(a - b, axis=axis, ord=p)
 
 
+def compute_mean_std(T, m):
+    n = T.shape[0]
+
+    M_T = np.zeros(n - m + 1, dtype=float)
+    Σ_T = np.zeros(n - m + 1, dtype=float)
+
+    for i in range(n - m + 1):
+        Q = T[i : i + m].copy()
+        Q[np.isinf(Q)] = np.nan
+
+        M_T[i] = np.mean(Q)
+        Σ_T[i] = np.nanstd(Q)
+
+    M_T[np.isnan(M_T)] = np.inf
+    Σ_T[np.isnan(Σ_T)] = 0
+    return M_T, Σ_T
+
+
 def apply_exclusion_zone(a, trivial_idx, excl_zone, val):
     start = max(0, trivial_idx - excl_zone)
     stop = min(a.shape[-1], trivial_idx + excl_zone + 1)
diff --git a/tests/test_core.py b/tests/test_core.py
diff --git a/tests/test_mpdist.py b/tests/test_mpdist.py