Skip to content

Commit 60bb08d

Browse files
committed
Updated snippets docstring, refactored _mpdist_vect
1 parent 7439720 commit 60bb08d

File tree

7 files changed

+216
-124
lines changed

7 files changed

+216
-124
lines changed

stumpy/aampdist.py

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
# Copyright 2019 TD Ameritrade. Released under the terms of the 3-Clause BSD license.
33
# STUMPY is a trademark of TD Ameritrade IP Company, Inc. All rights reserved.
44

5+
import numpy as np
6+
import math
57
import functools
68

7-
from . import aamp, aamped, mpdist
8-
from .core import _mass_absolute_distance_matrix
9+
from . import core, aamp, aamped, mpdist
910

1011

1112
def _aampdist_vect(
@@ -52,18 +53,29 @@ def _aampdist_vect(
5253
p : float, default 2.0
5354
The p-norm to apply for computing the Minkowski distance.
5455
"""
55-
partial_distance_matrix_func = functools.partial(
56-
_mass_absolute_distance_matrix, p=p
57-
)
58-
return mpdist._mpdist_vect(
59-
Q,
60-
T,
61-
m,
62-
percentage=percentage,
63-
k=k,
64-
custom_func=custom_func,
65-
distance_matrix_func=partial_distance_matrix_func,
66-
)
56+
j = Q.shape[0] - m + 1 # `k` is reserved for `P_ABBA` selection
57+
l = T.shape[0] - m + 1
58+
MPdist_vect = np.empty(T.shape[0] - Q.shape[0] + 1, dtype=np.float64)
59+
distance_matrix = np.full((j, l), np.inf, dtype=np.float64)
60+
P_ABBA = np.empty(2 * j, dtype=np.float64)
61+
62+
if k is None:
63+
percentage = np.clip(percentage, 0.0, 1.0)
64+
k = min(math.ceil(percentage * (2 * Q.shape[0])), 2 * j - 1)
65+
66+
k = min(int(k), P_ABBA.shape[0] - 1)
67+
68+
core._mass_absolute_distance_matrix(Q, T, m, distance_matrix, p=p)
69+
70+
rolling_row_min = core.rolling_nanmin(distance_matrix, j)
71+
col_min = np.nanmin(distance_matrix, axis=0)
72+
73+
for i in range(MPdist_vect.shape[0]):
74+
P_ABBA[:j] = rolling_row_min[:, i]
75+
P_ABBA[j:] = col_min[i : i + j]
76+
MPdist_vect[i] = core._select_P_ABBA_value(P_ABBA, k, custom_func)
77+
78+
return MPdist_vect
6779

6880

6981
def aampdist(T_A, T_B, m, percentage=0.05, k=None, p=2.0):

stumpy/core.py

Lines changed: 105 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1437,7 +1437,7 @@ def mass(Q, T, M_T=None, Σ_T=None, normalize=True, p=2.0):
14371437
return distance_profile
14381438

14391439

1440-
def _mass_distance_matrix(Q, T, m, distance_matrix):
1440+
def _mass_distance_matrix(Q, T, m, distance_matrix, μ_Q, σ_Q, M_T, Σ_T):
14411441
"""
14421442
Compute the full distance matrix between all of the subsequences of `Q` and `T`
14431443
using the MASS algorithm
@@ -1456,15 +1456,67 @@ def _mass_distance_matrix(Q, T, m, distance_matrix):
14561456
distance_matrix : numpy.ndarray
14571457
The full output distance matrix. This is mandatory since it may be reused.
14581458
1459+
μ_Q : numpy.ndarray
1460+
Sliding mean of `Q`
1461+
1462+
σ_Q : numpy.ndarray
1463+
Sliding standard deviation of `Q`
1464+
1465+
M_T : numpy.ndarray
1466+
Sliding mean of `T`
1467+
1468+
Σ_T : numpy.ndarray
1469+
Sliding standard deviation of `T`
1470+
14591471
Returns
14601472
-------
14611473
None
14621474
"""
1463-
k, l = distance_matrix.shape
1464-
T, M_T, Σ_T = preprocess(T, m)
1475+
for i in range(distance_matrix.shape[0]):
1476+
if np.any(~np.isfinite(Q[i : i + m])): # pragma: no cover
1477+
distance_matrix[i, :] = np.inf
1478+
else:
1479+
QT = _sliding_dot_product(Q[i : i + m], T)
1480+
distance_matrix[i, :] = _mass(Q[i : i + m], T, QT, μ_Q[i], σ_Q[i], M_T, Σ_T)
14651481

1466-
for i in range(k):
1467-
distance_matrix[i, :] = mass(Q[i : i + m], T, M_T, Σ_T)
1482+
1483+
def mass_distance_matrix(Q, T, m, distance_matrix, M_T=None, Σ_T=None):
    """
    Compute the full distance matrix between all of the subsequences of `Q` and `T`
    using the MASS algorithm

    This wrapper preprocesses `Q` (and `T`, whenever its sliding statistics are not
    both supplied), validates the window size, and then delegates the actual
    computation to `_mass_distance_matrix`, which writes into `distance_matrix`.

    Parameters
    ----------
    Q : numpy.ndarray
        Query array

    T : numpy.ndarray
        Time series or sequence

    m : int
        Window size

    distance_matrix : numpy.ndarray
        The full output distance matrix. This is mandatory since it may be reused.

    M_T : numpy.ndarray, default None
        Sliding mean of `T`. When either `M_T` or `Σ_T` is `None`, both are
        recomputed from `T`.

    Σ_T : numpy.ndarray, default None
        Sliding standard deviation of `T`. When either `M_T` or `Σ_T` is `None`,
        both are recomputed from `T`.

    Returns
    -------
    None
    """
    Q, μ_Q, σ_Q = preprocess(Q, m)

    # Only touch `T` when the caller did not hand over both sliding statistics
    stats_missing = M_T is None or Σ_T is None
    if stats_missing:
        T, M_T, Σ_T = preprocess(T, m)

    max_window = min(Q.shape[-1], T.shape[-1])
    check_window_size(m, max_size=max_window)

    return _mass_distance_matrix(
        Q,
        T,
        m,
        distance_matrix,
        μ_Q,
        σ_Q,
        M_T,
        Σ_T,
    )
14681520

14691521

14701522
def _get_QT(start, T_A, T_B, m):
@@ -2394,3 +2446,51 @@ def _binarize_pan(pan, threshold, bfs_indices, n_processed):
23942446
"""
23952447
idx = bfs_indices[:n_processed]
23962448
pan[idx] = np.where(pan[idx] <= threshold, 0.0, 1.0)
2449+
2450+
2451+
def _select_P_ABBA_value(P_ABBA, k, custom_func=None):
2452+
"""
2453+
A convenience function for returning the `k`th smallest value from the `P_ABBA`
2454+
array or use a custom function to specify what `P_ABBA` value to return.
2455+
2456+
The MPdist distance measure considers two time series to be similar if they share
2457+
many subsequences, regardless of the order of matching subsequences. MPdist
2458+
concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
2459+
value as the reported distance. Note that MPdist is a measure and not a metric.
2460+
Therefore, it does not obey the triangular inequality but the method is highly
2461+
scalable.
2462+
2463+
Parameters
2464+
----------
2465+
P_ABBA : numpy.ndarray
2466+
An unsorted array resulting from the concatenation of the outputs from an
2467+
AB-joinand BA-join for two time series, `T_A` and `T_B`
2468+
2469+
k : int
2470+
Specify the `k`th value in the concatenated matrix profiles to return. This
2471+
parameter is ignored when `k_func` is not None.
2472+
2473+
custom_func : object, default None
2474+
A custom user defined function for selecting the desired value from the
2475+
unsorted `P_ABBA` array. This function may need to leverage `functools.partial`
2476+
and should take `P_ABBA` as its only input parameter and return a single
2477+
`MPdist` value. The `percentage` and `k` parameters are ignored when
2478+
`custom_func` is not None.
2479+
2480+
Returns
2481+
-------
2482+
MPdist : float
2483+
The matrix profile distance
2484+
"""
2485+
k = min(int(k), P_ABBA.shape[0] - 1)
2486+
if custom_func is not None:
2487+
MPdist = custom_func(P_ABBA)
2488+
else:
2489+
partition = np.partition(P_ABBA, k)
2490+
MPdist = partition[k]
2491+
if ~np.isfinite(MPdist):
2492+
partition[:k].sort()
2493+
k = max(0, np.count_nonzero(np.isfinite(partition[:k])) - 1)
2494+
MPdist = partition[k]
2495+
2496+
return MPdist

stumpy/mpdist.py

Lines changed: 20 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55
import numpy as np
66
import math
77

8-
from . import stump, stumped, core
9-
from .core import _mass_distance_matrix
8+
from . import core, stump, stumped
109
from .aampdist import aampdist, aampdisted
1110

1211

@@ -74,54 +73,6 @@ def _compute_P_ABBA(
7473
P_ABBA[n_A - m + 1 :] = partial_mp_func(T_B, m, T_A, ignore_trivial=False)[:, 0]
7574

7675

77-
def _select_P_ABBA_value(P_ABBA, k, custom_func=None):
78-
"""
79-
A convenience function for returning the `k`th smallest value from the `P_ABBA`
80-
array or use a custom function to specify what `P_ABBA` value to return.
81-
82-
The MPdist distance measure considers two time series to be similar if they share
83-
many subsequences, regardless of the order of matching subsequences. MPdist
84-
concatenates the output of an AB-join and a BA-join and returns the `k`th smallest
85-
value as the reported distance. Note that MPdist is a measure and not a metric.
86-
Therefore, it does not obey the triangular inequality but the method is highly
87-
scalable.
88-
89-
Parameters
90-
----------
91-
P_ABBA : numpy.ndarray
92-
An unsorted array resulting from the concatenation of the outputs from an
93-
AB-joinand BA-join for two time series, `T_A` and `T_B`
94-
95-
k : int
96-
Specify the `k`th value in the concatenated matrix profiles to return. This
97-
parameter is ignored when `k_func` is not None.
98-
99-
custom_func : object, default None
100-
A custom user defined function for selecting the desired value from the
101-
unsorted `P_ABBA` array. This function may need to leverage `functools.partial`
102-
and should take `P_ABBA` as its only input parameter and return a single
103-
`MPdist` value. The `percentage` and `k` parameters are ignored when
104-
`custom_func` is not None.
105-
106-
Returns
107-
-------
108-
MPdist : float
109-
The matrix profile distance
110-
"""
111-
k = min(int(k), P_ABBA.shape[0] - 1)
112-
if custom_func is not None:
113-
MPdist = custom_func(P_ABBA)
114-
else:
115-
partition = np.partition(P_ABBA, k)
116-
MPdist = partition[k]
117-
if ~np.isfinite(MPdist):
118-
partition[:k].sort()
119-
k = max(0, np.count_nonzero(np.isfinite(partition[:k])) - 1)
120-
MPdist = partition[k]
121-
122-
return MPdist
123-
124-
12576
def _mpdist(
12677
T_A,
12778
T_B,
@@ -211,7 +162,7 @@ def _mpdist(
211162
percentage = np.clip(percentage, 0.0, 1.0)
212163
k = min(math.ceil(percentage * (n_A + n_B)), n_A - m + 1 + n_B - m + 1 - 1)
213164

214-
MPdist = _select_P_ABBA_value(P_ABBA, k, custom_func)
165+
MPdist = core._select_P_ABBA_value(P_ABBA, k, custom_func)
215166

216167
return MPdist
217168

@@ -220,10 +171,13 @@ def _mpdist_vect(
220171
Q,
221172
T,
222173
m,
174+
μ_Q,
175+
σ_Q,
176+
M_T,
177+
Σ_T,
223178
percentage=0.05,
224179
k=None,
225180
custom_func=None,
226-
distance_matrix_func=_mass_distance_matrix,
227181
):
228182
"""
229183
Compute the matrix profile distance measure vector between `Q` and each subsequence,
@@ -240,6 +194,18 @@ def _mpdist_vect(
240194
m : int
241195
Window size
242196
197+
μ_Q : numpy.ndarray
198+
Sliding mean of `Q`
199+
200+
σ_Q : numpy.ndarray
201+
Sliding standard deviation of `Q`
202+
203+
M_T : numpy.ndarray
204+
Sliding mean of `T`
205+
206+
Σ_T : numpy.ndarray
207+
Sliding standard deviation of `T`
208+
243209
percentage : float, 0.05
244210
The percentage of distances that will be used to report `mpdist`. The value
245211
is between 0.0 and 1.0. This parameter is ignored when `k` is not `None` or when
@@ -256,9 +222,6 @@ def _mpdist_vect(
256222
and should take `P_ABBA` as its only input parameter and return a single
257223
`MPdist` value. The `percentage` and `k` parameters are ignored when
258224
`custom_func` is not None.
259-
260-
distance_matrix_func : object, default _mass_distance_matrix
261-
The function to use to compute the distance matrix between `Q` and `T`
262225
"""
263226
j = Q.shape[0] - m + 1 # `k` is reserved for `P_ABBA` selection
264227
l = T.shape[0] - m + 1
@@ -272,15 +235,15 @@ def _mpdist_vect(
272235

273236
k = min(int(k), P_ABBA.shape[0] - 1)
274237

275-
distance_matrix_func(Q, T, m, distance_matrix)
238+
core._mass_distance_matrix(Q, T, m, distance_matrix, μ_Q, σ_Q, M_T, Σ_T)
276239

277240
rolling_row_min = core.rolling_nanmin(distance_matrix, j)
278241
col_min = np.nanmin(distance_matrix, axis=0)
279242

280243
for i in range(MPdist_vect.shape[0]):
281244
P_ABBA[:j] = rolling_row_min[:, i]
282245
P_ABBA[j:] = col_min[i : i + j]
283-
MPdist_vect[i] = _select_P_ABBA_value(P_ABBA, k, custom_func)
246+
MPdist_vect[i] = core._select_P_ABBA_value(P_ABBA, k, custom_func)
284247

285248
return MPdist_vect
286249

stumpy/snippets.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ def _get_all_profiles(
9696
percentage = np.clip(percentage, 0.0, 1.0)
9797
s = min(math.ceil(percentage * m), m)
9898

99+
M_T, Σ_T = core.compute_mean_std(T, s)
100+
99101
# Iterate over non-overlapping subsequences, see Definition 3
100102
for i in range((n_padded // m) - 1):
101103
start = i * m
@@ -105,6 +107,10 @@ def _get_all_profiles(
105107
S_i,
106108
T,
107109
s,
110+
M_T[start : start + s],
111+
Σ_T[start : start + s],
112+
M_T,
113+
Σ_T,
108114
percentage=mpdist_percentage,
109115
k=mpdist_k,
110116
custom_func=mpdist_custom_func,
@@ -144,10 +150,12 @@ def snippets(
144150
145151
percentage : float, default 1.0
146152
With the length of each non-overlapping subsequence, `S[i]`, set to `m`, this
147-
is the percentage of `S[i]` (i.e., `percentage * m`) to set the `s` to. When
148-
`percentage == 1.0`, then the full length of `S[i]` is used to compute the
149-
`mpdist_vect`. When `percentage < 1.0`, then shorter subsequences from `S[i]`
150-
is used to compute `mpdist_vect`.
153+
is the percentage of `S[i]` (i.e., `percentage * m`) to set `s` (the
154+
sub-subsequence length) to. When `percentage == 1.0`, then the full length of
155+
`S[i]` is used to compute the `mpdist_vect`. When `percentage < 1.0`, then
156+
a shorter sub-subsequence length of `s = min(math.ceil(percentage * m), m)`
157+
from each `S[i]` is used to compute `mpdist_vect`. When `s` is not `None`, then
158+
the `percentage` parameter is ignored.
151159
152160
s : int, default None
153161
With the length of each non-overlapping subsequence, `S[i]`, set to `m`, this
@@ -221,6 +229,8 @@ def snippets(
221229
[1, 1, 2],
222230
[1, 3, 4]]))
223231
"""
232+
T = core._preprocess(T)
233+
224234
if m > T.shape[0] // 2: # pragma: no cover
225235
raise ValueError(
226236
f"The snippet window size of {m} is too large for a time series with "

tests/naive.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,24 @@ def distance(a, b, axis=0, p=2.0):
1717
return np.linalg.norm(a - b, axis=axis, ord=p)
1818

1919

20+
def compute_mean_std(T, m):
    """
    Naive reference implementation of the sliding mean and standard deviation of
    every length-`m` window of `T`.

    Parameters
    ----------
    T : numpy.ndarray
        Time series or sequence. It is not modified.

    m : int
        Window size

    Returns
    -------
    M_T : numpy.ndarray
        Sliding mean of `T`. Windows that contain a `np.nan` or `np.inf` value are
        reported as `np.inf`.

    Σ_T : numpy.ndarray
        Sliding standard deviation of `T`, computed while skipping `np.nan`/`np.inf`
        values. Windows for which this is undefined are reported as `0`.
    """
    n_windows = T.shape[0] - m + 1

    M_T = np.zeros(n_windows, dtype=float)
    Σ_T = np.zeros(n_windows, dtype=float)

    for i in range(n_windows):
        # Work on a float copy so that `np.nan` can be written into the window even
        # when `T` has an integer dtype, and so that `T` itself is left untouched
        Q = T[i : i + m].astype(float)
        Q[np.isinf(Q)] = np.nan

        M_T[i] = np.mean(Q)  # NaN-propagating on purpose: flags non-finite windows
        Σ_T[i] = np.nanstd(Q)

    M_T[np.isnan(M_T)] = np.inf
    Σ_T[np.isnan(Σ_T)] = 0
    return M_T, Σ_T
36+
37+
2038
def apply_exclusion_zone(a, trivial_idx, excl_zone, val):
2139
start = max(0, trivial_idx - excl_zone)
2240
stop = min(a.shape[-1], trivial_idx + excl_zone + 1)

0 commit comments

Comments
 (0)