# Source code for stumpy.mstump

```
# STUMPY
# Copyright 2019 TD Ameritrade. Released under the terms of the 3-Clause BSD license.
# STUMPY is a trademark of TD Ameritrade IP Company, Inc. All rights reserved.
from functools import lru_cache, partial
import numpy as np
from numba import njit, prange
from scipy.stats import norm
from . import config, core
from .maamp import maamp, maamp_mdl, maamp_multi_distance_profile, maamp_subspace
def _multi_mass(
Q,
T,
m,
M_T,
Σ_T,
μ_Q,
σ_Q,
T_subseq_isconstant,
Q_subseq_isconstant,
query_idx=None,
):
"""
A multi-dimensional wrapper around "Mueen's Algorithm for Similarity Search"
(MASS) to compute multi-dimensional distance profile.
Parameters
----------
Q : numpy.ndarray
Query array or subsequence
T : numpy.ndarray
Time series array or sequence
m : int
Window size
M_T : numpy.ndarray
Sliding mean for `T`
Σ_T : numpy.ndarray
Sliding standard deviation for `T`
μ_Q : numpy.ndarray
Mean value of `Q`
σ_Q : numpy.ndarray
Standard deviation of `Q`
T_subseq_isconstant : numpy.ndarray
A boolean array that indicates whether a subsequence in `T` is constant (True)
Q_subseq_isconstant : numpy.ndarray
A boolean array that indicates whether the subsequence in `Q` is constant (True)
query_idx : int, default None
This is the index position along each of the time series in `T`, where
the query subsequence, `Q`, is located. `query_idx` should be set to None
if `Q` is not a subsequence of `T`. If `Q` is a subsequence of `T`, provding
this argument is optional. If query_idx is provided, the distance between Q
and `T[:, query_idx : query_idx + m]` will automatically be set to zero.
Returns
-------
D : numpy.ndarray
Multi-dimensional distance profile
"""
d, n = T.shape
l = n - m + 1
D = np.empty((d, l), dtype=np.float64)
for i in range(d):
if np.isinf(μ_Q[i]):
D[i, :] = np.inf
else:
D[i, :] = core.mass(
Q[i],
T[i],
M_T[i],
Σ_T[i],
T_subseq_isconstant=T_subseq_isconstant[i],
Q_subseq_isconstant=Q_subseq_isconstant[i],
query_idx=query_idx,
)
return D
[docs]
@core.non_normalized(maamp_subspace, exclude=["normalize", "T_subseq_isconstant"])
def subspace(
T,
m,
subseq_idx,
nn_idx,
k,
include=None,
discords=False,
discretize_func=None,
n_bit=8,
normalize=True,
p=2.0,
T_subseq_isconstant=None,
):
"""
Compute the k-dimensional matrix profile subspace for a given subsequence index and
its nearest neighbor index
Parameters
----------
T : numpy.ndarray
The time series or sequence for which the multi-dimensional matrix profile,
multi-dimensional matrix profile indices were computed
m : int
Window size
subseq_idx : int
The subsequence index in T
nn_idx : int
The nearest neighbor index in T
k : int
The subset number of dimensions out of `D = T.shape[0]`-dimensions to return
the subspace for. Note that zero-based indexing is used.
include : numpy.ndarray, default None
A list of (zero-based) indices corresponding to the dimensions in `T` that
must be included in the constrained multidimensional motif search.
For more information, see Section IV D in:
`DOI: 10.1109/ICDM.2017.66 \
<https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__
discords : bool, default False
When set to `True`, this reverses the distance profile to favor discords rather
than motifs. Note that indices in `include` are still maintained and respected.
discretize_func : func, default None
A function for discretizing each input array. When this is `None`, an
appropriate discretization function (based on the `normalize` parameter) will
be applied.
n_bit : int, default 8
The number of bits used for discretization. For more information on an
appropriate value, see Figure 4 in:
`DOI: 10.1109/ICDM.2016.0069 \
<https://www.cs.ucr.edu/~eamonn/PID4481999_Matrix%20Profile_III.pdf>`__
and Figure 2 in:
`DOI: 10.1109/ICDM.2011.54 \
<https://www.cs.ucr.edu/~eamonn/ICDM_mdl.pdf>`__
normalize : bool, default True
When set to `True`, this z-normalizes subsequences prior to computing distances.
Otherwise, this function gets re-routed to its complementary non-normalized
equivalent set in the `@core.non_normalized` function decorator.
p : float, default 2.0
The p-norm to apply for computing the Minkowski distance. Minkowski distance is
typically used with `p` being 1 or 2, which correspond to the Manhattan distance
and the Euclidean distance, respectively. This parameter is ignored when
`normalize == True`.
T_subseq_isconstant : numpy.ndarray, function, or list, default None
A parameter that is used to show whether a subsequence of a time series in `T`
is constant (True) or not. T_subseq_isconstant can be a 2D boolean numpy.ndarry
or a function that can be applied to each time series in `T`. Alternatively, for
maximum flexibility, a list (with length equal to the total number of time
series) may also be used. In this case, T_subseq_isconstant[i] corresponds to
the i-th time series T[i] and each element in the list can either be a 1D
boolean np.ndarray, a function, or None.
Returns
-------
S : numpy.ndarray
An array that contains the (singular) `k`th-dimensional subspace for the
subsequence with index equal to `subseq_idx`. Note that `k+1` rows will be
returned.
See Also
--------
stumpy.mstump : Compute the multi-dimensional z-normalized matrix profile
stumpy.mstumped : Compute the multi-dimensional z-normalized matrix profile with
a distributed dask cluster
stumpy.mdl : Compute the number of bits needed to compress one array with another
using the minimum description length (MDL)
Examples
--------
>>> import stumpy
>>> import numpy as np
>>> mps, indices = stumpy.mstump(
... np.array([[584., -11., 23., 79., 1001., 0., -19.],
... [ 1., 2., 4., 8., 16., 0., 32.]]),
... m=3)
>>> motifs_idx = np.argsort(mps, axis=1)[:, :2]
>>> k = 1
>>> stumpy.subspace(
... np.array([[584., -11., 23., 79., 1001., 0., -19.],
... [ 1., 2., 4., 8., 16., 0., 32.]]),
... m=3,
... subseq_idx=motifs_idx[k][0],
... nn_idx=indices[k][motifs_idx[k][0]],
... k=k)
array([0, 1])
"""
T = core._preprocess(T)
core.check_window_size(m, max_size=T.shape[-1])
T_subseq_isconstant = core.process_isconstant(T, m, T_subseq_isconstant)
if discretize_func is None:
bins = _inverse_norm(n_bit)
discretize_func = partial(_discretize, bins=bins)
subseqs, _, _, _ = core.preprocess(T[:, subseq_idx : subseq_idx + m], m)
subseqs = core.z_norm(subseqs, axis=1)
subseqs[T_subseq_isconstant[:, subseq_idx]] = 0.0
neighbors, _, _, _ = core.preprocess(T[:, nn_idx : nn_idx + m], m)
neighbors = core.z_norm(neighbors, axis=1)
neighbors[T_subseq_isconstant[:, nn_idx]] = 0.0
disc_subseqs = discretize_func(subseqs)
disc_neighbors = discretize_func(neighbors)
D = np.linalg.norm(disc_subseqs - disc_neighbors, axis=1)
S = core._subspace(D, k, include=include, discords=discords)
return S
@lru_cache()
def _inverse_norm(n_bit=8): # pragma: no cover
"""
Generate bin edges from an inverse normal distribution
This distribution is best suited for z-normalized time series data
Parameters
----------
n_bit : int, default 8
The number of bits to be used in generating the inverse normal distribution
Returns
-------
out : numpy.ndarray
Array of bin edges that can be used for data discretization
"""
return norm.ppf(np.arange(1, (2**n_bit)) / (2**n_bit))
def _discretize(a, bins, right=True): # pragma: no cover
"""
Discretize each row of the input array
This is equivalent to `np.searchsorted(bins, a)`
Parameters
----------
a : numpy.ndarray
The input array
bins : numpy.ndarray
The bin edges used to discretize `a`
right : bool, default True
Indicates whether the intervals for binning include the right or the left bin
edge.
Returns
-------
out : numpy.ndarray
Discretized array
"""
return np.digitize(a, bins, right=right)
[docs]
@core.non_normalized(maamp_mdl, exclude=["normalize", "T_subseq_isconstant"])
def mdl(
T,
m,
subseq_idx,
nn_idx,
include=None,
discords=False,
discretize_func=None,
n_bit=8,
normalize=True,
p=2.0,
T_subseq_isconstant=None,
):
"""
Compute the multi-dimensional number of bits needed to compress one
multi-dimensional subsequence with another along each of the k-dimensions
using the minimum description length (MDL)
Parameters
----------
T : numpy.ndarray
The time series or sequence for which the multi-dimensional matrix profile,
multi-dimensional matrix profile indices were computed
m : int
Window size
subseq_idx : numpy.ndarray
The multi-dimensional subsequence indices in T
nn_idx : numpy.ndarray
The multi-dimensional nearest neighbor index in T
include : numpy.ndarray, default None
A list of (zero-based) indices corresponding to the dimensions in `T` that
must be included in the constrained multidimensional motif search.
For more information, see Section IV D in:
`DOI: 10.1109/ICDM.2017.66 \
<https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__
discords : bool, default False
When set to `True`, this reverses the distance profile to favor discords rather
than motifs. Note that indices in `include` are still maintained and respected.
discretize_func : func, default None
A function for discretizing each input array. When this is `None`, an
appropriate discretization function (based on the `normalization` parameter)
will be applied.
n_bit : int, default 8
The number of bits used for discretization and for computing the bit size. For
more information on an appropriate value, see Figure 4 in:
`DOI: 10.1109/ICDM.2016.0069 \
<https://www.cs.ucr.edu/~eamonn/PID4481999_Matrix%20Profile_III.pdf>`__
and Figure 2 in:
`DOI: 10.1109/ICDM.2011.54 \
<https://www.cs.ucr.edu/~eamonn/ICDM_mdl.pdf>`__
normalize : bool, default True
When set to `True`, this z-normalizes subsequences prior to computing distances.
Otherwise, this function gets re-routed to its complementary non-normalized
equivalent set in the `@core.non_normalized` function decorator.
p : float, default 2.0
The p-norm to apply for computing the Minkowski distance. Minkowski distance is
typically used with `p` being 1 or 2, which correspond to the Manhattan distance
and the Euclidean distance, respectively. This parameter is ignored when
`normalize == True`.
T_subseq_isconstant : numpy.ndarray, function, or list, default None
A parameter that is used to show whether a subsequence of a time series in `T`
is constant (True) or not. T_subseq_isconstant can be a 2D boolean numpy.ndarry
or a function that can be applied to each time series in `T`. Alternatively, for
maximum flexibility, a list (with length equal to the total number of time
series) may also be used. In this case, T_subseq_isconstant[i] corresponds to
the i-th time series T[i] and each element in the list can either be a 1D
boolean np.ndarray, a function, or None.
Returns
-------
bit_sizes : numpy.ndarray
The total number of bits computed from MDL for representing each pair of
multidimensional subsequences.
S : list
A list of numpy.ndarrays that contain the `k`th-dimensional subspaces
See Also
--------
stumpy.mstump : Compute the multi-dimensional z-normalized matrix profile
stumpy.mstumped : Compute the multi-dimensional z-normalized matrix profile with
a distributed dask cluster
stumpy.subspace : Compute the k-dimensional matrix profile subspace for a given
subsequence index and its nearest neighbor index
Examples
--------
>>> import stumpy
>>> import numpy as np
>>> mps, indices = stumpy.mstump(
... np.array([[584., -11., 23., 79., 1001., 0., -19.],
... [ 1., 2., 4., 8., 16., 0., 32.]]),
... m=3)
>>> motifs_idx = np.argsort(mps, axis=1)[:, 0]
>>> stumpy.mdl(
... np.array([[584., -11., 23., 79., 1001., 0., -19.],
... [ 1., 2., 4., 8., 16., 0., 32.]]),
... m=3,
... subseq_idx=motifs_idx,
... nn_idx=indices[np.arange(motifs_idx.shape[0]), motifs_idx])
(array([ 80. , 111.509775]), [array([1]), array([0, 1])])
"""
T = core._preprocess(T)
core.check_window_size(m, max_size=T.shape[-1])
T_subseq_isconstant = core.process_isconstant(T, m, T_subseq_isconstant)
if discretize_func is None:
bins = _inverse_norm(n_bit)
discretize_func = partial(_discretize, bins=bins)
bit_sizes = np.empty(T.shape[0])
S = [None] * T.shape[0]
for k in range(T.shape[0]):
subseqs, _, _, _ = core.preprocess(T[:, subseq_idx[k] : subseq_idx[k] + m], m)
subseqs = core.z_norm(subseqs, axis=1)
subseqs[T_subseq_isconstant[:, subseq_idx[k]]] = 0.0
neighbors, _, _, _ = core.preprocess(T[:, nn_idx[k] : nn_idx[k] + m], m)
neighbors = core.z_norm(neighbors, axis=1)
neighbors[T_subseq_isconstant[:, nn_idx[k]]] = 0.0
disc_subseqs = discretize_func(subseqs)
disc_neighbors = discretize_func(neighbors)
D = np.linalg.norm(disc_subseqs - disc_neighbors, axis=1)
S[k] = core._subspace(D, k, include=include, discords=discords)
bit_sizes[k] = core._mdl(disc_subseqs, disc_neighbors, S[k], n_bit=n_bit)
return bit_sizes, S
def _multi_distance_profile(
query_idx,
T_A,
T_B,
m,
M_T,
Σ_T,
μ_Q,
σ_Q,
T_subseq_isconstant,
Q_subseq_isconstant,
include=None,
discords=False,
excl_zone=None,
):
"""
Multi-dimensional wrapper to compute the multi-dimensional distance profile for a
given query window within the times series or sequence that is denoted by the
`query_idx` index. Essentially, this is a convenience wrapper around `_multi_mass`.
Parameters
----------
query_idx : int
The start index of the (multi-dimensional) query subsequence in `T_B`
T_A : numpy.ndarray
The time series or sequence for which the multi-dimensional distance profile
is computed
T_B : numpy.ndarray
The time series or sequence that contains your query subsequences
m : int
Window size
M_T : numpy.ndarray
Sliding mean for `T_A`
Σ_T : numpy.ndarray
Sliding standard deviation for `T_A`
μ_Q : numpy.ndarray
Sliding mean for the query subsequence `T_B`
σ_Q : numpy.ndarray
Sliding standard deviation for the query subsequence `T_B`
T_subseq_isconstant : numpy.ndarray
A boolean array that indicates whether a subsequence in `T_A` is constant (True)
Q_subseq_isconstant : numpy.ndarray
A boolean array that indicates whether the subsequence in `T_B` is
constant (True)
include : numpy.ndarray, default None
A list of (zero-based) indices corresponding to the dimensions in `T` that
must be included in the constrained multidimensional motif search.
For more information, see Section IV D in:
`DOI: 10.1109/ICDM.2017.66 \
<https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__
discords : bool, default False
When set to `True`, this reverses the distance profile to favor discords rather
than motifs. Note that indices in `include` are still maintained and respected.
excl_zone : int, default None
The half width for the exclusion zone relative to the `query_idx`.
Returns
-------
D : numpy.ndarray
Multi-dimensional distance profile for the window with index equal to
`query_idx`
"""
d, n = T_A.shape
l = n - m + 1
start_row_idx = 0
D = _multi_mass(
T_B[:, query_idx : query_idx + m],
T_A,
m,
M_T,
Σ_T,
μ_Q[:, query_idx],
σ_Q[:, query_idx],
T_subseq_isconstant,
np.expand_dims(Q_subseq_isconstant[:, query_idx], 1),
query_idx=query_idx,
)
if include is not None:
core._apply_include(D, include)
start_row_idx = include.shape[0]
if discords:
D[start_row_idx:][::-1].sort(axis=0, kind="mergesort")
else:
D[start_row_idx:].sort(axis=0, kind="mergesort")
D_prime = np.zeros(l, dtype=np.float64)
for i in range(d):
D_prime[:] = D_prime + D[i]
D[i, :] = D_prime / (i + 1)
if excl_zone is not None:
core.apply_exclusion_zone(D, query_idx, excl_zone, np.inf)
return D
@core.non_normalized(
maamp_multi_distance_profile, exclude=["normalize", "T_subseq_isconstant"]
)
def multi_distance_profile(
query_idx,
T,
m,
include=None,
discords=False,
normalize=True,
p=2.0,
T_subseq_isconstant=None,
):
"""
Multi-dimensional wrapper to compute the multi-dimensional distance profile for a
given query window within the times series or sequence that is denoted by the
`query_idx` index.
Parameters
----------
query_idx : int
The window index to calculate the multi-dimensional distance profile for
T : numpy.ndarray
The multi-dimensional time series or sequence for which the multi-dimensional
distance profile will be returned
m : int
Window size
include : numpy.ndarray, default None
A list of (zero-based) indices corresponding to the dimensions in `T` that
must be included in the constrained multidimensional motif search.
For more information, see Section IV D in:
`DOI: 10.1109/ICDM.2017.66 \
<https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__
discords : bool, default False
When set to `True`, this reverses the distance profile to favor discords rather
than motifs. Note that indices in `include` are still maintained and respected.
normalize : bool, default True
When set to `True`, this z-normalizes subsequences prior to computing distances.
Otherwise, this function gets re-routed to its complementary non-normalized
equivalent set in the `@core.non_normalized` function decorator.
p : float, default 2.0
The p-norm to apply for computing the Minkowski distance. Minkowski distance is
typically used with `p` being 1 or 2, which correspond to the Manhattan distance
and the Euclidean distance, respectively. This parameter is ignored when
`normalize == True`.
T_subseq_isconstant : numpy.ndarray, function, or list, default None
A parameter that is used to show whether a subsequence of a time series in `T`
is constant (True) or not. T_subseq_isconstant can be a 2D boolean numpy.ndarry
or a function that can be applied to each time series in `T`. Alternatively, for
maximum flexibility, a list (with length equal to the total number of time
series) may also be used. In this case, T_subseq_isconstant[i] corresponds to
the i-th time series T[i] and each element in the list can either be a 1D
boolean np.ndarray, a function, or None.
Returns
-------
D : numpy.ndarray
Multi-dimensional distance profile for the window with index equal to
`query_idx`
"""
T, M_T, Σ_T, T_subseq_isconstant = core.preprocess(
T, m, T_subseq_isconstant=T_subseq_isconstant
)
if T.ndim <= 1: # pragma: no cover
err = f"T is {T.ndim}-dimensional and must be at least 1-dimensional"
raise ValueError(f"{err}")
core.check_window_size(m, max_size=T.shape[1])
if include is not None: # pragma: no cover
include = core._preprocess_include(include)
excl_zone = int(
np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)
) # See Definition 3 and Figure 3
D = _multi_distance_profile(
query_idx,
T,
T,
m,
M_T,
Σ_T,
M_T,
Σ_T,
T_subseq_isconstant,
T_subseq_isconstant,
include,
discords,
excl_zone,
)
return D
def _get_first_mstump_profile(
start,
T_A,
T_B,
m,
excl_zone,
M_T,
Σ_T,
μ_Q,
σ_Q,
T_subseq_isconstant,
Q_subseq_isconstant,
include=None,
discords=False,
):
"""
Multi-dimensional wrapper to compute the multi-dimensional matrix profile
and multi-dimensional matrix profile index for a given window within the
times series or sequence that is denoted by the `start` index.
Essentially, this is a convenience wrapper around `_multi_mass`. This is a
convenience wrapper for the `_multi_distance_profile` function but does not
return the multi-dimensional matrix profile subspace.
Parameters
----------
start : int
The window index to calculate the first multi-dimensional matrix profile,
multi-dimensional matrix profile indices, and multi-dimensional subspace.
T_A : numpy.ndarray
The time series or sequence for which the multi-dimensional matrix profile,
multi-dimensional matrix profile indices, and multi-dimensional subspace will be
returned
T_B : numpy.ndarray
The time series or sequence that contains your query subsequences
m : int
Window size
excl_zone : int
The half width for the exclusion zone relative to the `start`.
M_T : numpy.ndarray
Sliding mean for `T_A`
Σ_T : numpy.ndarray
Sliding standard deviation for `T_A`
μ_Q : numpy.ndarray
Sliding mean for `T_B`
σ_Q : numpy.ndarray
Sliding standard deviation for `T_B`
T_subseq_isconstant : numpy.ndarray
A boolean array that indicates whether a subsequence in `T_A` is constant (True)
Q_subseq_isconstant : numpy.ndarray
A boolean array that indicates whether a (query) subsequence in `T_B` is
constant (True)
include : numpy.ndarray, default None
A list of (zero-based) indices corresponding to the dimensions in `T` that
must be included in the constrained multidimensional motif search.
For more information, see Section IV D in:
`DOI: 10.1109/ICDM.2017.66 \
<https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__
discords : bool, default False
When set to `True`, this reverses the distance profile to favor discords rather
than motifs. Note that indices in `include` are still maintained and respected.
Returns
-------
P : numpy.ndarray
Multi-dimensional matrix profile for the window with index equal to
`start`
I : numpy.ndarray
Multi-dimensional matrix profile indices for the window with index
equal to `start`
"""
D = _multi_distance_profile(
start,
T_A,
T_B,
m,
M_T,
Σ_T,
μ_Q,
σ_Q,
T_subseq_isconstant,
Q_subseq_isconstant,
include,
discords,
excl_zone,
)
d = T_A.shape[0]
P = np.full(d, np.inf, dtype=np.float64)
I = np.full(d, -1, dtype=np.int64)
for i in range(d):
min_index = np.argmin(D[i])
I[i] = min_index
P[i] = D[i, min_index]
if np.isinf(P[i]): # pragma nocover
I[i] = -1
return P, I
def _get_multi_QT(start, T, m):
"""
Multi-dimensional wrapper to compute the sliding dot product between
the query, `T[:, start:start+m])` and the time series, `T`.
Additionally, compute QT for the first window.
Parameters
----------
start : int
The window index for T_B from which to calculate the QT dot product
T : numpy.ndarray
The time series or sequence for which to compute the dot product
m : int
Window size
Returns
-------
QT : numpy.ndarray
Given `start`, return the corresponding multi-dimensional QT
QT_first : numpy.ndarray
Multi-dimensional QT for the first window
"""
d = T.shape[0]
l = T.shape[1] - m + 1
QT = np.empty((d, l), dtype=np.float64)
QT_first = np.empty((d, l), dtype=np.float64)
for i in range(d):
QT[i] = core.sliding_dot_product(T[i, start : start + m], T[i])
QT_first[i] = core.sliding_dot_product(T[i, :m], T[i])
return QT, QT_first
@njit(
# "(i8, i8, i8, f8[:, :], f8[:, :], i8, i8, f8[:, :], f8[:, :], f8[:, :],"
# "f8[:, :], f8[:, :], f8[:, :], f8[:, :])",
parallel=True,
fastmath=True,
)
def _compute_multi_D(
d,
k,
idx,
D,
T,
m,
excl_zone,
M_T,
Σ_T,
QT_even,
QT_odd,
QT_first,
μ_Q,
σ_Q,
Q_subseq_isconstant,
T_subseq_isconstant,
):
"""
A Numba JIT-compiled version of mSTOMP for parallel computation of the
multi-dimensional distance profile
Parameters
----------
d : int
The total number of dimensions in `T`
k : int
The total number of sliding windows to iterate over
idx : int
The subsequence index for the i-th time series, `T[i]`
D : numpy.ndarray
The output distance profile
T : numpy.ndarray
The time series or sequence for which to compute the matrix profile
m : int
Window size
excl_zone : int
The half width for the exclusion zone relative to the current
sliding window
M_T : numpy.ndarray
Sliding mean of time series, `T`
Σ_T : numpy.ndarray
Sliding standard deviation of time series, `T`
QT_even : numpy.ndarray
Dot product between some query sequence,`Q`, and time series, `T`
QT_odd : numpy.ndarray
Dot product between some query sequence,`Q`, and time series, `T`
QT_first : numpy.ndarray
QT for the first window relative to the current sliding window
μ_Q : numpy.ndarray
Mean of the query sequence, `Q`, relative to the current sliding window
σ_Q : numpy.ndarray
Standard deviation of the query sequence, `Q`, relative to the current
sliding window
Q_subseq_isconstant : numpy.ndarray
A boolean array that indicates whether the query subsequence in `Q`
is constant (True)
T_subseq_isconstant : numpy.ndarray
A boolean array that indicates whether a subsequence in `T`
is constant (True)
Returns
-------
None
Notes
-----
`DOI: 10.1109/ICDM.2017.66 \
<https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__
See mSTAMP Algorithm
"""
for i in range(d):
# Numba's prange requires incrementing a range by 1 so replace
# `for j in range(k-1,0,-1)` with its incrementing compliment
for rev_j in prange(1, k):
j = k - rev_j
# GPU Stomp Parallel Implementation with Numba
# DOI: 10.1109/ICDM.2016.0085
# See Figure 5
if idx % 2 == 0:
# Even
QT_even[i, j] = (
QT_odd[i, j - 1]
- T[i, idx - 1] * T[i, j - 1]
+ T[i, idx + m - 1] * T[i, j + m - 1]
)
else:
# Odd
QT_odd[i, j] = (
QT_even[i, j - 1]
- T[i, idx - 1] * T[i, j - 1]
+ T[i, idx + m - 1] * T[i, j + m - 1]
)
if idx % 2 == 0:
QT_even[i, 0] = QT_first[i, idx]
D[i] = core._calculate_squared_distance_profile(
m,
QT_even[i],
μ_Q[i, idx],
σ_Q[i, idx],
M_T[i],
Σ_T[i],
Q_subseq_isconstant[i, idx],
T_subseq_isconstant[i],
)
else:
QT_odd[i, 0] = QT_first[i, idx]
D[i] = core._calculate_squared_distance_profile(
m,
QT_odd[i],
μ_Q[i, idx],
σ_Q[i, idx],
M_T[i],
Σ_T[i],
Q_subseq_isconstant[i, idx],
T_subseq_isconstant[i],
)
core._apply_exclusion_zone(D, idx, excl_zone, np.inf)
def _mstump(
T,
m,
range_stop,
excl_zone,
M_T,
Σ_T,
QT,
QT_first,
μ_Q,
σ_Q,
T_subseq_isconstant,
Q_subseq_isconstant,
k,
range_start=1,
include=None,
discords=False,
):
"""
A Numba JIT-compiled version of mSTOMP, a variant of mSTAMP, for parallel
computation of the multi-dimensional matrix profile and multi-dimensional
matrix profile indices. Note that only self-joins are supported.
Parameters
----------
T : numpy.ndarray
The time series or sequence for which to compute the multi-dimensional
matrix profile
m : int
Window size
range_stop : int
The index value along T for which to stop the matrix profile
calculation. This parameter is here for consistency with the
distributed `mstumped` algorithm.
excl_zone : int
The half width for the exclusion zone relative to the current
sliding window
M_T : numpy.ndarray
Sliding mean of time series, `T`
Σ_T : numpy.ndarray
Sliding standard deviation of time series, `T`
QT : numpy.ndarray
Dot product between some query sequence,`Q`, and time series, `T`
QT_first : numpy.ndarray
QT for the first window relative to the current sliding window
μ_Q : numpy.ndarray
Mean of the query sequence, `Q`, relative to the current sliding window
σ_Q : numpy.ndarray
Standard deviation of the query sequence, `Q`, relative to the current
sliding window
T_subseq_isconstant : numpy.ndarray
A boolean array that indicates whether a subsequence in `T`
is constant (True)
Q_subseq_isconstant : numpy.ndarray
A boolean array that indicates whether the query subsequence in `Q`
is constant (True)
k : int
The total number of sliding windows to iterate over
range_start : int, default 1
The starting index value along T_B for which to start the matrix
profile calculation. Default is 1.
include : numpy.ndarray, default None
A list of (zero-based) indices corresponding to the dimensions in `T` that
must be included in the constrained multidimensional motif search.
For more information, see Section IV D in:
`DOI: 10.1109/ICDM.2017.66 \
<https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__
discords : bool, default False
When set to `True`, this reverses the distance profile to favor discords rather
than motifs. Note that indices in `include` are still maintained and respected.
Returns
-------
P : numpy.ndarray
The multi-dimensional matrix profile. Each row of the array corresponds
to each matrix profile for a given dimension (i.e., the first row is the
1-D matrix profile and the second row is the 2-D matrix profile).
I : numpy.ndarray
The multi-dimensional matrix profile index where each row of the array
corresponds to each matrix profile index for a given dimension.
Notes
-----
`DOI: 10.1109/ICDM.2017.66 \
<https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__
See mSTAMP Algorithm
"""
QT_odd = QT.copy()
QT_even = QT.copy()
d = T.shape[0]
P = np.empty((d, range_stop - range_start), dtype=np.float64)
I = np.empty((d, range_stop - range_start), dtype=np.int64)
D = np.empty((d, k), dtype=np.float64)
D_prime = np.empty(k, dtype=np.float64)
start_row_idx = 0
if include is not None:
tmp_swap = np.empty((include.shape[0], k), dtype=np.float64)
restricted_indices = include[include < include.shape[0]]
unrestricted_indices = include[include >= include.shape[0]]
mask = np.ones(include.shape[0], dtype=bool)
mask[restricted_indices] = False
for idx in range(range_start, range_stop):
_compute_multi_D(
d,
k,
idx,
D,
T,
m,
excl_zone,
M_T,
Σ_T,
QT_even,
QT_odd,
QT_first,
μ_Q,
σ_Q,
Q_subseq_isconstant,
T_subseq_isconstant,
)
# `include` processing must occur here since we are dealing with distances
if include is not None:
core._apply_include(
D, include, restricted_indices, unrestricted_indices, mask, tmp_swap
)
start_row_idx = include.shape[0]
if discords:
D[start_row_idx:][::-1].sort(axis=0)
else:
D[start_row_idx:].sort(axis=0)
core._compute_multi_PI(d, idx, D, D_prime, range_start, P, I)
return P, I
[docs]
@core.non_normalized(maamp, exclude=["normalize", "T_subseq_isconstant"])
def mstump(
T, m, include=None, discords=False, normalize=True, p=2.0, T_subseq_isconstant=None
):
"""
Compute the multi-dimensional z-normalized matrix profile
This is a convenience wrapper around the Numba JIT-compiled parallelized
`_mstump` function which computes the multi-dimensional matrix profile and
multi-dimensional matrix profile index according to mSTOMP, a variant of
mSTAMP. Note that only self-joins are supported.
Parameters
----------
T : numpy.ndarray
The time series or sequence for which to compute the multi-dimensional
matrix profile. Each row in `T` represents data from a different
dimension while each column in `T` represents data from the same
dimension.
m : int
Window size
include : list, numpy.ndarray, default None
A list of (zero-based) indices corresponding to the dimensions in `T` that
must be included in the constrained multidimensional motif search.
For more information, see Section IV D in:
`DOI: 10.1109/ICDM.2017.66 \
<https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__
discords : bool, default False
When set to `True`, this reverses the distance matrix which results in a
multi-dimensional matrix profile that favors larger matrix profile values
(i.e., discords) rather than smaller values (i.e., motifs). Note that indices
in `include` are still maintained and respected.
normalize : bool, default True
When set to `True`, this z-normalizes subsequences prior to computing distances.
Otherwise, this function gets re-routed to its complementary non-normalized
equivalent set in the `@core.non_normalized` function decorator.
p : float, default 2.0
The p-norm to apply for computing the Minkowski distance. Minkowski distance is
typically used with `p` being 1 or 2, which correspond to the Manhattan distance
and the Euclidean distance, respectively. This parameter is ignored when
`normalize == True`.
T_subseq_isconstant : numpy.ndarray, function, or list, default None
A parameter that is used to show whether a subsequence of a time series in `T`
is constant (True) or not. T_subseq_isconstant can be a 2D boolean numpy.ndarry
or a function that can be applied to each time series in `T`. Alternatively, for
maximum flexibility, a list (with length equal to the total number of time
series) may also be used. In this case, T_subseq_isconstant[i] corresponds to
the i-th time series T[i] and each element in the list can either be a 1D
boolean np.ndarray, a function, or None.
Returns
-------
P : numpy.ndarray
The multi-dimensional matrix profile. Each row of the array corresponds
to each matrix profile for a given dimension (i.e., the first row is
the 1-D matrix profile and the second row is the 2-D matrix profile).
I : numpy.ndarray
The multi-dimensional matrix profile index where each row of the array
corresponds to each matrix profile index for a given dimension.
See Also
--------
stumpy.mstumped : Compute the multi-dimensional z-normalized matrix profile with
a distributed dask cluster
stumpy.subspace : Compute the k-dimensional matrix profile subspace for a given
subsequence index and its nearest neighbor index
stumpy.mdl : Compute the number of bits needed to compress one array with another
using the minimum description length (MDL)
Notes
-----
`DOI: 10.1109/ICDM.2017.66 \
<https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__
See mSTAMP Algorithm
Examples
--------
>>> stumpy.mstump(
... np.array([[584., -11., 23., 79., 1001., 0., -19.],
... [ 1., 2., 4., 8., 16., 0., 32.]]),
... m=3)
(array([[0. , 1.43947142, 0. , 2.69407392, 0.11633857],
[0.777905 , 2.36179922, 1.50004632, 2.92246722, 0.777905 ]]),
array([[2, 4, 0, 1, 0],
[4, 4, 0, 1, 0]]))
"""
T_A = T
T_B = T_A
T_A = core._preprocess(T_A)
T_B = core._preprocess(T_B)
T_A_subseq_isconstant = T_subseq_isconstant
T_A_subseq_isconstant = core.process_isconstant(T_A, m, T_A_subseq_isconstant)
T_B_subseq_isconstant = T_A_subseq_isconstant
T_A, M_T, Σ_T, T_subseq_isconstant = core.preprocess(
T_A, m, T_subseq_isconstant=T_A_subseq_isconstant
)
T_B, μ_Q, σ_Q, Q_subseq_isconstant = core.preprocess(
T_B, m, T_subseq_isconstant=T_B_subseq_isconstant
)
if T_A.ndim <= 1: # pragma: no cover
err = f"T is {T_A.ndim}-dimensional and must be at least 1-dimensional"
raise ValueError(f"{err}")
core.check_window_size(m, max_size=min(T_A.shape[1], T_B.shape[1]))
if include is not None:
include = core._preprocess_include(include)
d, n = T_B.shape
l = n - m + 1
excl_zone = int(
np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)
) # See Definition 3 and Figure 3
P = np.empty((d, l), dtype=np.float64)
I = np.empty((d, l), dtype=np.int64)
start = 0
stop = l
P[:, start], I[:, start] = _get_first_mstump_profile(
start,
T_A,
T_B,
m,
excl_zone,
M_T,
Σ_T,
μ_Q,
σ_Q,
T_subseq_isconstant,
Q_subseq_isconstant,
include,
discords,
)
QT, QT_first = _get_multi_QT(start, T_A, m)
P[:, start + 1 : stop], I[:, start + 1 : stop] = _mstump(
T_A,
m,
stop,
excl_zone,
M_T,
Σ_T,
QT,
QT_first,
μ_Q,
σ_Q,
T_subseq_isconstant,
Q_subseq_isconstant,
l,
start + 1,
include,
discords,
)
return P, I
```