Source code for stumpy.mstump

# STUMPY
# Copyright 2019 TD Ameritrade. Released under the terms of the 3-Clause BSD license.
# STUMPY is a trademark of TD Ameritrade IP Company, Inc. All rights reserved.

import logging

import numpy as np
from scipy.stats import norm
from numba import njit, prange
from functools import lru_cache, partial

from . import core, config
from .maamp import maamp_multi_distance_profile, maamp, maamp_subspace, maamp_mdl

logger = logging.getLogger(__name__)


def _preprocess_include(include):
    """
    A utility function for processing the `include` input

    Parameters
    ----------
    include : numpy.ndarray
        A list of (zero-based) indices corresponding to the dimensions in `T` that
        must be included in the constrained multidimensional motif search.
        For more information, see Section IV D in:

        `DOI: 10.1109/ICDM.2017.66 \
        <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__

    Returns
    -------
    include : numpy.ndarray
        Process `include` and remove any redundant index values
    """
    include = np.asarray(include)
    _, idx = np.unique(include, return_index=True)
    if include.shape[0] != idx.shape[0]:  # pragma: no cover
        logger.warning("Removed repeating indices in `include`")
        include = include[np.sort(idx)]

    return include


def _apply_include(
    D,
    include,
    restricted_indices=None,
    unrestricted_indices=None,
    mask=None,
    tmp_swap=None,
):
    """
    Apply a transformation to the multi-dimensional distance profile so that specific
    dimensions are always included. Essentially, it is swapping rows within the distance
    profile.

    Parameters
    ----------
    D : numpy.ndarray
        The multi-dimensional distance profile

    include : numpy.ndarray
        A list of (zero-based) indices corresponding to the dimensions in `T` that
        must be included in the constrained multidimensional motif search.
        For more information, see Section IV D in:

        `DOI: 10.1109/ICDM.2017.66 \
        <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__

    restricted_indices : numpy.ndarray, default None
        A list of indices specified in `include` that reside in the first
        `include.shape[0]` rows

    unrestricted_indices : numpy.ndarray, default None
        A list of indices specified in `include` that do not reside in the first
        `include.shape[0]` rows

    mask : numpy.ndarray, default None
        A boolean mask to select for unrestricted indices

    tmp_swap : numpy.ndarray, default None
        A reusable array to aid in array element swapping
    """
    include = _preprocess_include(include)

    if restricted_indices is None:
        restricted_indices = include[include < include.shape[0]]

    if unrestricted_indices is None:
        unrestricted_indices = include[include >= include.shape[0]]

    if mask is None:
        mask = np.ones(include.shape[0], dtype=bool)
        mask[restricted_indices] = False

    if tmp_swap is None:
        tmp_swap = D[: include.shape[0]].copy()
    else:
        tmp_swap[:] = D[: include.shape[0]]

    D[: include.shape[0]] = D[include]
    D[unrestricted_indices] = tmp_swap[mask]


def _multi_mass(Q, T, m, M_T, Σ_T, μ_Q, σ_Q):
    """
    A multi-dimensional wrapper around "Mueen's Algorithm for Similarity Search"
    (MASS) to compute multi-dimensional distance profile.

    Parameters
    ----------
    Q : numpy.ndarray
        Query array or subsequence

    T : numpy.ndarray
        Time series array or sequence

    m : int
        Window size

    M_T : numpy.ndarray
        Sliding mean for `T_A`

    Σ_T : numpy.ndarray
        Sliding standard deviation for `T_A`

    μ_Q : numpy.ndarray
        Mean value of `Q`

    σ_Q : numpy.ndarray
        Standard deviation of `Q`

    Returns
    -------
    D : numpy.ndarray
        Multi-dimensional distance profile
    """
    d, n = T.shape
    k = n - m + 1

    D = np.empty((d, k), dtype=np.float64)

    for i in range(d):
        if np.isinf(μ_Q[i]):
            D[i, :] = np.inf
        else:
            D[i, :] = core.mass(Q[i], T[i], M_T[i], Σ_T[i])

    return D


def _subspace(D, k, include=None, discords=False):
    """
    Compute the k-dimensional matrix profile subspace for a given subsequence index and
    its nearest neighbor index

    Parameters
    ----------
    D : numpy.ndarray
        The multi-dimensional distance profile

    k : int
        The subset number of dimensions out of `D = T.shape[0]`-dimensions to return
        the subspace for. Note that zero-based indexing is used.

    include : numpy.ndarray, default None
        A list of (zero-based) indices corresponding to the dimensions in `T` that
        must be included in the constrained multidimensional motif search.
        For more information, see Section IV D in:

        `DOI: 10.1109/ICDM.2017.66 \
        <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__

    discords : bool, default False
        When set to `True`, this reverses the distance profile to favor discords rather
        than motifs. Note that indices in `include` are still maintained and respected.

    Returns
    -------
        S : numpy.ndarray
        An array that contains the `k`th-dimensional subspace for the subsequence. Note
        that `k+1` rows will be returned.
    """
    if discords:
        sorted_idx = D[::-1].argsort(axis=0, kind="mergesort")
    else:
        sorted_idx = D.argsort(axis=0, kind="mergesort")

    # `include` processing occur here since we are dealing with indices, not distances
    if include is not None:
        include = _preprocess_include(include)
        mask = np.in1d(sorted_idx, include)
        include_idx = mask.nonzero()[0]
        exclude_idx = (~mask).nonzero()[0]
        sorted_idx[: include_idx.shape[0]], sorted_idx[include_idx.shape[0] :] = (
            sorted_idx[include_idx],
            sorted_idx[exclude_idx],
        )

    S = sorted_idx[: k + 1]

    return S


[docs]@core.non_normalized(maamp_subspace) def subspace( T, m, subseq_idx, nn_idx, k, include=None, discords=False, discretize_func=None, n_bit=8, normalize=True, p=2.0, ): """ Compute the k-dimensional matrix profile subspace for a given subsequence index and its nearest neighbor index Parameters ---------- T : numpy.ndarray The time series or sequence for which the multi-dimensional matrix profile, multi-dimensional matrix profile indices were computed m : int Window size subseq_idx : int The subsequence index in T nn_idx : int The nearest neighbor index in T k : int The subset number of dimensions out of `D = T.shape[0]`-dimensions to return the subspace for. Note that zero-based indexing is used. include : numpy.ndarray, default None A list of (zero-based) indices corresponding to the dimensions in `T` that must be included in the constrained multidimensional motif search. For more information, see Section IV D in: `DOI: 10.1109/ICDM.2017.66 \ <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__ discords : bool, default False When set to `True`, this reverses the distance profile to favor discords rather than motifs. Note that indices in `include` are still maintained and respected. discretize_func : func, default None A function for discretizing each input array. When this is `None`, an appropriate discretization function (based on the `normalize` parameter) will be applied. n_bit : int, default 8 The number of bits used for discretization. For more information on an appropriate value, see Figure 4 in: `DOI: 10.1109/ICDM.2016.0069 \ <https://www.cs.ucr.edu/~eamonn/PID4481999_Matrix%20Profile_III.pdf>`__ and Figure 2 in: `DOI: 10.1109/ICDM.2011.54 \ <https://www.cs.ucr.edu/~eamonn/ICDM_mdl.pdf>`__ normalize : bool, default True When set to `True`, this z-normalizes subsequences prior to computing distances. Otherwise, this function gets re-routed to its complementary non-normalized equivalent set in the `@core.non_normalized` function decorator. p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. Returns ------- S : numpy.ndarray An array that contains the (singular) `k`th-dimensional subspace for the subsequence with index equal to `subseq_idx`. Note that `k+1` rows will be returned. See Also -------- stumpy.mstump : Compute the multi-dimensional z-normalized matrix profile stumpy.mstumped : Compute the multi-dimensional z-normalized matrix profile with a distributed dask cluster stumpy.mdl : Compute the number of bits needed to compress one array with another using the minimum description length (MDL) Examples -------- >>> mps, indices = stumpy.mstump( ... np.array([[584., -11., 23., 79., 1001., 0., -19.], ... [ 1., 2., 4., 8., 16., 0., 32.]]), ... m=3) >>> motifs_idx = np.argsort(mps, axis=1)[:, :2] >>> stumpy.subspace( ... np.array([[584., -11., 23., 79., 1001., 0., -19.], ... [ 1., 2., 4., 8., 16., 0., 32.]]), ... m=3, ... subseq_idx=motifs_idx[k][0], ... nn_idx=indices[k][motifs_idx[k][0]], ... k=1) array([0, 1]) """ T = core._preprocess(T) core.check_window_size(m, max_size=T.shape[-1]) if discretize_func is None: bins = _inverse_norm(n_bit) discretize_func = partial(_discretize, bins=bins) subseqs, _, _ = core.preprocess(T[:, subseq_idx : subseq_idx + m], m) subseqs = core.z_norm(subseqs, axis=1) neighbors, _, _ = core.preprocess(T[:, nn_idx : nn_idx + m], m) neighbors = core.z_norm(neighbors, axis=1) disc_subseqs = discretize_func(subseqs) disc_neighbors = discretize_func(neighbors) D = np.linalg.norm(disc_subseqs - disc_neighbors, axis=1) S = _subspace(D, k, include=include, discords=discords) return S
@lru_cache() def _inverse_norm(n_bit=8): # pragma: no cover """ Generate bin edges from an inverse normal distribution This distribution is best suited for z-normalized time series data Parameters ---------- n_bit : int, default 8 The number of bits to be used in generating the inverse normal distribution Returns ------- out : numpy.ndarray Array of bin edges that can be used for data discretization """ return norm.ppf(np.arange(1, (2**n_bit)) / (2**n_bit)) def _discretize(a, bins, right=True): # pragma: no cover """ Discretize each row of the input array This is equivalent to `np.searchsorted(bins, a)` Parameters ---------- a : numpy.ndarray The input array bins : numpy.ndarray The bin edges used to discretize `a` right : bool, default True Indicates whether the intervals for binning include the right or the left bin edge. Returns ------- out : numpy.ndarray Discretized array """ return np.digitize(a, bins, right=right) def _mdl(disc_subseqs, disc_neighbors, S, n_bit=8): """ Compute the number of bits needed to compress one array with another using the minimum description length (MDL) Parameters ---------- disc_subseqs : numpy.ndarray The discretized array to be compressed disc_neighbors : numpy.ndarray The discretized array that will be used as a hypothesis for compression S : numpy.ndarray An array that contains the `k`th-dimensional subspace to be used n_bit : int, default 8 The number of bits to use for computing the bit size Returns ------- bit_size : float The total number of bits computed from MDL for representing both input arrays """ ndim = disc_subseqs.shape[0] sub_dims, m = disc_subseqs[S].shape n_val = len(np.unique(disc_subseqs[S] - disc_neighbors[S])) bit_size = n_bit * (2 * ndim * m - sub_dims * m) bit_size = bit_size + sub_dims * m * np.log2(n_val) + n_val * n_bit return bit_size
[docs]@core.non_normalized(maamp_mdl) def mdl( T, m, subseq_idx, nn_idx, include=None, discords=False, discretize_func=None, n_bit=8, normalize=True, p=2.0, ): """ Compute the multi-dimensional number of bits needed to compress one multi-dimensional subsequence with another along each of the k-dimensions using the minimum description length (MDL) Parameters ---------- T : numpy.ndarray The time series or sequence for which the multi-dimensional matrix profile, multi-dimensional matrix profile indices were computed m : int Window size subseq_idx : numpy.ndarray The multi-dimensional subsequence indices in T nn_idx : numpy.ndarray The multi-dimensional nearest neighbor index in T include : numpy.ndarray, default None A list of (zero-based) indices corresponding to the dimensions in `T` that must be included in the constrained multidimensional motif search. For more information, see Section IV D in: `DOI: 10.1109/ICDM.2017.66 \ <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__ discords : bool, default False When set to `True`, this reverses the distance profile to favor discords rather than motifs. Note that indices in `include` are still maintained and respected. discretize_func : func, default None A function for discretizing each input array. When this is `None`, an appropriate discretization function (based on the `normalization` parameter) will be applied. n_bit : int, default 8 The number of bits used for discretization and for computing the bit size. For more information on an appropriate value, see Figure 4 in: `DOI: 10.1109/ICDM.2016.0069 \ <https://www.cs.ucr.edu/~eamonn/PID4481999_Matrix%20Profile_III.pdf>`__ and Figure 2 in: `DOI: 10.1109/ICDM.2011.54 \ <https://www.cs.ucr.edu/~eamonn/ICDM_mdl.pdf>`__ normalize : bool, default True When set to `True`, this z-normalizes subsequences prior to computing distances. Otherwise, this function gets re-routed to its complementary non-normalized equivalent set in the `@core.non_normalized` function decorator. p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. Returns ------- bit_sizes : numpy.ndarray The total number of bits computed from MDL for representing each pair of multidimensional subsequences. S : list A list of numpy.ndarrays that contain the `k`th-dimensional subspaces See Also -------- stumpy.mstump : Compute the multi-dimensional z-normalized matrix profile stumpy.mstumped : Compute the multi-dimensional z-normalized matrix profile with a distributed dask cluster stumpy.subspace : Compute the k-dimensional matrix profile subspace for a given subsequence index and its nearest neighbor index Examples -------- >>> mps, indices = stumpy.mstump( ... np.array([[584., -11., 23., 79., 1001., 0., -19.], ... [ 1., 2., 4., 8., 16., 0., 32.]]), ... m=3) >>> motifs_idx = np.argsort(mps, axis=1)[:, 0] >>> stumpy.mdl( ... np.array([[584., -11., 23., 79., 1001., 0., -19.], ... [ 1., 2., 4., 8., 16., 0., 32.]]), ... m=3, ... subseq_idx=motifs_idx, ... nn_idx=indices[np.arange(motifs_idx.shape[0]), motifs_idx]) (array([ 80. , 111.509775]), [array([1]), array([0, 1])]) """ T = core._preprocess(T) core.check_window_size(m, max_size=T.shape[-1]) if discretize_func is None: bins = _inverse_norm(n_bit) discretize_func = partial(_discretize, bins=bins) bit_sizes = np.empty(T.shape[0]) S = [None] * T.shape[0] for k in range(T.shape[0]): subseqs, _, _ = core.preprocess(T[:, subseq_idx[k] : subseq_idx[k] + m], m) subseqs = core.z_norm(subseqs, axis=1) neighbors, _, _ = core.preprocess(T[:, nn_idx[k] : nn_idx[k] + m], m) neighbors = core.z_norm(neighbors, axis=1) disc_subseqs = discretize_func(subseqs) disc_neighbors = discretize_func(neighbors) D = np.linalg.norm(disc_subseqs - disc_neighbors, axis=1) S[k] = _subspace(D, k, include=include, discords=discords) bit_sizes[k] = _mdl(disc_subseqs, disc_neighbors, S[k], n_bit=n_bit) return bit_sizes, S
def _multi_distance_profile( query_idx, T_A, T_B, m, M_T, Σ_T, μ_Q, σ_Q, include=None, discords=False, excl_zone=None, ): """ Multi-dimensional wrapper to compute the multi-dimensional distance profile for a given query window within the times series or sequence that is denoted by the `query_idx` index. Essentially, this is a convenience wrapper around `_multi_mass`. Parameters ---------- query_idx : int The window index to calculate the multi-dimensional distance profile for T_A : numpy.ndarray The time series or sequence for which the multi-dimensional distance profile is computed T_B : numpy.ndarray The time series or sequence that contains your query subsequences m : int Window size M_T : numpy.ndarray Sliding mean for `T_A` Σ_T : numpy.ndarray Sliding standard deviation for `T_A` μ_Q : numpy.ndarray Sliding mean for the query subsequence `T_B` σ_Q : numpy.ndarray Sliding standard deviation for the query subsequence `T_B` include : numpy.ndarray, default None A list of (zero-based) indices corresponding to the dimensions in `T` that must be included in the constrained multidimensional motif search. For more information, see Section IV D in: `DOI: 10.1109/ICDM.2017.66 \ <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__ discords : bool, default False When set to `True`, this reverses the distance profile to favor discords rather than motifs. Note that indices in `include` are still maintained and respected. excl_zone : int, default None The half width for the exclusion zone relative to the `query_idx`. Returns ------- D : numpy.ndarray Multi-dimensional distance profile for the window with index equal to `query_idx` """ d, n = T_A.shape k = n - m + 1 start_row_idx = 0 D = _multi_mass( T_B[:, query_idx : query_idx + m], T_A, m, M_T, Σ_T, μ_Q[:, query_idx], σ_Q[:, query_idx], ) if include is not None: _apply_include(D, include) start_row_idx = include.shape[0] if discords: D[start_row_idx:][::-1].sort(axis=0, kind="mergesort") else: D[start_row_idx:].sort(axis=0, kind="mergesort") D_prime = np.zeros(k, dtype=np.float64) for i in range(d): D_prime[:] = D_prime + D[i] D[i, :] = D_prime / (i + 1) if excl_zone is not None: core.apply_exclusion_zone(D, query_idx, excl_zone, np.inf) return D @core.non_normalized(maamp_multi_distance_profile) def multi_distance_profile( query_idx, T, m, include=None, discords=False, normalize=True, p=2.0 ): """ Multi-dimensional wrapper to compute the multi-dimensional distance profile for a given query window within the times series or sequence that is denoted by the `query_idx` index. Parameters ---------- query_idx : int The window index to calculate the multi-dimensional distance profile for T : numpy.ndarray The multi-dimensional time series or sequence for which the multi-dimensional distance profile will be returned m : int Window size include : numpy.ndarray, default None A list of (zero-based) indices corresponding to the dimensions in `T` that must be included in the constrained multidimensional motif search. For more information, see Section IV D in: `DOI: 10.1109/ICDM.2017.66 \ <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__ discords : bool, default False When set to `True`, this reverses the distance profile to favor discords rather than motifs. Note that indices in `include` are still maintained and respected. normalize : bool, default True When set to `True`, this z-normalizes subsequences prior to computing distances. Otherwise, this function gets re-routed to its complementary non-normalized equivalent set in the `@core.non_normalized` function decorator. p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. Returns ------- D : numpy.ndarray Multi-dimensional distance profile for the window with index equal to `query_idx` """ T, M_T, Σ_T = core.preprocess(T, m) if T.ndim <= 1: # pragma: no cover err = f"T is {T.ndim}-dimensional and must be at least 1-dimensional" raise ValueError(f"{err}") core.check_window_size(m, max_size=T.shape[1]) if include is not None: # pragma: no cover include = _preprocess_include(include) excl_zone = int( np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM) ) # See Definition 3 and Figure 3 D = _multi_distance_profile( query_idx, T, T, m, M_T, Σ_T, M_T, Σ_T, include, discords, excl_zone ) return D def _get_first_mstump_profile( start, T_A, T_B, m, excl_zone, M_T, Σ_T, μ_Q, σ_Q, include=None, discords=False ): """ Multi-dimensional wrapper to compute the multi-dimensional matrix profile and multi-dimensional matrix profile index for a given window within the times series or sequence that is denoted by the `start` index. Essentially, this is a convenience wrapper around `_multi_mass`. This is a convenience wrapper for the `_multi_distance_profile` function but does not return the multi-dimensional matrix profile subspace. Parameters ---------- start : int The window index to calculate the first multi-dimensional matrix profile, multi-dimensional matrix profile indices, and multi-dimensional subspace. T_A : numpy.ndarray The time series or sequence for which the multi-dimensional matrix profile, multi-dimensional matrix profile indices, and multi-dimensional subspace will be returned T_B : numpy.ndarray The time series or sequence that contains your query subsequences m : int Window size excl_zone : int The half width for the exclusion zone relative to the `start`. M_T : numpy.ndarray Sliding mean for `T_A` Σ_T : numpy.ndarray Sliding standard deviation for `T_A` μ_Q : numpy.ndarray Sliding mean for `T_B` σ_Q : numpy.ndarray Sliding standard deviation for `T_B` include : numpy.ndarray, default None A list of (zero-based) indices corresponding to the dimensions in `T` that must be included in the constrained multidimensional motif search. For more information, see Section IV D in: `DOI: 10.1109/ICDM.2017.66 \ <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__ discords : bool, default False When set to `True`, this reverses the distance profile to favor discords rather than motifs. Note that indices in `include` are still maintained and respected. Returns ------- P : numpy.ndarray Multi-dimensional matrix profile for the window with index equal to `start` I : numpy.ndarray Multi-dimensional matrix profile indices for the window with index equal to `start` """ D = _multi_distance_profile( start, T_A, T_B, m, M_T, Σ_T, μ_Q, σ_Q, include, discords, excl_zone ) d = T_A.shape[0] P = np.full(d, np.inf, dtype=np.float64) I = np.full(d, -1, dtype=np.int64) for i in range(d): min_index = np.argmin(D[i]) I[i] = min_index P[i] = D[i, min_index] if np.isinf(P[i]): # pragma nocover I[i] = -1 return P, I def _get_multi_QT(start, T, m): """ Multi-dimensional wrapper to compute the sliding dot product between the query, `T[:, start:start+m])` and the time series, `T`. Additionally, compute QT for the first window. Parameters ---------- start : int The window index for T_B from which to calculate the QT dot product T : numpy.ndarray The time series or sequence for which to compute the dot product m : int Window size Returns ------- QT : numpy.ndarray Given `start`, return the corresponding multi-dimensional QT QT_first : numpy.ndarray Multi-dimensional QT for the first window """ d = T.shape[0] k = T.shape[1] - m + 1 QT = np.empty((d, k), dtype=np.float64) QT_first = np.empty((d, k), dtype=np.float64) for i in range(d): QT[i] = core.sliding_dot_product(T[i, start : start + m], T[i]) QT_first[i] = core.sliding_dot_product(T[i, :m], T[i]) return QT, QT_first @njit( # "(i8, i8, i8, f8[:, :], f8[:, :], i8, i8, f8[:, :], f8[:, :], f8[:, :]," # "f8[:, :], f8[:, :], f8[:, :], f8[:, :])", parallel=True, fastmath=True, ) def _compute_multi_D( d, k, idx, D, T, m, excl_zone, M_T, Σ_T, QT_even, QT_odd, QT_first, μ_Q, σ_Q ): """ A Numba JIT-compiled version of mSTOMP for parallel computation of the multi-dimensional distance profile Parameters ---------- d : int The total number of dimensions in `T` k : int The total number of sliding windows to iterate over idx : int The row index in `T` D : numpy.ndarray The output distance profile T : numpy.ndarray The time series or sequence for which to compute the matrix profile m : int Window size excl_zone : int The half width for the exclusion zone relative to the current sliding window M_T : numpy.ndarray Sliding mean of time series, `T` Σ_T : numpy.ndarray Sliding standard deviation of time series, `T` QT_even : numpy.ndarray Dot product between some query sequence,`Q`, and time series, `T` QT_odd : numpy.ndarray Dot product between some query sequence,`Q`, and time series, `T` QT_first : numpy.ndarray QT for the first window relative to the current sliding window μ_Q : numpy.ndarray Mean of the query sequence, `Q`, relative to the current sliding window σ_Q : numpy.ndarray Standard deviation of the query sequence, `Q`, relative to the current sliding window Notes ----- `DOI: 10.1109/ICDM.2017.66 \ <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__ See mSTAMP Algorithm """ for i in range(d): # Numba's prange requires incrementing a range by 1 so replace # `for j in range(k-1,0,-1)` with its incrementing compliment for rev_j in prange(1, k): j = k - rev_j # GPU Stomp Parallel Implementation with Numba # DOI: 10.1109/ICDM.2016.0085 # See Figure 5 if idx % 2 == 0: # Even QT_even[i, j] = ( QT_odd[i, j - 1] - T[i, idx - 1] * T[i, j - 1] + T[i, idx + m - 1] * T[i, j + m - 1] ) else: # Odd QT_odd[i, j] = ( QT_even[i, j - 1] - T[i, idx - 1] * T[i, j - 1] + T[i, idx + m - 1] * T[i, j + m - 1] ) if idx % 2 == 0: QT_even[i, 0] = QT_first[i, idx] D[i] = core._calculate_squared_distance_profile( m, QT_even[i], μ_Q[i, idx], σ_Q[i, idx], M_T[i], Σ_T[i] ) else: QT_odd[i, 0] = QT_first[i, idx] D[i] = core._calculate_squared_distance_profile( m, QT_odd[i], μ_Q[i, idx], σ_Q[i, idx], M_T[i], Σ_T[i] ) core._apply_exclusion_zone(D, idx, excl_zone, np.inf) @njit( # "(i8, i8, f8[:, :], f8[:], i8, f8[:, :], i8[:, :], f8)", parallel=True, fastmath=True, ) def _compute_PI(d, idx, D, D_prime, range_start, P, I, p=2.0): """ A Numba JIT-compiled version of mSTOMP for updating the matrix profile and matrix profile indices Parameters ---------- d : int The total number of dimensions in `T` idx : int The row index in `T` D : numpy.ndarray The distance profile D_prime : numpy.ndarray A reusable array for storing the column-wise cumulative sum of `D` range_start : int The starting index value along `T` for which to start the matrix profile calculation P : numpy.ndarray The matrix profile I : numpy.ndarray The matrix profile indices p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. """ D_prime[:] = 0.0 for i in range(d): D_prime = D_prime + np.power(D[i], 1.0 / p) min_index = np.argmin(D_prime) pos = idx - range_start I[i, pos] = min_index P[i, pos] = D_prime[min_index] / (i + 1) if np.isinf(P[i, pos]): # pragma nocover I[i, pos] = -1 def _mstump( T, m, range_stop, excl_zone, M_T, Σ_T, QT, QT_first, μ_Q, σ_Q, k, range_start=1, include=None, discords=False, ): """ A Numba JIT-compiled version of mSTOMP, a variant of mSTAMP, for parallel computation of the multi-dimensional matrix profile and multi-dimensional matrix profile indices. Note that only self-joins are supported. Parameters ---------- T: numpy.ndarray The time series or sequence for which to compute the multi-dimensional matrix profile m : int Window size range_stop : int The index value along T for which to stop the matrix profile calculation. This parameter is here for consistency with the distributed `mstumped` algorithm. excl_zone : int The half width for the exclusion zone relative to the current sliding window M_T : numpy.ndarray Sliding mean of time series, `T` Σ_T : numpy.ndarray Sliding standard deviation of time series, `T` QT : numpy.ndarray Dot product between some query sequence,`Q`, and time series, `T` QT_first : numpy.ndarray QT for the first window relative to the current sliding window μ_Q : numpy.ndarray Mean of the query sequence, `Q`, relative to the current sliding window σ_Q : numpy.ndarray Standard deviation of the query sequence, `Q`, relative to the current sliding window k : int The total number of sliding windows to iterate over range_start : int, default 1 The starting index value along T_B for which to start the matrix profile calculation. Default is 1. include : numpy.ndarray, default None A list of (zero-based) indices corresponding to the dimensions in `T` that must be included in the constrained multidimensional motif search. For more information, see Section IV D in: `DOI: 10.1109/ICDM.2017.66 \ <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__ discords : bool, default False When set to `True`, this reverses the distance profile to favor discords rather than motifs. Note that indices in `include` are still maintained and respected. Returns ------- P : numpy.ndarray The multi-dimensional matrix profile. Each row of the array corresponds to each matrix profile for a given dimension (i.e., the first row is the 1-D matrix profile and the second row is the 2-D matrix profile). I : numpy.ndarray The multi-dimensional matrix profile index where each row of the array corresponds to each matrix profile index for a given dimension. Notes ----- `DOI: 10.1109/ICDM.2017.66 \ <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__ See mSTAMP Algorithm """ QT_odd = QT.copy() QT_even = QT.copy() d = T.shape[0] P = np.empty((d, range_stop - range_start), dtype=np.float64) I = np.empty((d, range_stop - range_start), dtype=np.int64) D = np.empty((d, k), dtype=np.float64) D_prime = np.empty(k, dtype=np.float64) start_row_idx = 0 if include is not None: tmp_swap = np.empty((include.shape[0], k), dtype=np.float64) restricted_indices = include[include < include.shape[0]] unrestricted_indices = include[include >= include.shape[0]] mask = np.ones(include.shape[0], dtype=bool) mask[restricted_indices] = False for idx in range(range_start, range_stop): _compute_multi_D( d, k, idx, D, T, m, excl_zone, M_T, Σ_T, QT_even, QT_odd, QT_first, μ_Q, σ_Q ) # `include` processing must occur here since we are dealing with distances if include is not None: _apply_include( D, include, restricted_indices, unrestricted_indices, mask, tmp_swap ) start_row_idx = include.shape[0] if discords: D[start_row_idx:][::-1].sort(axis=0) else: D[start_row_idx:].sort(axis=0) _compute_PI(d, idx, D, D_prime, range_start, P, I) return P, I
[docs]@core.non_normalized(maamp) def mstump(T, m, include=None, discords=False, normalize=True, p=2.0): """ Compute the multi-dimensional z-normalized matrix profile This is a convenience wrapper around the Numba JIT-compiled parallelized `_mstump` function which computes the multi-dimensional matrix profile and multi-dimensional matrix profile index according to mSTOMP, a variant of mSTAMP. Note that only self-joins are supported. Parameters ---------- T : numpy.ndarray The time series or sequence for which to compute the multi-dimensional matrix profile. Each row in `T` represents data from a different dimension while each column in `T` represents data from the same dimension. m : int Window size include : list, numpy.ndarray, default None A list of (zero-based) indices corresponding to the dimensions in `T` that must be included in the constrained multidimensional motif search. For more information, see Section IV D in: `DOI: 10.1109/ICDM.2017.66 \ <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__ discords : bool, default False When set to `True`, this reverses the distance matrix which results in a multi-dimensional matrix profile that favors larger matrix profile values (i.e., discords) rather than smaller values (i.e., motifs). Note that indices in `include` are still maintained and respected. normalize : bool, default True When set to `True`, this z-normalizes subsequences prior to computing distances. Otherwise, this function gets re-routed to its complementary non-normalized equivalent set in the `@core.non_normalized` function decorator. p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. Returns ------- P : numpy.ndarray The multi-dimensional matrix profile. Each row of the array corresponds to each matrix profile for a given dimension (i.e., the first row is the 1-D matrix profile and the second row is the 2-D matrix profile). I : numpy.ndarray The multi-dimensional matrix profile index where each row of the array corresponds to each matrix profile index for a given dimension. See Also -------- stumpy.mstumped : Compute the multi-dimensional z-normalized matrix profile with a distributed dask cluster stumpy.subspace : Compute the k-dimensional matrix profile subspace for a given subsequence index and its nearest neighbor index stumpy.mdl : Compute the number of bits needed to compress one array with another using the minimum description length (MDL) Notes ----- `DOI: 10.1109/ICDM.2017.66 \ <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__ See mSTAMP Algorithm Examples -------- >>> stumpy.mstump( ... np.array([[584., -11., 23., 79., 1001., 0., -19.], ... [ 1., 2., 4., 8., 16., 0., 32.]]), ... m=3) (array([[0. , 1.43947142, 0. , 2.69407392, 0.11633857], [0.777905 , 2.36179922, 1.50004632, 2.92246722, 0.777905 ]]), array([[2, 4, 0, 1, 0], [4, 4, 0, 1, 0]])) """ T_A = T T_B = T_A T_A, M_T, Σ_T = core.preprocess(T_A, m) T_B, μ_Q, σ_Q = core.preprocess(T_B, m) if T_A.ndim <= 1: # pragma: no cover err = f"T is {T_A.ndim}-dimensional and must be at least 1-dimensional" raise ValueError(f"{err}") core.check_window_size(m, max_size=min(T_A.shape[1], T_B.shape[1])) if include is not None: include = _preprocess_include(include) d, n = T_B.shape k = n - m + 1 excl_zone = int( np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM) ) # See Definition 3 and Figure 3 P = np.empty((d, k), dtype=np.float64) I = np.empty((d, k), dtype=np.int64) start = 0 stop = k P[:, start], I[:, start] = _get_first_mstump_profile( start, T_A, T_B, m, excl_zone, M_T, Σ_T, μ_Q, σ_Q, include, discords ) QT, QT_first = _get_multi_QT(start, T_A, m) P[:, start + 1 : stop], I[:, start + 1 : stop] = _mstump( T_A, m, stop, excl_zone, M_T, Σ_T, QT, QT_first, μ_Q, σ_Q, k, start + 1, include, discords, ) return P, I