Source code for stumpy.mmotifs

# STUMPY
# Copyright 2019 TD Ameritrade. Released under the terms of the 3-Clause BSD license.
# STUMPY is a trademark of TD Ameritrade IP Company, Inc. All rights reserved.

import warnings

import numpy as np

from . import config, core
from .aamp_mmotifs import aamp_mmotifs
from .motifs import match
from .mstump import mdl



[docs]
@core.non_normalized(
    aamp_mmotifs,
    exclude=[
        "normalize",
        "T_subseq_isconstant",
    ],
)
def mmotifs(
    T,
    P,
    I,
    min_neighbors=1,
    max_distance=None,
    cutoffs=None,
    max_matches=10,
    max_motifs=1,
    atol=1e-8,
    k=None,
    include=None,
    normalize=True,
    p=2.0,
    T_subseq_isconstant=None,
):
    """
    Discover the top motifs for the multi-dimensional time series `T`

    Parameters
    ----------
    T : numpy.ndarray
        The multi-dimensional time series or sequence

    P : numpy.ndarray
        Multi-dimensional Matrix Profile of T

    I : numpy.ndarray
        Multi-dimensional Matrix Profile indices

    min_neighbors : int, default 1
        The minimum number of similar matches a subsequence needs to have in order
        to be considered a motif. This defaults to `1`, which means that a subsequence
        must have at least one similar match in order to be considered a motif.

    max_distance : flaot, default None
        Maximal distance that is allowed between a query subsequence
        (a candidate motif) and all subsequences in T to be considered as a match.
        If None, this defaults to
        `np.nanmax([np.nanmean(D) - 2 * np.nanstd(D), np.nanmin(D)])`
        (i.e. at least the closest match will be returned).

    cutoffs : numpy.ndarray or float, default None
        The largest matrix profile value (distance) for each dimension of the
        multidimensional matrix profile that a multidimenisonal candidate motif is
        allowed to have. If `cutoffs` is a scalar value, then this value will be
        applied to every dimension.

    max_matches : int, default 10
        The maximum number of similar matches (nearest neighbors) to return for each
        motif. The first match is always the self/trivial-match for each motif.

    max_motifs : int, default 1
        The maximum number of motifs to return

    atol : float, default 1e-8
        The absolute tolerance parameter. This value will be added to `max_distance`
        when comparing distances between subsequences.

    k : int, default None
        The number of dimensions (`k + 1`) required for discovering all motifs. This
        value is available for doing guided search or, together with `include`, for
        constrained search. If `k is None`, then this will be automatically be computed
        for each motif using MDL (unconstrained search).

    include : numpy.ndarray, default None
        A list of (zero based) indices corresponding to the dimensions in T that must be
        included in the constrained multidimensional motif search.

    normalize : bool, default True
        When set to `True`, this z-normalizes subsequences prior to computing distances.
        Otherwise, this function gets re-routed to its complementary non-normalized
        equivalent set in the `@core.non_normalized` function decorator.

    p : float, default 2.0
        The p-norm to apply for computing the Minkowski distance. Minkowski distance is
        typically used with `p` being 1 or 2, which correspond to the Manhattan distance
        and the Euclidean distance, respectively. This parameter is ignored when
        `normalize == True`.

    T_subseq_isconstant : numpy.ndarray, function, or list, default None
        A parameter that is used to show whether a subsequence of a time series in `T`
        is constant (True) or not. T_subseq_isconstant can be a 2D boolean numpy.ndarry
        or a function that can be applied to each time series in `T`. Alternatively, for
        maximum flexibility, a list (with length equal to the total number of time
        series) may also be used. In this case, T_subseq_isconstant[i] corresponds to
        the i-th time series T[i] and each element in the list can either be a 1D
        boolean np.ndarray, a function, or None.

    Returns
    -------
    motif_distances: numpy.ndarray
        The distances corresponding to a set of subsequence matches for each motif.

    motif_indices: numpy.ndarray
        The indices corresponding to a set of subsequences matches for each motif.

    motif_subspaces: list
        A list consisting of arrays that contain the `k`-dimensional
        subspace for each motif.

    motif_mdls: list
        A list consisting of arrays that contain the mdl results for
        finding the dimension of each motif

    See Also
    --------
    stumpy.motifs : Find the top motifs for time series `T`
    stumpy.match : Find all matches of a query `Q` in a time series `T`
    stumpy.mstump : Compute the multi-dimensional z-normalized matrix profile
    stumpy.mstumped : Compute the multi-dimensional z-normalized matrix profile with
        a distributed dask cluster
    stumpy.subspace : Compute the k-dimensional matrix profile subspace for a given
        subsequence index and its nearest neighbor index
    stumpy.mdl : Compute the number of bits needed to compress one array with another
        using the minimum description length (MDL)

    Notes
    -----
    `DOI: 10.1109/ICDM.2017.66 \
    <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__

    For more information on `include` and search types, see Section IV D and IV E

    Examples
    --------
    >>> import stumpy
    >>> import numpy as np
    >>> mps, indices = stumpy.mstump(
    ...     np.array([[584., -11., 23., 79., 1001., 0., -19.],
    ...               [  1.,   2.,  4.,  8.,   16., 0.,  32.]]),
    ...     m=3)
    >>> stumpy.mmotifs(
    ...     np.array([[584., -11., 23., 79., 1001., 0., -19.],
    ...               [  1.,   2.,  4.,  8.,   16., 0.,  32.]]),
    ...     mps,
    ...     indices)
    (array([[4.47034836e-08, 4.47034836e-08]]),  array([[0, 2]]), [array([1])],
     [array([ 80.      , 111.509775])])
    """
    T = core._preprocess(T)
    m = T.shape[-1] - P.shape[-1] + 1
    reset_k = False

    if max_motifs < 1:  # pragma: no cover
        msg = "The maximum number of motifs, `max_motifs`, "
        msg += "must be greater than or equal to 1.\n"
        msg += "`max_motifs` has been set to `1`"
        warnings.warn(msg)
        max_motifs = 1

    T, M_T, Σ_T, T_subseq_isconstant = core.preprocess(
        T, m, T_subseq_isconstant=T_subseq_isconstant
    )
    P = P.copy()

    excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))

    if max_matches is None:
        max_matches = np.inf

    if cutoffs is None:
        P_copy = P.copy().astype(np.float64)
        P_copy[np.isinf(P_copy)] = np.nan
        cutoffs = np.nanmax(
            [
                np.nanmean(P_copy, axis=1) - 2.0 * np.nanstd(P_copy, axis=1),
                np.nanmin(P_copy, axis=1),
            ],
            axis=0,
        )
    if np.isscalar(cutoffs):
        cutoffs = np.full(P.shape[0], cutoffs)

    motif_distances = []
    motif_indices = []
    motif_subspaces = []
    motif_mdls = []

    candidate_idx = np.argmin(P, axis=1)
    nn_idx = I[np.arange(len(candidate_idx)), candidate_idx]

    while len(motif_distances) < max_motifs:
        mdls, subspaces = mdl(T, m, candidate_idx, nn_idx, include)
        if k is None:
            k = np.argmin(mdls)
            reset_k = True
        subspace_k = subspaces[k]

        motif_idx = candidate_idx[k]
        motif_value = P[k, motif_idx]

        if (
            motif_value > cutoffs[k]
            or not np.isfinite(motif_value)
            or motif_idx < 0
            or nn_idx[k] < 0
            or (isinstance(max_distance, float) and motif_value > max_distance)
        ):  # pragma: no cover
            break

        query_matches = match(
            Q=T[subspace_k, motif_idx : motif_idx + m],
            T=T[subspace_k],
            M_T=M_T[subspace_k],
            Σ_T=Σ_T[subspace_k],
            max_matches=max_matches,
            max_distance=max_distance,
            atol=atol,
            query_idx=motif_idx,
            normalize=normalize,
            p=p,
            T_subseq_isconstant=T_subseq_isconstant[subspace_k],
            Q_subseq_isconstant=np.expand_dims(
                T_subseq_isconstant[subspace_k, motif_idx], axis=1
            ),
        )

        if len(query_matches) > min_neighbors:
            motif_distances.append(query_matches[:, 0])
            motif_indices.append(query_matches[:, 1])
            motif_subspaces.append(subspace_k)
            motif_mdls.append(mdls)

        for idx in query_matches[:, 1]:
            core.apply_exclusion_zone(P, idx, excl_zone, np.inf)
        candidate_idx = np.argmin(P, axis=1)
        nn_idx = I[np.arange(len(candidate_idx)), candidate_idx]
        if reset_k:
            k = None

    motif_distances = core._jagged_list_to_array(
        motif_distances, fill_value=np.nan, dtype=np.float64
    )
    motif_indices = core._jagged_list_to_array(
        motif_indices, fill_value=-1, dtype=np.int64
    )

    return motif_distances, motif_indices, motif_subspaces, motif_mdls