Source code for stumpy.stimp

# STUMPY
# Copyright 2019 TD Ameritrade. Released under the terms of the 3-Clause BSD license.
# STUMPY is a trademark of TD Ameritrade IP Company, Inc. All rights reserved.

import numpy as np

from . import core
from .aamp_stimp import aamp_stimp, aamp_stimped
from .scrump import scrump
from .stump import stump
from .stumped import stumped


def _normalize_pan(pan, ms, bfs_indices, n_processed):
    """
    Normalize the pan matrix profile nearest neighbor distances (inplace) relative
    to the corresponding subsequence length from which they were computed

    Parameters
    ----------
    pan : numpy.ndarray
        The pan matrix profile

    ms : numpy.ndarray
        The breadth-first-search sorted subsequence window sizes

    bfs_indices : numpy.ndarray
        The breadth-first-search indices

    n_processed : numpy.ndarray
        The number of subsequence window sizes and breadth-first-search indices to
        normalize

    Returns
    -------
    None
    """
    idx = bfs_indices[:n_processed]
    norm = 1.0 / (2.0 * np.sqrt(ms[:n_processed]))
    pan[idx] = np.minimum(1.0, pan[idx] * np.expand_dims(norm, 1))


class _stimp:
    """
    Compute the Pan Matrix Profile

    This is based on the SKIMP algorithm.

    Parameters
    ----------
    T : numpy.ndarray
        The time series or sequence for which to compute the pan matrix profile

    min_m : int, default 3
        The starting (or minimum) subsequence window size for which a matrix profile
        may be computed

    max_m : int, default None
        The stopping (or maximum) subsequence window size for which a matrix profile
        may be computed. When `max_m = None`, this is set to the maximum allowable
        subsequence window size

    step : int, default 1
        The step between subsequence window sizes

    percentage : float, default 0.01
        The percentage of the full matrix profile to compute for each subsequence
        window size. When `percentage < 1.0`, then the `scrump` algorithm is used.
        Otherwise, the `stump` algorithm is used when the exact matrix profile is
        requested.

    pre_scrump : bool, default True
        A flag for whether or not to perform the PreSCRIMP calculation prior to
        computing SCRIMP. If set to `True`, this is equivalent to computing
        SCRIMP++. This parameter is ignored when `percentage = 1.0`.

    client : client, default None
        A Dask or Ray Distributed client. Setting up a distributed cluster is beyond
        the scope of this library. Please refer to the Dask or Ray Distributed
        documentation.

    device_id : int or list, default None
        The (GPU) device number to use. The default value is `0`. A list of
        valid device ids (``int``) may also be provided for parallel GPU-STUMP
        computation. A list of all valid device ids can be obtained by
        executing `[device.id for device in numba.cuda.list_devices()]`.

    mp_func : function, default stump
        The matrix profile function to use when `percentage = 1.0`

    T_subseq_isconstant_func : function, default None
        A custom, user-defined function that returns a boolean array that indicates
        whether a subsequence in `T` is constant (True). The function must only take
        two arguments, `a`, a 1-D array, and `w`, the window size, while additional
        arguments may be specified by currying the user-defined function using
        `functools.partial`. Any subsequence with at least one np.nan/np.inf will
        automatically have its corresponding value set to False in this boolean array.

    Attributes
    ----------
    PAN_ : numpy.ndarray
        The transformed (i.e., normalized, contrasted, binarized, and repeated)
        pan matrix profile

    M_ : numpy.ndarray
        The full list of (breadth first search (level) ordered) subsequence window
        sizes

    Methods
    -------
    update():
        Compute the next matrix profile using the next available (breadth-first-search
        (level) ordered) subsequence window size and update the pan matrix profile

    Notes
    -----
    `DOI: 10.1109/ICBK.2019.00031 \
    <https://www.cs.ucr.edu/~eamonn/PAN_SKIMP%20%28Matrix%20Profile%20XX%29.pdf>`__

    See Table 2
    """

    def __init__(
        self,
        T,
        min_m=3,
        max_m=None,
        step=1,
        percentage=0.01,
        pre_scrump=True,
        client=None,
        device_id=None,
        mp_func=stump,
        T_subseq_isconstant_func=None,
    ):
        """
        Initialize the `stimp` object and compute the Pan Matrix Profile

        Parameters
        ----------
        T : numpy.ndarray
            The time series or sequence for which to compute the pan matrix profile

        min_m : int, default 3
            The minimum subsequence window size to consider computing a matrix profile
            for

        max_m : int, default None
            The maximum subsequence window size to consider computing a matrix profile
            for. When `max_m = None`, this is set to the maximum allowable subsequence
            window size

        step : int, default 1
            The step between subsequence window sizes

        percentage : float, default 0.01
            The percentage of the full matrix profile to compute for each subsequence
            window size. When `percentage < 1.0`, then the `scrump` algorithm is used.
            Otherwise, the `stump` algorithm is used when the exact matrix profile is
            requested.

        pre_scrump : bool, default True
            A flag for whether or not to perform the PreSCRIMP calculation prior to
            computing SCRIMP. If set to `True`, this is equivalent to computing
            SCRIMP++. This parameter is ignored when `percentage = 1.0`.

        client : client, default None
            A Dask or Ray Distributed client. Setting up a distributed cluster is beyond
            the scope of this library. Please refer to the Dask or Ray Distributed
            documentation.

        device_id : int or list, default None
            The (GPU) device number to use. The default value is `0`. A list of
            valid device ids (``int``) may also be provided for parallel GPU-STUMP
            computation. A list of all valid device ids can be obtained by
            executing `[device.id for device in numba.cuda.list_devices()]`.

        mp_func : function, default stump
            The matrix profile function to use when `percentage = 1.0`

        T_subseq_isconstant_func : function, default None
            A custom, user-defined function that returns a boolean array that indicates
            whether a subsequence in `T` is constant (True). The function must only take
            two arguments, `a`, a 1-D array, and `w`, the window size, while additional
            arguments may be specified by currying the user-defined function using
            `functools.partial`. Any subsequence with at least one np.nan/np.inf will
            automatically have its corresponding value set to False in this boolean
            array.

        Returns
        -------
        None
        """
        self._T = T.copy()
        if max_m is None:
            max_m = max(min_m + 1, core.get_max_window_size(self._T.shape[0]))
            M = np.arange(min_m, max_m + 1, step).astype(np.int64)
        else:
            min_m, max_m = sorted([min_m, max_m])
            M = np.arange(
                max(3, min_m),
                min(core.get_max_window_size(self._T.shape[0]), max_m) + 1,
                step,
            ).astype(np.int64)
        self._bfs_indices = core._bfs_indices(M.shape[0])
        self._M = M[self._bfs_indices]
        self._n_processed = 0
        percentage = np.clip(percentage, 0.0, 1.0)
        self._percentage = percentage
        self._pre_scrump = pre_scrump
        self._partial_mp_func = core._get_partial_mp_func(
            mp_func, client=client, device_id=device_id
        )

        if T_subseq_isconstant_func is None:
            T_subseq_isconstant_func = core._rolling_isconstant
        if not callable(T_subseq_isconstant_func):  # pragma: no cover
            msg = (
                "`T_subseq_isconstant_func` was expected to be a callable function "
                + f"but {type(T_subseq_isconstant_func)} was found."
            )
            raise ValueError(msg)
        self._T_subseq_isconstant_func = T_subseq_isconstant_func

        self._PAN = np.full(
            (self._M.shape[0], self._T.shape[0]), fill_value=np.inf, dtype=np.float64
        )

    def update(self):
        """
        Update the pan matrix profile by computing a single matrix profile using the
        next available subsequence window size

        Parameters
        ----------
        None

        Returns
        -------
        None

        Notes
        -----
        `DOI: 10.1109/ICBK.2019.00031 \
        <https://www.cs.ucr.edu/~eamonn/PAN_SKIMP%20%28Matrix%20Profile%20XX%29.pdf>`__

        See Table 2
        """
        if self._n_processed < self._M.shape[0]:
            m = self._M[self._n_processed]
            if self._percentage < 1.0:
                approx = scrump(
                    self._T,
                    m,
                    ignore_trivial=True,
                    percentage=self._percentage,
                    pre_scrump=self._pre_scrump,
                    k=1,
                    T_A_subseq_isconstant=self._T_subseq_isconstant_func,
                )
                approx.update()
                self._PAN[
                    self._bfs_indices[self._n_processed], : approx.P_.shape[0]
                ] = approx.P_
            else:
                out = self._partial_mp_func(
                    self._T,
                    m,
                    ignore_trivial=True,
                    T_A_subseq_isconstant=self._T_subseq_isconstant_func,
                )
                self._PAN[
                    self._bfs_indices[self._n_processed], : out[:, 0].shape[0]
                ] = out[:, 0]
            self._n_processed += 1

    def pan(self, threshold=0.2, normalize=True, contrast=True, binary=True, clip=True):
        """
        Generate a transformed (i.e., normalized, contrasted, binarized, and repeated)
        pan matrix profile

        Parameters
        ----------
        threshold : float, default 0.2
            The distance `threshold` in which to center the pan matrix profile around
            for best contrast and this value is also used for binarizing the pan matrix
            profile

        normalize : bool, default True
            A flag for whether or not each individual matrix profile within the pan
            matrix profile is normalized by its corresponding subsequence window size.
            If set to `True`, normalization is performed.

        contrast : bool, default True
            A flag for whether or not the pan matrix profile is centered around the
            desired `threshold` in order to provide higher contrast. If set to `True`,
            centering is performed.

        binary : bool, default True
            A flag for whether or not the pan matrix profile is binarized. If set to
            `True`, all values less than or equal to `threshold` are set to `0.0` while
            all other values are set to `1.0`.

        clip : bool, default True
            A flag for whether or not the pan matrix profile is clipped. If set to
            `True`, all values are ensured to be clipped between `0.0` and `1.0`.

        Returns
        -------
        PAN : numpy.ndarray
            The transformed (i.e., normalized, contrasted, binarized, and repeated)
            pan matrix profile
        """
        PAN = self._PAN.copy()
        # Retrieve the row indices where the matrix profile was actually computed
        idx = self._bfs_indices[: self._n_processed]
        sorted_idx = np.sort(idx)
        PAN[PAN == np.inf] = np.nan

        if normalize:
            _normalize_pan(PAN, self._M, self._bfs_indices, self._n_processed)
        if contrast:
            core._contrast_pan(PAN, threshold, self._bfs_indices, self._n_processed)
        if binary:
            core._binarize_pan(PAN, threshold, self._bfs_indices, self._n_processed)
        if clip:
            PAN[idx] = np.clip(PAN[idx], 0.0, 1.0)

        # Below, for each matrix profile that was computed, we take that matrix profile
        # and copy/repeat it downwards to replace other rows in the `PAN` where the
        # matrix profile has yet to be computed. Instead of only having lines/values in
        # the rows where matrix profiles were computed, this gives us the "blocky" look
        nrepeat = np.diff(np.append(-1, sorted_idx))
        PAN[: np.sum(nrepeat)] = np.repeat(PAN[sorted_idx], nrepeat, axis=0)
        PAN[np.isnan(PAN)] = np.nanmax(PAN)

        return PAN

    @property
    def PAN_(self):
        """
        Get the transformed (i.e., normalized, contrasted, binarized, and repeated) pan
        matrix profile

        Parameters
        ----------
        None

        Returns
        -------
        out : numpy.ndarray
            The transformed (i.e., normalized, contrasted, binarized, and repeated)
            pan matrix profile
        """
        return self.pan().astype(np.float64)

    @property
    def M_(self):
        """
        Get all of the (breadth first searched (level) ordered) subsequence window sizes

        Parameters
        ----------
        None

        Returns
        -------
        out : numpy.ndarray
            The (breadth first searched (level) ordered) subsequence window sizes
        """
        return self._M.astype(np.int64)

    @property
    def P_(self):
        """
        Get all of the raw (i.e., non-transformed) matrix profiles matrix profile in
        (breadth first searched (level) ordered)

        Parameters
        ----------
        None

        Returns
        -------
        out : numpy.ndarray
            The raw (i.e., non-transformed) matrix profiles matrix profile
        """
        P = []
        for i, idx in enumerate(self._bfs_indices):
            P.append(self._PAN[idx][: len(self._T) - self._M[i] + 1])

        return P

    # @property
    # def bfs_indices_(self):
    #     """
    #     Get the breadth first search (level order) indices
    #     """
    #     return self._bfs_indices.astype(np.int64)

    # @property
    # def n_processed_(self):  # pragma: no cover
    #     """
    #     Get the total number of windows that have been processed
    #     """
    #     return self._n_processed



[docs]
@core.non_normalized(
    aamp_stimp,
    exclude=["pre_scrump", "normalize", "p", "T_subseq_isconstant_func", "pre_scraamp"],
    replace={"pre_scrump": "pre_scraamp"},
)
class stimp(_stimp):
    """
    A class to compute the Pan Matrix Profile

    This is based on the SKIMP algorithm.

    Parameters
    ----------
    T : numpy.ndarray
        The time series or sequence for which to compute the pan matrix profile.

    min_m : int, default 3
        The starting (or minimum) subsequence window size for which a matrix profile
        may be computed.

    max_m : int, default None
        The stopping (or maximum) subsequence window size for which a matrix profile
        may be computed. When ``max_m = None``, this is set to the maximum allowable
        subsequence window size.

    step : int, default 1
        The step between subsequence window sizes.

    percentage : float, default 0.01
        The percentage of the full matrix profile to compute for each subsequence
        window size. When ``percentage < 1.0``, then the ``scrump`` algorithm is used.
        Otherwise, the ``stump`` algorithm is used when the exact matrix profile is
        requested.

    pre_scrump : bool, default True
        A flag for whether or not to perform the PreSCRIMP calculation prior to
        computing SCRIMP. If set to ``True``, this is equivalent to computing
        SCRIMP++. This parameter is ignored when ``percentage = 1.0``.

    normalize : bool, default True
        When set to ``True``, this z-normalizes subsequences prior to computing
        distances. Otherwise, this function gets re-routed to its complementary
        non-normalized equivalent set in the ``@core.non_normalized`` function
        decorator.

    p : float, default 2.0
        The p-norm to apply for computing the Minkowski distance. Minkowski distance is
        typically used with ``p`` being ``1`` or ``2``, which correspond to the
        Manhattan distance and the Euclidean distance, respectively. This parameter is
        ignored when ``normalize == True``.

    T_subseq_isconstant_func : function, default None
        A custom, user-defined function that returns a boolean array that indicates
        whether a subsequence in ``T`` is constant (``True``). The function must only
        take two arguments, ``a``, a 1-D array, and ``w``, the window size, while
        additional arguments may be specified by currying the user-defined function
        using ``functools.partial``. Any subsequence with at least one
        ``np.nan``/``np.inf`` will automatically have its corresponding value set to
        ``False`` in this boolean array.

    Attributes
    ----------
    PAN_ : numpy.ndarray
        The transformed (i.e., normalized, contrasted, binarized, and repeated)
        pan matrix profile.

    M_ : numpy.ndarray
        The full list of (breadth first search (level) ordered) subsequence window
        sizes.

    Methods
    -------
    update():
        Compute the next matrix profile using the next available (breadth-first-search
        (level) ordered) subsequence window size and update the pan matrix profile

    See Also
    --------
    stumpy.stimped : Compute the Pan Matrix Profile with a ``dask``/``ray`` cluster
    stumpy.gpu_stimp : Compute the Pan Matrix Profile with with one or more GPU devices

    Notes
    -----
    `DOI: 10.1109/ICBK.2019.00031 \
    <https://www.cs.ucr.edu/~eamonn/PAN_SKIMP%20%28Matrix%20Profile%20XX%29.pdf>`__

    See Table 2

    Examples
    --------
    >>> import stumpy
    >>> import numpy as np
    >>> pmp = stumpy.stimp(np.array([584., -11., 23., 79., 1001., 0., -19.]))
    >>> pmp.update()
    >>> pmp.PAN_
    array([[0., 1., 1., 1., 1., 1., 1.],
           [0., 1., 1., 1., 1., 1., 1.]])
    """

    def __init__(
        self,
        T,
        min_m=3,
        max_m=None,
        step=1,
        percentage=0.01,
        pre_scrump=True,
        normalize=True,
        p=2.0,
        T_subseq_isconstant_func=None,
    ):
        """
        Initialize the ``stimp`` object and compute the Pan Matrix Profile

        Parameters
        ----------
        T : numpy.ndarray
            The time series or sequence for which to compute the pan matrix profile.

        min_m : int, default 3
            The minimum subsequence window size to consider computing a matrix profile
            for.

        max_m : int, default None
            The maximum subsequence window size to consider computing a matrix profile
            for. When ``max_m = None``, this is set to the maximum allowable
            subsequence window size.

        step : int, default 1
            The step between subsequence window sizes.

        percentage : float, default 0.01
            The percentage of the full matrix profile to compute for each subsequence
            window size. When ``percentage < 1.0``, then the ``scrump`` algorithm is
            used. Otherwise, the ``stump`` algorithm is used when the exact matrix
            profile is requested.

        pre_scrump : bool, default True
            A flag for whether or not to perform the PreSCRIMP calculation prior to
            computing SCRIMP. If set to ``True``, this is equivalent to computing
            SCRIMP++. This parameter is ignored when    `percentage = 1.0``.

        normalize : bool, default True
            When set to ``True``, this z-normalizes subsequences prior to computing
            distances. Otherwise, this function gets re-routed to its complementary
            non-normalized equivalent set in the ``@core.non_normalized`` function
            decorator.

        p : float, default 2.0
            The p-norm to apply for computing the Minkowski distance. This parameter is
            ignored when ``normalize == True``.

        T_subseq_isconstant_func : function, default None
            A custom, user-defined function that returns a boolean array that indicates
            whether a subsequence in ``T`` is constant (``True``). The function must
            only take two arguments, ``a``, a 1-D array, and ``w``, the window size,
            while additional arguments may be specified by currying the user-defined
            function using ``functools.partial``. Any subsequence with at least one
            ``np.nan``/``np.inf`` will automatically have its corresponding value set
            to ``False`` in this boolean array.

        Returns
        -------
        None
        """
        super().__init__(
            T,
            min_m=min_m,
            max_m=max_m,
            step=step,
            percentage=percentage,
            pre_scrump=pre_scrump,
            mp_func=stump,
            T_subseq_isconstant_func=T_subseq_isconstant_func,
        )




[docs]
@core.non_normalized(
    aamp_stimped,
    exclude=["pre_scrump", "normalize", "p", "T_subseq_isconstant_func", "pre_scraamp"],
    replace={"pre_scrump": "pre_scraamp"},
)
class stimped(_stimp):
    """
    A class to compute the Pan Matrix Profile with a ``dask``/``ray`` cluster

    This is based on the SKIMP algorithm.

    Parameters
    ----------
    client : client
        A ``dask``/``ray`` client. Setting up a ``dask``/``ray`` cluster is beyond
        the scope of this library. Please refer to the ``dask``/``ray``
        documentation.

    T : numpy.ndarray
        The time series or sequence for which to compute the pan matrix profile.

    min_m : int, default 3
        The starting (or minimum) subsequence window size for which a matrix profile
        may be computed.

    max_m : int, default None
        The stopping (or maximum) subsequence window size for which a matrix profile
        may be computed. When ``max_m = None``, this is set to the maximum allowable
        subsequence window size

    step : int, default 1
        The step between subsequence window sizes.

    normalize : bool, default True
        When set to ``True``, this z-normalizes subsequences prior to computing
        distances. Otherwise, this function gets re-routed to its complementary
        non-normalized equivalent set in the ``@core.non_normalized`` function
        decorator.

    p : float, default 2.0
        The p-norm to apply for computing the Minkowski distance. Minkowski distance is
        typically used with ``p`` being ``1`` or ``2``, which correspond to the
        Manhattan distance and the Euclidean distance, respectively. This parameter is
        ignored when ``normalize == True``.

    T_subseq_isconstant_func : function, default None
            A custom, user-defined function that returns a boolean array that indicates
            whether a subsequence in ``T`` is constant (``True``). The function must
            only take two arguments, ``a``, a 1-D array, and ``w``, the window size,
            while additional arguments may be specified by currying the user-defined
            function using ``functools.partial``. Any subsequence with at least one
            ``np.nan``/``np.inf`` will automatically have its corresponding value set
            to ``False`` in this boolean array.

    Attributes
    ----------
    PAN_ : numpy.ndarray
        The transformed (i.e., normalized, contrasted, binarized, and repeated)
        pan matrix profile.

    M_ : numpy.ndarray
        The full list of (breadth first search (level) ordered) subsequence window
        sizes.

    Methods
    -------
    update():
        Compute the next matrix profile using the next available (breadth-first-search
        (level) ordered) subsequence window size and update the pan matrix profile.

    See Also
    --------
    stumpy.stimp : Compute the Pan Matrix Profile
    stumpy.gpu_stimp : Compute the Pan Matrix Profile with with one or more GPU devices

    Notes
    -----
    `DOI: 10.1109/ICBK.2019.00031 \
    <https://www.cs.ucr.edu/~eamonn/PAN_SKIMP%20%28Matrix%20Profile%20XX%29.pdf>`__

    See Table 2

    Examples
    --------
    >>> import stumpy
    >>> import numpy as np
    >>> from dask.distributed import Client
    >>> if __name__ == "__main__":
    ...     with Client() as dask_client:
    ...         pmp = stumpy.stimped(
    ...             dask_client,
    ...             np.array([584., -11., 23., 79., 1001., 0., -19.]))
    ...         pmp.update()
    ...         pmp.PAN_
    array([[0., 1., 1., 1., 1., 1., 1.],
           [0., 1., 1., 1., 1., 1., 1.]])

    Alternatively, you can also use `ray`

    >>> import ray
    >>> if __name__ == "__main__":
    >>>     ray.init()
    >>>     pmp = stumpy.stimped(
    ...         ray,
    ...         np.array([584., -11., 23., 79., 1001., 0., -19.]))
    >>>     ray.shutdown()
    """

    def __init__(
        self,
        client,
        T,
        min_m=3,
        max_m=None,
        step=1,
        normalize=True,
        p=2.0,
        T_subseq_isconstant_func=None,
    ):
        """
        Initialize the ``stimp`` object and compute the Pan Matrix Profile

        Parameters
        ----------
        client : client
            A ``dask``/``ray`` client. Setting up a ``dask``/``ray`` cluster is beyond
            the scope of this library. Please refer to the ``dask``/``ray``
            documentation.

        T : numpy.ndarray
            The time series or sequence for which to compute the pan matrix profile.

        min_m : int, default 3
            The minimum subsequence window size to consider computing a matrix profile
            for.

        max_m : int, default None
            The maximum subsequence window size to consider computing a matrix profile
            for. When ``max_m = None``, this is set to the maximum allowable
            subsequence window size.

        step : int, default 1
            The step between subsequence window sizes.

        normalize : bool, default True
            When set to ``True``, this z-normalizes subsequences prior to computing
            distances. Otherwise, this function gets re-routed to its complementary
            non-normalized equivalent set in the ``@core.non_normalized`` function
            decorator.

        p : float, default 2.0
            The p-norm to apply for computing the Minkowski distance. Minkowski
            distance is typically used with ``p`` being ``1`` or ``2``, which
            correspond to the Manhattan distance and the Euclidean distance,
            respectively. This parameter is ignored when ``normalize == True``.

        T_subseq_isconstant_func : function, default None
            A custom, user-defined function that returns a boolean array that indicates
            whether a subsequence in ``T`` is constant (``True``). The function must
            only take two arguments, ``a``, a 1-D array, and ``w``, the window size,
            while additional arguments may be specified by currying the user-defined
            function using  `functools.partial  `. Any subsequence with at least one
            ``np.nan``/``np.inf`` will automatically have its corresponding value set
            to ``False`` in this boolean array.

        Returns
        -------
        None
        """
        super().__init__(
            T,
            min_m=min_m,
            max_m=max_m,
            step=step,
            percentage=1.0,
            pre_scrump=False,
            client=client,
            mp_func=stumped,
            T_subseq_isconstant_func=T_subseq_isconstant_func,
        )