sktime.alignment.edit_numba 源代码

"""BaseAligner interface to sktime edit distance aligners in distances module."""

__author__ = ["fkiraly"]

import numpy as np
import pandas as pd

from sktime.alignment.base import BaseAligner


class AlignerEditNumba(BaseAligner):
    r"""Interface to sktime native edit distance aligners.

    Interface to the following edit distance aligners:
    LCSS - longest common subsequence distance
    ERP - Edit distance for real penalty
    EDR - Edit distance for real sequences
    TWE - Time warp edit distance

    LCSS [1]_ attempts to find the longest common sequence between two time series and
    returns a value that is the percentage that longest common sequence assumes.
    LCSS is computed by matching indexes that are
    similar up until a defined threshold (epsilon).

    The value returned will be between 0.0 and 1.0, where 0.0 means the two time series
    are exactly the same and 1.0 means they are complete opposites.

    EDR [2]_ computes the minimum number of elements (as a percentage) that must be
    removed from x and y so that the sum of the distance between the remaining
    signal elements lies within the tolerance (epsilon).

    The value returned will be between 0 and 1 per time series. The value will
    represent as a percentage of elements that must be removed for the time series to
    be an exact match.

    ERP [3]_ attempts to align time series
    by better considering how indexes are carried forward through the cost matrix.
    Usually in the dtw cost matrix, if an alignment can't be found the previous value
    is carried forward. ERP instead proposes the idea of gaps or sequences of points
    that have no matches. These gaps are then punished based on their distance from 'g'.

    TWE [4]_ is a distance measure for discrete time series
    matching with time 'elasticity'. In comparison to other distance measures, (e.g.
    DTW (Dynamic Time Warping) or LCS (Longest Common Subsequence Problem)), TWE is a
    metric. Its computational time complexity is O(n^2), but can be drastically reduced
    in some specific situation by using a corridor to reduce the search space. Its
    memory space complexity can be reduced to O(n).

    Parameters
    ----------
    distance: str, one of ["lcss", "edr", "erp", "twe"], optional, default = "lcss"
        name of the distance that is calculated
    window: float, default = None
        Float that is the radius of the sakoe chiba window (if using Sakoe-Chiba
        lower bounding). Value must be between 0. and 1.
    itakura_max_slope: float, default = None
        Gradient of the slope for itakura parallelogram (if using Itakura
        Parallelogram lower bounding)
    bounding_matrix: 2D np.ndarray, optional, default = None
        if passed, must be of shape (len(X), len(X2)) for X, X2 in ``transform``
        Custom bounding matrix to use. If defined then other lower_bounding params
        are ignored. The matrix should be structured so that indexes considered in
        bound should be the value 0. and indexes outside the bounding matrix should
        be infinity.
    epsilon : float, defaults = 1.
        Used in LCSS, EDR, ERP, otherwise ignored
        Matching threshold to determine if two subsequences are considered close
        enough to be considered 'common'.
    g: float, defaults = 0.
        Used in ERP, otherwise ignored.
        The reference value to penalise gaps.
    lmbda: float, optional, default = 1.0
        Used in TWE, otherwise ignored.
        A constant penalty that punishes the editing efforts. Must be >= 1.0.
    nu: float optional, default = 0.001
        Used in TWE, otherwise ignored.
        A non-negative constant which characterizes the stiffness of the elastic
        twe measure. Must be > 0.
    p: int optional, default = 2
        Used in TWE, otherwise ignored.
        Order of the p-norm for local cost.

    Raises
    ------
    ValueError
        On construction, if ``distance`` is not one of ``ALLOWED_DISTANCE_STR``.

    References
    ----------
    .. [1] M. Vlachos, D. Gunopoulos, and G. Kollios. 2002. "Discovering
        Similar Multidimensional Trajectories", In Proceedings of the
        18th International Conference on Data Engineering (ICDE '02).
        IEEE Computer Society, USA, 673.
    .. [2] Lei Chen, M. Tamer Özsu, and Vincent Oria. 2005. Robust and fast similarity
        search for moving object trajectories. In Proceedings of the 2005 ACM SIGMOD
        international conference on Management of data (SIGMOD '05). Association for
        Computing Machinery, New York, NY, USA, 491-502.
        DOI:https://doi.org/10.1145/1066157.1066213
    .. [3] Lei Chen and Raymond Ng. 2004. On the marriage of Lp-norms and edit distance.
        In Proceedings of the Thirtieth international conference on Very large data
        bases - Volume 30 (VLDB '04). VLDB Endowment, 792-803.
    .. [4] Marteau, P.; F. (2009). "Time Warp Edit Distance with Stiffness Adjustment
        for Time Series Matching". IEEE Transactions on Pattern Analysis and Machine
        Intelligence. 31 (2): 306-318.

    Examples
    --------
    >>> from sktime.datasets import load_unit_test
    >>> from sktime.dists_kernels.edit_dist import EditDist
    >>>
    >>> X, _ = load_unit_test(return_type="pd-multiindex")  # doctest: +SKIP
    >>> d = EditDist("edr")  # doctest: +SKIP
    >>> distmat = d.transform(X)  # doctest: +SKIP

    distances are also callable, this does the same:

    >>> distmat = d(X)  # doctest: +SKIP
    """

    # NOTE(review): the docstring example above exercises ``EditDist`` from
    # ``sktime.dists_kernels`` rather than this aligner; it looks carried over
    # from that class and should be replaced by an aligner example - TODO confirm.

    _tags = {
        # packaging info
        # --------------
        "authors": ["chrisholder", "TonyBagnall", "fkiraly"],
        "python_dependencies": "numba",
        # estimator type
        # --------------
        "symmetric": True,  # all the distances are symmetric
        "capability:multiple-alignment": False,  # can align more than two sequences?
        "capability:distance": True,  # does compute/return overall distance?
        "capability:distance-matrix": True,  # does compute/return distance matrix?
        "capability:unequal_length": False,  # can align sequences of unequal length?
        "alignment_type": "partial",
        "X_inner_mtype": "numpy3D",
    }

    ALLOWED_DISTANCE_STR = ["lcss", "edr", "erp", "twe"]

    def __init__(
        self,
        distance: str = "lcss",
        window=None,
        itakura_max_slope=None,
        bounding_matrix: np.ndarray = None,
        epsilon: float = 1.0,
        g: float = 0.0,
        lmbda: float = 1.0,
        nu: float = 0.001,
        p: int = 2,
    ):
        self.distance = distance
        self.window = window
        self.itakura_max_slope = itakura_max_slope
        self.bounding_matrix = bounding_matrix
        self.epsilon = epsilon
        self.g = g
        self.lmbda = lmbda
        self.nu = nu
        self.p = p

        super().__init__()

        # bounding parameters are forwarded for every distance
        kwargs = {
            "window": window,
            "itakura_max_slope": itakura_max_slope,
            "bounding_matrix": bounding_matrix,
        }

        if distance not in self.ALLOWED_DISTANCE_STR:
            raise ValueError(
                "distance must be one of the strings "
                f"{self.ALLOWED_DISTANCE_STR}, but found"
                f" {distance}"
            )

        # epsilon is used only for lcss, edr, erp
        if distance in ["lcss", "edr", "erp"]:
            kwargs["epsilon"] = epsilon

        # g is used only for erp
        if distance == "erp":
            kwargs["g"] = g

        # twe has three unique params
        if distance == "twe":
            kwargs["lmbda"] = lmbda
            kwargs["nu"] = nu
            kwargs["p"] = p

        # keyword arguments handed to the distance backend in _fit
        self.kwargs = kwargs

    def _fit(self, X, Z=None):
        """Fit alignment given series/sequences to align.

        core logic

        Writes to self:
            path_ : alignment path returned by ``distance_alignment_path``
            dist_ : distance returned by ``distance_alignment_path``

        Parameters
        ----------
        X : panel of two series to align, in the inner mtype declared in ``_tags``
            (``"numpy3D"``); ``X[0]`` and ``X[1]`` are the series that get aligned
        Z : optional metadata; not used by this aligner

        Returns
        -------
        self : reference to self
        """
        # imported lazily, at call time rather than at module import
        from sktime.distances import distance_alignment_path

        X1 = X[0]
        X2 = X[1]

        path, dist = distance_alignment_path(
            X1, X2, metric=self.distance, **self.kwargs
        )
        self.path_ = path
        self.dist_ = dist

        return self

    def _get_alignment(self):
        """Return alignment for sequences/series passed in fit (iloc indices).

        Behaviour: returns an alignment for sequences in X passed to fit
            model should be in fitted state, fitted model parameters read from self

        Returns
        -------
        pd.DataFrame in alignment format, with columns 'ind'+str(i) for integer i
            cols contain iloc index of X[i] mapped to alignment coordinate for alignment
            has zero rows if the fitted alignment path is empty
        """
        # retrieve alignment path as a list of (index in X[0], index in X[1]) pairs
        pairs = list(self.path_)

        # zip(*[]) yields nothing, so unpacking an empty path into two names would
        # raise ValueError; presumably this can happen when no points are matched
        # (e.g. lcss with a tight epsilon) - TODO confirm against the backend
        if pairs:
            ind0, ind1 = zip(*pairs)
        else:
            ind0, ind1 = (), ()

        # convert to required data frame format and return
        return pd.DataFrame({"ind0": ind0, "ind1": ind1})

    def _get_distance(self):
        """Return overall distance of alignment.

        Behaviour: returns overall distance corresponding to alignment
            not all aligners will return or implement this (optional)

        Returns
        -------
        distance: float - overall distance between all elements of X passed to fit
        """
        return self.dist_

    def _get_distance_matrix(self):
        """Return distance matrix of alignment.

        Behaviour: returns pairwise distance matrix of alignment distances
            not all aligners will return or implement this (optional)

        Returns
        -------
        distmat: a (2 x 2) np.array of floats
            [i,j]-th entry is alignment distance between X[i] and X[j] passed to fit
        """
        # _fit aligns exactly two series, so this is always a 2x2 matrix
        # with zero diagonal and the fitted distance off the diagonal
        distmat = np.zeros((2, 2), dtype="float")
        distmat[0, 1] = self.dist_
        distmat[1, 0] = self.dist_

        return distmat

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return ``"default"`` set.
            There are currently no reserved values for aligners.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict are parameters to construct an "interesting" test instance, i.e.,
            ``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test
            instance.
            ``create_test_instance`` uses the first (or only) dictionary in ``params``
        """
        # first set: all defaults (lcss); second set: twe
        params0 = {}
        params1 = {"distance": "twe"}

        return [params0, params1]