# Source code of module sktime.classification.compose._pipeline

"""Pipeline with a classifier."""

# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
import numpy as np

from sktime.base import _HeterogenousMetaEstimator
from sktime.classification.base import BaseClassifier
from sktime.datatypes import convert_to
from sktime.transformations.base import BaseTransformer
from sktime.transformations.compose import TransformerPipeline
from sktime.utils.sklearn import is_sklearn_classifier

__author__ = ["fkiraly"]
__all__ = ["ClassifierPipeline", "SklearnClassifierPipeline"]


class ClassifierPipeline(_HeterogenousMetaEstimator, BaseClassifier):
    """Pipeline of transformers and a classifier.

    The ``ClassifierPipeline`` compositor chains transformers and a single
    classifier. It is constructed with a list of sktime transformers, plus a
    classifier, i.e., estimators following the BaseTransformer resp.
    BaseClassifier interface. The transformer list can be unnamed - a simple
    list of transformers - or string named - a list of pairs of string,
    estimator.

    For a list of transformers ``trafo1``, ``trafo2``, ..., ``trafoN`` and a
    classifier ``clf``, the pipeline behaves as follows:

    * ``fit(X, y)`` - changes state by running ``trafo1.fit_transform`` on
      ``X``, then ``trafo2.fit_transform`` on the output of
      ``trafo1.fit_transform``, etc. sequentially, with ``trafo[i]`` receiving
      the output of ``trafo[i-1]``, and then running ``clf.fit`` with ``X``
      being the output of ``trafo[N]``, and ``y`` identical with the input to
      ``self.fit``.
    * ``predict(X)`` - sequentially executes ``trafo1.transform``,
      ``trafo2.transform``, etc., with ``trafo[i].transform`` input = output
      of ``trafo[i-1].transform``, then runs ``clf.predict`` on the output of
      ``trafoN.transform`` and returns its result.
    * ``predict_proba(X)`` - as ``predict(X)``, but runs and returns
      ``clf.predict_proba`` on the output of ``trafoN.transform``.

    ``get_params``, ``set_params`` use the ``sklearn``-compatible nesting
    interface. If the list is unnamed, names are generated as names of
    classes; if names are non-unique, ``f"_{str(i)}"`` is appended to each
    name string, where ``i`` is the total count of occurrences of a
    non-unique string inside the list of names leading up to it (inclusive).

    ``ClassifierPipeline`` can also be created by using the magic
    multiplication on any classifier: if ``my_clf`` inherits from
    ``BaseClassifier``, and ``my_trafo1``, ``my_trafo2`` inherit from
    ``BaseTransformer``, then, for instance, ``my_trafo1 * my_trafo2 * my_clf``
    will result in the same object as obtained from the constructor
    ``ClassifierPipeline(classifier=my_clf, transformers=[my_trafo1, my_trafo2])``.
    Magic multiplication can also be used with (str, transformer) pairs, as
    long as one element in the chain is a transformer.

    Parameters
    ----------
    classifier : sktime classifier, i.e., estimator inheriting from BaseClassifier
        this is a "blueprint" classifier, state does not change when ``fit`` is called
    transformers : list of sktime transformers, or
        list of tuples (str, transformer) of sktime transformers
        these are "blueprint" transformers, states do not change when ``fit`` is called

    Attributes
    ----------
    classifier_ : sktime classifier, clone of classifier in ``classifier``
        this clone is fitted in the pipeline when ``fit`` is called
    transformers_ : list of tuples (str, transformer) of sktime transformers
        clones of transformers in ``transformers`` which are fitted in the pipeline
        is always in (str, transformer) format, even if transformers is just a list
        strings not passed in transformers are unique generated strings
        i-th transformer in ``transformers_`` is clone of i-th in ``transformers``

    Examples
    --------
    >>> from sktime.transformations.panel.pca import PCATransformer
    >>> from sktime.classification.interval_based import TimeSeriesForestClassifier
    >>> from sktime.datasets import load_unit_test
    >>> from sktime.classification.compose import ClassifierPipeline
    >>> X_train, y_train = load_unit_test(split="train")
    >>> X_test, y_test = load_unit_test(split="test")
    >>> pipeline = ClassifierPipeline(
    ...     TimeSeriesForestClassifier(n_estimators=5), [PCATransformer()]
    ... )
    >>> pipeline.fit(X_train, y_train)
    ClassifierPipeline(...)
    >>> y_pred = pipeline.predict(X_test)

    Alternative construction via dunder method:

    >>> pipeline = PCATransformer() * TimeSeriesForestClassifier(n_estimators=5)
    """

    _tags = {
        "authors": ["fkiraly"],
        "X_inner_mtype": "pd-multiindex",  # which type do _fit/_predict accept
        "capability:multivariate": False,
        "capability:unequal_length": False,
        "capability:missing_values": False,
        "capability:train_estimate": False,
        "capability:contractable": False,
        "capability:multithreading": False,
        "capability:predict_proba": True,
    }

    # no default tag values - these are set dynamically below

    def __init__(self, classifier, transformers):
        self.classifier = classifier
        self.classifier_ = classifier.clone()
        self.transformers = transformers
        self.transformers_ = TransformerPipeline(transformers)

        super().__init__()

        # multivariate capability requires both the classifier and
        # every transformer in the chain to support multivariate input
        can_multivariate = classifier.get_tag("capability:multivariate", False)
        can_multivariate = can_multivariate and not self.transformers_.get_tag(
            "univariate-only", True
        )

        # missing values are supported if both classifier and all transformers
        # support them, *or* if the transformer chain removes missing data
        can_missing = classifier.get_tag("capability:missing_values", False)
        can_missing = can_missing and self.transformers_.get_tag(
            "handles-missing-data", False
        )
        can_missing = can_missing or self.transformers_.get_tag(
            "capability:missing_values:removes", False
        )

        # unequal length is supported if both classifier and transformers
        # support it, *or* if the chain renders the series equal length
        can_unequal = classifier.get_tag("capability:unequal_length")
        can_unequal = can_unequal and self.transformers_.get_tag(
            "capability:unequal_length", False
        )
        can_unequal = can_unequal or self.transformers_.get_tag(
            "capability:unequal_length:removes", False
        )

        # predict_proba capability is inherited from the classifier
        can_proba = classifier.get_tag("capability:predict_proba")

        # contractable / train_estimate / multithreading are always False,
        # since not supported by transformers
        self.set_tags(
            **{
                "capability:multivariate": can_multivariate,
                "capability:missing_values": can_missing,
                "capability:unequal_length": can_unequal,
                "capability:contractable": False,
                "capability:train_estimate": False,
                "capability:multithreading": False,
                "capability:predict_proba": can_proba,
            }
        )

    @property
    def _transformers(self):
        # expose inner TransformerPipeline steps for get_params/set_params nesting
        return self.transformers_._steps

    @_transformers.setter
    def _transformers(self, value):
        self.transformers_._steps = value

    def __rmul__(self, other):
        """Magic * method, return concatenated ClassifierPipeline, transformers on left.

        Implemented for ``other`` being a transformer, otherwise returns
        ``NotImplemented``.

        Parameters
        ----------
        other: ``sktime`` transformer, must inherit from BaseTransformer
            otherwise, ``NotImplemented`` is returned

        Returns
        -------
        ClassifierPipeline object,
            concatenation of ``other`` (first) with ``self`` (last).
        """
        if not isinstance(other, BaseTransformer):
            return NotImplemented
        # the transformer dunder yields a flattened TransformerPipeline;
        # its steps are then wrapped in a fresh ClassifierPipeline
        expanded = other * self.transformers_
        return ClassifierPipeline(
            classifier=self.classifier,
            transformers=expanded.steps,
        )

    def _fit(self, X, y):
        """Fit time series classifier to training data.

        Parameters
        ----------
        X : Training data of type self.get_tag("X_inner_mtype")
        y : array-like, shape = [n_instances] - the class labels

        Returns
        -------
        self : reference to self.

        State change
        ------------
        creates fitted model (attributes ending in "_")
        """
        transformed = self.transformers_.fit_transform(X=X, y=y)
        self.classifier_.fit(X=transformed, y=y)
        return self

    def _predict(self, X) -> np.ndarray:
        """Predict labels for sequences in X.

        Parameters
        ----------
        X : data not used in training, of type self.get_tag("X_inner_mtype")

        Returns
        -------
        y : predictions of labels for X, np.ndarray
        """
        transformed = self.transformers_.transform(X=X)
        return self.classifier_.predict(X=transformed)

    def _predict_proba(self, X) -> np.ndarray:
        """Predict label probabilities for sequences in X.

        Parameters
        ----------
        X : data to predict y with, of type self.get_tag("X_inner_mtype")

        Returns
        -------
        y : predictions of probabilities for class values of X, np.ndarray
        """
        transformed = self.transformers_.transform(X)
        return self.classifier_.predict_proba(transformed)
[文档] def get_params(self, deep=True): """Get parameters of estimator in ``transformers``. Parameters ---------- deep : boolean, optional, default=True If True, will return the parameters for this estimator and contained sub-objects that are estimators. Returns ------- params : mapping of string to any Parameter names mapped to their values. """ params = dict() trafo_params = self._get_params("_transformers", deep=deep) params.update(trafo_params) return params
[文档] def set_params(self, **kwargs): """Set the parameters of estimator in ``transformers``. Valid parameter keys can be listed with ``get_params()``. Returns ------- self : returns an instance of self. """ if "classifier" in kwargs.keys(): if not isinstance(kwargs["classifier"], BaseClassifier): raise TypeError('"classifier" arg must be an sktime classifier') trafo_keys = self._get_params("_transformers", deep=True).keys() classif_keys = self.classifier.get_params(deep=True).keys() trafo_args = self._subset_dict_keys(dict_to_subset=kwargs, keys=trafo_keys) classif_args = self._subset_dict_keys( dict_to_subset=kwargs, keys=classif_keys, prefix="classifier" ) if len(classif_args) > 0: self.classifier.set_params(**classif_args) if len(trafo_args) > 0: self._set_params("_transformers", **trafo_args) return self
[文档] @classmethod def get_test_params(cls, parameter_set="default"): """Return testing parameter settings for the estimator. Parameters ---------- parameter_set : str, default="default" Name of the set of test parameters to return, for use in tests. If no special parameters are defined for a value, will return ``"default"`` set. For classifiers, a "default" set of parameters should be provided for general testing, and a "results_comparison" set for comparing against previously recorded results if the general set does not produce suitable probabilities to compare against. Returns ------- params : dict or list of dict, default={} Parameters to create testing instances of the class. Each dict are parameters to construct an "interesting" test instance, i.e., ``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test instance. ``create_test_instance`` uses the first (or only) dictionary in ``params``. """ # imports from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier from sktime.classification.dummy import DummyClassifier from sktime.transformations.series.exponent import ExponentTransformer t1 = ExponentTransformer(power=2) t2 = ExponentTransformer(power=0.5) c = KNeighborsTimeSeriesClassifier() another_c = DummyClassifier() params1 = {"transformers": [t1, t2], "classifier": c} params2 = {"transformers": [t1], "classifier": another_c} return [params1, params2]
class SklearnClassifierPipeline(_HeterogenousMetaEstimator, BaseClassifier):
    """Pipeline of transformers and a classifier.

    The ``SklearnClassifierPipeline`` chains transformers and a single classifier.
    Similar to ``ClassifierPipeline``, but uses a tabular ``sklearn`` classifier.

    The pipeline is constructed with a list of sktime transformers, plus a
    classifier, i.e., transformers following the BaseTransformer interface,
    classifier follows the ``scikit-learn`` classifier interface.

    The transformer list can be unnamed - a simple list of transformers -
    or string named - a list of pairs of string, estimator.

    For a list of transformers ``trafo1``, ``trafo2``, ..., ``trafoN`` and a
    classifier ``clf``, the pipeline behaves as follows:

    ``fit(X, y)`` - changes state by running ``trafo1.fit_transform`` on ``X``,
    then ``trafo2.fit_transform`` on the output of ``trafo1.fit_transform``, etc.
    sequentially, with ``trafo[i]`` receiving the output of ``trafo[i-1]``,
    and then running ``clf.fit`` with ``X`` the output of ``trafo[N]`` converted
    to numpy, and ``y`` identical with the input to ``self.fit``.
    ``X`` is converted to ``numpyflat`` mtype if ``X`` is of ``Panel`` scitype;
    ``X`` is converted to ``numpy2D`` mtype if ``X`` is of ``Table`` scitype.

    ``predict(X)`` - result is of executing ``trafo1.transform``,
    ``trafo2.transform``, etc, with ``trafo[i].transform`` input = output of
    ``trafo[i-1].transform``, then running ``clf.predict`` on the numpy
    converted output of ``trafoN.transform``, and returning the output of
    ``clf.predict``. Output of ``trafoN.transform`` is converted to numpy,
    as in ``fit``.

    ``predict_proba(X)`` - result is of executing ``trafo1.transform``,
    ``trafo2.transform``, etc, with ``trafo[i].transform`` input = output of
    ``trafo[i-1].transform``, then running ``clf.predict_proba`` on the output
    of ``trafoN.transform``, and returning the output of ``clf.predict_proba``.
    Output of ``trafoN.transform`` is converted to numpy, as in ``fit``.

    ``get_params``, ``set_params`` uses ``sklearn`` compatible nesting interface
    if list is unnamed, names are generated as names of classes
    if names are non-unique, ``f"_{str(i)}"`` is appended to each name string
    where ``i`` is the total count of occurrence of a non-unique string
    inside the list of names leading up to it (inclusive)

    ``SklearnClassifierPipeline`` can also be created by using the magic
    multiplication between ``sktime`` transformers and ``sklearn`` classifiers:
    if ``my_clf`` is an ``sklearn`` classifier, and ``my_trafo1``, ``my_trafo2``
    inherit from ``BaseTransformer``, then, for instance,
    ``my_trafo1 * my_trafo2 * my_clf`` will result in the same object as
    obtained from the constructor
    ``SklearnClassifierPipeline(classifier=my_clf, transformers=[my_trafo1, my_trafo2])``.
    Magic multiplication can also be used with (str, transformer) pairs, as
    long as one element in the chain is a transformer.

    Parameters
    ----------
    classifier : sklearn classifier, i.e., inheriting from sklearn ClassifierMixin
        this is a "blueprint" classifier, state does not change when ``fit`` is called
    transformers : list of sktime transformers, or
        list of tuples (str, transformer) of sktime transformers
        these are "blueprint" transformers, states do not change when ``fit`` is called

    Attributes
    ----------
    classifier_ : sklearn classifier, clone of classifier in ``classifier``
        this clone is fitted in the pipeline when ``fit`` is called
    transformers_ : list of tuples (str, transformer) of sktime transformers
        clones of transformers in ``transformers`` which are fitted in the pipeline
        is always in (str, transformer) format, even if transformers is just a list
        strings not passed in transformers are unique generated strings
        i-th transformer in ``transformers_`` is clone of i-th in ``transformers``

    Examples
    --------
    >>> from sklearn.neighbors import KNeighborsClassifier
    >>> from sktime.transformations.series.exponent import ExponentTransformer
    >>> from sktime.transformations.series.summarize import SummaryTransformer
    >>> from sktime.datasets import load_unit_test
    >>> from sktime.classification.compose import SklearnClassifierPipeline
    >>> X_train, y_train = load_unit_test(split="train")
    >>> X_test, y_test = load_unit_test(split="test")
    >>> t1 = ExponentTransformer()
    >>> t2 = SummaryTransformer()
    >>> pipeline = SklearnClassifierPipeline(KNeighborsClassifier(), [t1, t2])
    >>> pipeline = pipeline.fit(X_train, y_train)
    >>> y_pred = pipeline.predict(X_test)

    Alternative construction via dunder method:

    >>> pipeline = t1 * t2 * KNeighborsClassifier()
    """

    _tags = {
        # "authors" added for consistency with ClassifierPipeline
        "authors": ["fkiraly"],
        "X_inner_mtype": "pd-multiindex",  # which type do _fit/_predict accept
        "capability:multivariate": True,
        "capability:unequal_length": True,
        "capability:missing_values": True,
        "capability:train_estimate": False,
        "capability:contractable": False,
        "capability:multithreading": False,
        "capability:predict_proba": True,
    }

    # no default tag values - these are set dynamically below

    def __init__(self, classifier, transformers):
        from sklearn.base import clone

        self.classifier = classifier
        self.classifier_ = clone(classifier)
        self.transformers = transformers
        self.transformers_ = TransformerPipeline(transformers)

        super().__init__()

        # all sktime and sklearn transformers always support multivariate
        multivariate = True

        # can handle missing values iff transformer chain removes missing data
        # sklearn classifiers might be able to handle missing data (but no tag there)
        # so better set the tag liberally
        missing = self.transformers_.get_tag("handles-missing-data", False)
        missing = missing or self.transformers_.get_tag(
            "capability:missing_values:removes", False
        )

        # can handle unequal length iff transformer chain renders series equal length
        # because sklearn classifiers require equal length (number of variables) input
        unequal = self.transformers_.get_tag("capability:unequal_length:removes", False)

        # last three tags are always False, since not supported by transformers
        tags_to_set = {
            "capability:multivariate": multivariate,
            "capability:missing_values": missing,
            "capability:unequal_length": unequal,
            "capability:contractable": False,
            "capability:train_estimate": False,
            "capability:multithreading": False,
        }
        self.set_tags(**tags_to_set)

    @property
    def _transformers(self):
        # expose inner TransformerPipeline steps for get_params/set_params nesting
        return self.transformers_._steps

    @_transformers.setter
    def _transformers(self, value):
        self.transformers_._steps = value

    def __rmul__(self, other):
        """Magic * method, return concatenated SklearnClassifierPipeline.

        Transformers are prepended on the left. Implemented for ``other``
        being a transformer, otherwise returns ``NotImplemented``.

        Parameters
        ----------
        other: ``sktime`` transformer, must inherit from BaseTransformer
            otherwise, ``NotImplemented`` is returned

        Returns
        -------
        SklearnClassifierPipeline object,
            concatenation of ``other`` (first) with ``self`` (last).
        """
        if isinstance(other, BaseTransformer):
            # use the transformers dunder to get a TransformerPipeline
            trafo_pipeline = other * self.transformers_
            # then stick the expanded pipeline in a SklearnClassifierPipeline
            new_pipeline = SklearnClassifierPipeline(
                classifier=self.classifier,
                transformers=trafo_pipeline.steps,
            )
            return new_pipeline
        else:
            return NotImplemented

    def _convert_X_to_sklearn(self, X):
        """Convert a Table or Panel X to 2D numpy required by sklearn.

        Parameters
        ----------
        X : output of the fitted transformer pipeline, of Table or Panel scitype

        Returns
        -------
        2D np.ndarray suitable as ``X`` input for an sklearn classifier

        Raises
        ------
        TypeError
            if the transformer pipeline's ``scitype:transform-output`` tag is
            neither "Primitives" nor "Series"
        """
        X_scitype = self.transformers_.get_tag("scitype:transform-output")
        # if X_scitype is Primitives, output is Table, convert to 2D numpy array
        if X_scitype == "Primitives":
            Xt = convert_to(X, to_type="numpy2D", as_scitype="Table")
        # if X_scitype is Series, output is Panel, convert to 2D numpy array (numpyflat)
        elif X_scitype == "Series":
            Xt = convert_to(X, to_type="numpyflat", as_scitype="Panel")
        else:
            # bugfix: the message previously named the classifier, but the tag
            # is read from the transformer pipeline, not the classifier
            raise TypeError(
                "unexpected X output type of the transformer pipeline in "
                f"{type(self).__name__}, "
                f'in tag "scitype:transform-output", found "{X_scitype}", '
                'expected one of "Primitives" or "Series"'
            )
        return Xt

    def _fit(self, X, y):
        """Fit time series classifier to training data.

        Parameters
        ----------
        X : Training data of type self.get_tag("X_inner_mtype")
        y : array-like, shape = [n_instances] - the class labels

        Returns
        -------
        self : reference to self.

        State change
        ------------
        creates fitted model (attributes ending in "_")
        """
        Xt = self.transformers_.fit_transform(X=X, y=y)
        Xt_sklearn = self._convert_X_to_sklearn(Xt)
        self.classifier_.fit(Xt_sklearn, y)
        return self

    def _predict(self, X) -> np.ndarray:
        """Predict labels for sequences in X.

        Parameters
        ----------
        X : data not used in training, of type self.get_tag("X_inner_mtype")

        Returns
        -------
        y : predictions of labels for X, np.ndarray
        """
        Xt = self.transformers_.transform(X=X)
        Xt_sklearn = self._convert_X_to_sklearn(Xt)
        return self.classifier_.predict(Xt_sklearn)

    def _predict_proba(self, X) -> np.ndarray:
        """Predict label probabilities for sequences in X.

        Delegates to the sklearn classifier's ``predict_proba`` if available;
        otherwise falls back to ``BaseClassifier._predict_proba``, which
        assigns probability 1 to the predicted class and 0 to all others.

        Parameters
        ----------
        X : data to predict y with, of type self.get_tag("X_inner_mtype")

        Returns
        -------
        y : predictions of probabilities for class values of X, np.ndarray
        """
        Xt = self.transformers_.transform(X)
        if hasattr(self.classifier_, "predict_proba"):
            Xt_sklearn = self._convert_X_to_sklearn(Xt)
            return self.classifier_.predict_proba(Xt_sklearn)
        else:
            # if sklearn classifier does not have predict_proba
            return BaseClassifier._predict_proba(self, X)
[文档] def get_params(self, deep=True): """Get parameters of estimator in ``transformers``. Parameters ---------- deep : boolean, optional, default=True If True, will return the parameters for this estimator and contained sub-objects that are estimators. Returns ------- params : mapping of string to any Parameter names mapped to their values. """ params = dict() trafo_params = self._get_params("_transformers", deep=deep) params.update(trafo_params) return params
[文档] def set_params(self, **kwargs): """Set the parameters of estimator in ``transformers``. Valid parameter keys can be listed with ``get_params()``. Returns ------- self : returns an instance of self. """ if "classifier" in kwargs.keys(): if not is_sklearn_classifier(kwargs["classifier"]): raise TypeError('"classifier" arg must be an sklearn classifier') trafo_keys = self._get_params("_transformers", deep=True).keys() classif_keys = self.classifier.get_params(deep=True).keys() trafo_args = self._subset_dict_keys(dict_to_subset=kwargs, keys=trafo_keys) classif_args = self._subset_dict_keys( dict_to_subset=kwargs, keys=classif_keys, prefix="classifier" ) if len(classif_args) > 0: self.classifier.set_params(**classif_args) if len(trafo_args) > 0: self._set_params("_transformers", **trafo_args) return self
[文档] @classmethod def get_test_params(cls, parameter_set="default"): """Return testing parameter settings for the estimator. Parameters ---------- parameter_set : str, default="default" Name of the set of test parameters to return, for use in tests. If no special parameters are defined for a value, will return ``"default"`` set. For classifiers, a "default" set of parameters should be provided for general testing, and a "results_comparison" set for comparing against previously recorded results if the general set does not produce suitable probabilities to compare against. Returns ------- params : dict or list of dict, default={} Parameters to create testing instances of the class. Each dict are parameters to construct an "interesting" test instance, i.e., ``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test instance. ``create_test_instance`` uses the first (or only) dictionary in ``params``. """ from sklearn.neighbors import KNeighborsClassifier from sktime.transformations.series.exponent import ExponentTransformer from sktime.transformations.series.summarize import SummaryTransformer # example with series-to-series transformer before sklearn classifier t1 = ExponentTransformer(power=2) t2 = ExponentTransformer(power=0.5) c = KNeighborsClassifier() params1 = {"transformers": [t1, t2], "classifier": c} # example with series-to-primitive transformer before sklearn classifier t1 = ExponentTransformer(power=2) t2 = SummaryTransformer() c = KNeighborsClassifier() params2 = {"transformers": [t1, t2], "classifier": c} # construct without names return [params1, params2]