# Source code for feature_engine.encoding.decision_tree

# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

from typing import List, Optional, Union

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.utils.multiclass import check_classification_targets, type_of_target

from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
    _variables_attribute_docstring,
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
    _variables_categorical_docstring,
)
from feature_engine._docstrings.init_parameters.encoders import (
    _ignore_format_docstring,
    _unseen_docstring,
)
from feature_engine._docstrings.methods import (
    _fit_transform_docstring,
    _inverse_transform_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import _check_contains_na, check_X_y
from feature_engine.discretisation import DecisionTreeDiscretiser
from feature_engine.encoding._helper_functions import check_parameter_unseen
from feature_engine.encoding.base_encoder import (
    CategoricalInitMixin,
    CategoricalMethodsMixin,
)
from feature_engine.encoding.ordinal import OrdinalEncoder
from feature_engine.tags import _return_tags


# The shared `unseen` parameter description is extended here with the extra
# 'encode' option that is documented for the encoder defined in this module.
_unseen_docstring += (
    " If `'encode'` unseen categories will be encoded as `fill_value`."
)


@Substitution(
    ignore_format=_ignore_format_docstring,
    variables=_variables_categorical_docstring,
    variables_=_variables_attribute_docstring,
    unseen=_unseen_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit_transform=_fit_transform_docstring,
    inverse_transform=_inverse_transform_docstring,
)
class DecisionTreeEncoder(CategoricalInitMixin, CategoricalMethodsMixin):
    """
    The DecisionTreeEncoder() encodes categorical variables with the predictions
    of a decision tree.

    The encoder fits a single feature decision tree to predict the target, and
    with that, it creates mappings from category to prediction value. Then, it
    uses these mappings to replace the categories of the feature. The encoder
    trains a decision tree per feature to encode.

    The DecisionTreeEncoder() will encode only categorical variables by default
    (type 'object' or 'categorical'). You can pass a list of variables to encode
    or the encoder will find and encode all categorical variables.

    With `ignore_format=True` you have the option to encode numerical variables
    as well. In this case, you can either enter the list of variables to encode,
    or the transformer will automatically select all variables.

    More details in the :ref:`User Guide <decisiontree_encoder>`.

    Parameters
    ----------
    encoding_method: str, default='arbitrary'
        The method used to encode the categories to numerical values before
        fitting the decision tree.

        **'ordered'**: the categories are numbered in ascending order according
        to the target mean value per category.

        **'arbitrary'** : categories are numbered arbitrarily.

    cv: int, cross-validation generator or an iterable, default=3
        Determines the cross-validation splitting strategy. Possible inputs for
        cv are:

            - None, to use cross_validate's default 5-fold cross validation

            - int, to specify the number of folds in a (Stratified)KFold,

            - CV splitter
                - (https://scikit-learn.org/stable/glossary.html#term-CV-splitter)

            - An iterable yielding (train, test) splits as arrays of indices.

        For int/None inputs, if the estimator is a classifier and y is either
        binary or multiclass, StratifiedKFold is used. In all other cases, KFold
        is used. These splitters are instantiated with `shuffle=False` so the
        splits will be the same across calls. For more details check
        Scikit-learn's `cross_validate`'s documentation.

    scoring: str, default='neg_mean_squared_error'
        Desired metric to optimise the performance for the decision tree. Comes
        from sklearn.metrics. See the DecisionTreeRegressor or
        DecisionTreeClassifier model evaluation documentation for more options:
        https://scikit-learn.org/stable/modules/model_evaluation.html

    param_grid: dictionary, default=None
        The hyperparameters for the decision tree to test with a grid search. The
        `param_grid` can contain any of the permitted hyperparameters for
        Scikit-learn's DecisionTreeRegressor() or DecisionTreeClassifier(). If
        None, then param_grid will optimise the 'max_depth' over `[1, 2, 3, 4]`.

    regression: boolean, default=True
        Indicates whether the encoder should train a regression or a
        classification decision tree.

    random_state: int, default=None
        The random_state to initialise the training of the decision tree. It is
        one of the parameters of the Scikit-learn's DecisionTreeRegressor() or
        DecisionTreeClassifier(). For reproducibility it is recommended to set
        the random_state to an integer.

    {variables}

    {ignore_format}

    precision: int, default=None
        The precision at which to store and display the category mappings. In
        other words, the number of decimals after the decimal point for the tree
        predictions.

    {unseen}

    fill_value: float, default=None
        The value used to encode unseen categories. Only used when
        `unseen='encode'`.

    Attributes
    ----------
    encoder_dict_:
        Dictionary with the prediction per category, per variable.

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    fit:
        Fit a decision tree per variable.

    {fit_transform}

    {inverse_transform}

    transform:
        Replace categorical variable by the predictions of the decision tree.

    Notes
    -----
    The authors designed this method originally to work with numerical variables.
    We can replace numerical variables by the predictions of a decision tree
    utilising the DecisionTreeDiscretiser(). Here, we extend this functionality
    to work also with categorical variables.

    See Also
    --------
    sklearn.tree.DecisionTreeRegressor
    sklearn.tree.DecisionTreeClassifier
    feature_engine.discretisation.DecisionTreeDiscretiser
    feature_engine.encoding.RareLabelEncoder
    feature_engine.encoding.OrdinalEncoder

    References
    ----------
    .. [1] Niculescu-Mizil, et al. "Winning the KDD Cup Orange Challenge with
        Ensemble Selection". JMLR: Workshop and Conference Proceedings 7: 23-34.
        KDD 2009
        http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf

    Examples
    --------

    >>> import pandas as pd
    >>> from feature_engine.encoding import DecisionTreeEncoder
    >>> X = pd.DataFrame(dict(x1 = [1,2,3,4,5], x2 = ["b", "b", "b", "a", "a"]))
    >>> y = pd.Series([2.2,4, 1.5, 3.2, 1.1])
    >>> dte = DecisionTreeEncoder(cv=2)
    >>> dte.fit(X, y)
    >>> dte.transform(X)
       x1        x2
    0   1  2.566667
    1   2  2.566667
    2   3  2.566667
    3   4  2.150000
    4   5  2.150000

    You can also use it for classification by using `regression=False`.

    >>> y = pd.Series([0,1,1,1,0])
    >>> dte = DecisionTreeEncoder(regression=False, cv=2)
    >>> dte.fit(X, y)
    >>> dte.transform(X)
       x1        x2
    0   1  0.666667
    1   2  0.666667
    2   3  0.666667
    3   4  0.500000
    4   5  0.500000
    """

    def __init__(
        self,
        encoding_method: str = "arbitrary",
        cv=3,
        scoring: str = "neg_mean_squared_error",
        param_grid: Optional[dict] = None,
        regression: bool = True,
        random_state: Optional[int] = None,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        ignore_format: bool = False,
        precision: Optional[int] = None,
        unseen: str = "raise",
        fill_value: Optional[float] = None,
    ) -> None:
        """
        Validate the init parameters and store them as attributes.

        Raises
        ------
        ValueError
            If `encoding_method` is not 'ordered' or 'arbitrary', if
            `unseen='encode'` and `fill_value` is not a number, or if
            `precision` is neither None nor a non-negative integer.
        """
        if encoding_method not in ["ordered", "arbitrary"]:
            raise ValueError(
                "`encoding_method` takes only values 'ordered' and 'arbitrary'."
                f" Got {encoding_method} instead."
            )

        # A numeric replacement value is mandatory when unseen categories are
        # to be encoded.
        if unseen == "encode" and (
            fill_value is None or not isinstance(fill_value, (int, float))
        ):
            raise ValueError(
                "When `unseen='encode'` you need to pass a number to `fill_value`. "
                f"Got {fill_value} instead."
            )

        if precision is not None and (not isinstance(precision, int) or precision < 0):
            raise ValueError(
                "Parameter `precision` takes integers or None. "
                f"Got {precision} instead."
            )

        check_parameter_unseen(unseen, ["ignore", "raise", "encode"])
        super().__init__(variables, ignore_format)
        self.encoding_method = encoding_method
        self.cv = cv
        self.scoring = scoring
        self.regression = regression
        self.param_grid = param_grid
        self.random_state = random_state
        self.precision = precision
        self.unseen = unseen
        self.fill_value = fill_value

    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Fit a decision tree per variable.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the categorical variables.

        y : pandas series.
            The target variable. Required to train the decision tree and for
            ordered ordinal encoding.

        Returns
        -------
        self
            The fitted encoder.

        Raises
        ------
        ValueError
            If `regression` is True and the target is binary.
        """
        X, y = check_X_y(X, y)

        # confirm model type and target variables are compatible.
        if self.regression is True:
            if type_of_target(y) == "binary":
                raise ValueError(
                    "Trying to fit a regression to a binary target is not "
                    "allowed by this transformer. Check the target values "
                    "or set regression to False."
                )
        else:
            check_classification_targets(y)

        variables_ = self._check_or_select_variables(X)
        _check_contains_na(X, variables_)

        param_grid = self._assign_param_grid()

        # First the categories are replaced by numbers, then a decision tree is
        # trained on each of the resulting numerical variables.
        encoder = OrdinalEncoder(
            encoding_method=self.encoding_method,
            variables=variables_,
            missing_values="raise",
            ignore_format=self.ignore_format,
        )

        tree = DecisionTreeDiscretiser(
            cv=self.cv,
            scoring=self.scoring,
            variables=variables_,
            param_grid=param_grid,
            regression=self.regression,
            random_state=self.random_state,
        )

        # pipeline for the encoder
        pipe = Pipeline(
            [
                ("encoder", encoder),
                ("tree", tree),
            ]
        )

        Xt = pipe.fit_transform(X, y)

        # Pair each original category with the value the pipeline produced for
        # it, optionally rounded to the requested precision.
        encoder_ = {}
        for var in variables_:
            predictions = Xt[var]
            if self.precision is not None:
                predictions = np.round(predictions, self.precision)
            encoder_[var] = dict(zip(X[var], predictions))

        if self.unseen == "encode":
            self._unseen = self.fill_value

        self.encoder_dict_ = encoder_
        self.variables_ = variables_
        self._get_feature_names_in(X)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Replace categorical variables by the predictions of the decision tree.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_new : pandas dataframe of shape = [n_samples, n_features].
            Dataframe with variables encoded with decision tree predictions.
        """
        X = self._check_transform_input_and_state(X)
        _check_contains_na(X, self.variables_)
        X = self._encode(X)
        return X

    def _assign_param_grid(self):
        """Return the user supplied `param_grid`, or the default grid over
        'max_depth' when none (or an empty one) was given."""
        return self.param_grid or {"max_depth": [1, 2, 3, 4]}

    def _more_tags(self):
        """Return the estimator tags, adjusted for this encoder."""
        tags_dict = _return_tags()
        tags_dict["variables"] = "categorical"
        tags_dict["requires_y"] = True
        # the below test will fail because sklearn requires to check for inf, but
        # you can't check inf of categorical data, numpy returns an error.
        # so we need to leave without this test
        tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA"
        return tags_dict