feature_engine.selection.shuffle_features 源代码

from types import GeneratorType
from typing import List, MutableSequence, Union

import numpy as np
import pandas as pd
from sklearn.base import is_classifier
from sklearn.metrics import get_scorer
from sklearn.model_selection import check_cv, cross_validate
from sklearn.utils.validation import _check_sample_weight, check_random_state

from feature_engine._check_init_parameters.check_variables import (
    _check_variables_input_value,
)
from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
)
from feature_engine._docstrings.init_parameters.selection import (
    _confirm_variables_docstring,
)
from feature_engine._docstrings.methods import _fit_transform_docstring
from feature_engine._docstrings.selection._docstring import (
    _cv_docstring,
    _estimator_docstring,
    _features_to_drop_docstring,
    _fit_docstring,
    _get_support_docstring,
    _initial_model_performance_docstring,
    _scoring_docstring,
    _threshold_docstring,
    _transform_docstring,
    _variables_attribute_docstring,
    _variables_numerical_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X_y
from feature_engine.selection.base_selector import BaseSelector
from feature_engine.tags import _return_tags

from .base_selection_functions import _select_numerical_variables

Variables = Union[None, int, str, List[Union[str, int]]]


[文档]@Substitution(
    estimator=_estimator_docstring,
    scoring=_scoring_docstring,
    threshold=_threshold_docstring,
    cv=_cv_docstring,
    variables=_variables_numerical_docstring,
    confirm_variables=_confirm_variables_docstring,
    initial_model_performance_=_initial_model_performance_docstring,
    features_to_drop_=_features_to_drop_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit=_fit_docstring,
    transform=_transform_docstring,
    fit_transform=_fit_transform_docstring,
    get_support=_get_support_docstring,
)
class SelectByShuffling(BaseSelector):
    """
    SelectByShuffling() selects features by determining the drop in machine learning
    model performance when each feature's values are randomly shuffled.

    If the variables are important, a random permutation of their values will
    decrease dramatically the machine learning model performance. Contrarily, the
    permutation of the values should have little to no effect on the model performance
    metric we are assessing if the feature is not predictive.

    The SelectByShuffling() first trains a machine learning model utilising all
    features. Next, it shuffles the values of 1 feature, obtains a prediction with the
    pre-trained model, and determines the performance drop (if any). If the drop in
    performance is bigger than a threshold then the feature is retained, otherwise
    removed. It continues until all features have been shuffled and examined.

    The user can determine the model for which performance drop after feature shuffling
    should be assessed. The user also determines the threshold in performance under
    which a feature will be removed, and the performance metric to evaluate.

    Model training and performance calculation are done with cross-validation.

    More details in the :ref:`User Guide <feature_shuffling>`.

    Parameters
    ----------
    {estimator}

    {variables}

    {scoring}

    {threshold}

    {cv}

    random_state: int, default=None
        Controls the randomness when shuffling features.

    {confirm_variables}

    Attributes
    ----------
    {initial_model_performance_}

    performance_drifts_:
        Dictionary with the performance drift per shuffled feature.

    performance_drifts_std_:
        Dictionary with the standard deviation of performance drift per shuffled
        feature.

    {features_to_drop_}

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    {fit}

    {fit_transform}

    {get_support}

    {transform}

    Notes
    -----
    This transformer is a similar concept to the `permutation_importance` from
    Scikit-learn. The function in Scikit-learn is used to evaluate feature importance
    instead of to select features.

    See Also
    --------
    sklearn.inspection.permutation_importance

    Examples
    --------

    >>> import pandas as pd
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from feature_engine.selection import SelectByShuffling
    >>> X = pd.DataFrame(dict(x1 = [1000,2000,1000,1000,2000,3000],
    >>>                     x2 = [2,4,3,1,2,2],
    >>>                     x3 = [1,1,1,0,0,0],
    >>>                     x4 = [1,2,1,1,0,1],
    >>>                     x5 = [1,1,1,1,1,1]))
    >>> y = pd.Series([1,0,0,1,1,0])
    >>> sbs = SelectByShuffling(
    >>>         RandomForestClassifier(random_state=42),
    >>>         cv=2,
    >>>         random_state=42,
    >>>       )
    >>> sbs.fit_transform(X, y)
       x2  x4  x5
    0   2   1   1
    1   4   2   1
    2   3   1   1
    3   1   1   1
    4   2   0   1
    5   2   1   1
    """

    def __init__(
        self,
        estimator,
        scoring: str = "roc_auc",
        cv=3,
        threshold: Union[float, int, None] = None,
        variables: Variables = None,
        random_state: Union[int, None] = None,
        confirm_variables: bool = False,
    ):

        if threshold and not isinstance(threshold, (int, float)):
            raise ValueError("threshold can only be integer or float or None")

        super().__init__(confirm_variables)

        self.variables = _check_variables_input_value(variables)
        self.estimator = estimator
        self.scoring = scoring
        self.threshold = threshold
        self.cv = cv
        self.random_state = random_state

[文档]    def fit(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        sample_weight: Union[MutableSequence, None] = None,
    ):
        """
        Find the important features.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
           The input dataframe.

        y: array-like of shape (n_samples)
           Target variable. Required to train the estimator.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
        """

        X, y = check_X_y(X, y)

        # reset the index
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        self.variables_ = _select_numerical_variables(
            X, self.variables, self.confirm_variables
        )

        # check that there are more than 1 variable to select from
        self._check_variable_number()

        cv = list(self.cv) if isinstance(self.cv, GeneratorType) else self.cv

        # train model with all features and cross-validation
        model = cross_validate(
            estimator=self.estimator,
            X=X[self.variables_],
            y=y,
            cv=cv,
            return_estimator=True,
            scoring=self.scoring,
            params={"sample_weight": sample_weight},
        )

        # store initial model performance
        self.initial_model_performance_ = model["test_score"].mean()

        # extract the validation folds
        cv_ = check_cv(cv, y=y, classifier=is_classifier(self.estimator))
        validation_indices = [val_index for _, val_index in cv_.split(X, y)]

        # get performance metric
        scorer = get_scorer(self.scoring)

        # seed
        random_state = check_random_state(self.random_state)

        # dict to collect features and their performance_drift after shuffling
        self.performance_drifts_ = {}
        self.performance_drifts_std_ = {}

        # shuffle features and save feature performance drift into a dict
        for feature in self.variables_:

            X_shuffled = X[self.variables_].copy()

            # shuffle individual feature
            X_shuffled[feature] = (
                X_shuffled[feature]
                .sample(frac=1, random_state=random_state)
                .reset_index(drop=True)
            )

            # determine the performance with the shuffled feature
            performance = [
                scorer(m, X_shuffled.iloc[idx], y.iloc[idx])
                for m, idx in zip(model["estimator"], validation_indices)
            ]

            performance_std = np.std(performance)
            performance = np.mean(performance)

            # determine drift in performance
            # Note, sklearn negates the log and error scores, so no need to manually
            # do the inversion
            # https://scikit-learn.org/stable/modules/model_evaluation.html
            # (https://scikit-learn.org/stable/modules/model_evaluation.html
            # #the-scoring-parameter-defining-model-evaluation-rules)
            performance_drift = self.initial_model_performance_ - performance

            # Save feature and performance drift
            self.performance_drifts_[feature] = performance_drift
            self.performance_drifts_std_[feature] = performance_std

        # select features
        if not self.threshold:
            threshold = pd.Series(self.performance_drifts_).mean()
        else:
            threshold = self.threshold

        self.features_to_drop_ = [
            f
            for f in self.performance_drifts_.keys()
            if self.performance_drifts_[f] < threshold
        ]

        # save input features
        self._get_feature_names_in(X)

        return self

    def _more_tags(self):
        tags_dict = _return_tags()
        tags_dict["variables"] = "numerical"
        tags_dict["requires_y"] = True
        # add additional test that fails
        tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA"
        tags_dict["_xfail_checks"][
            "check_parameters_default_constructible"
        ] = "transformer has 1 mandatory parameter"

        msg = "transformers need more than 1 feature to work"
        tags_dict["_xfail_checks"]["check_fit2d_1feature"] = msg

        return tags_dict
This site uses cookies

feature_engine.selection.shuffle_features 源代码