from types import GeneratorType
from typing import List, MutableSequence, Union
import numpy as np
import pandas as pd
from sklearn.base import is_classifier
from sklearn.metrics import get_scorer
from sklearn.model_selection import check_cv, cross_validate
from sklearn.utils.validation import _check_sample_weight, check_random_state
from feature_engine._check_init_parameters.check_variables import (
_check_variables_input_value,
)
from feature_engine._docstrings.fit_attributes import (
_feature_names_in_docstring,
_n_features_in_docstring,
)
from feature_engine._docstrings.init_parameters.selection import (
_confirm_variables_docstring,
)
from feature_engine._docstrings.methods import _fit_transform_docstring
from feature_engine._docstrings.selection._docstring import (
_cv_docstring,
_estimator_docstring,
_features_to_drop_docstring,
_fit_docstring,
_get_support_docstring,
_initial_model_performance_docstring,
_scoring_docstring,
_threshold_docstring,
_transform_docstring,
_variables_attribute_docstring,
_variables_numerical_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X_y
from feature_engine.selection.base_selector import BaseSelector
from feature_engine.tags import _return_tags
from .base_selection_functions import _select_numerical_variables
# Type alias used to annotate the `variables` init parameter: None, a single
# int or str, or a list whose items are str or int.
Variables = Union[None, int, str, List[Union[str, int]]]
@Substitution(
    estimator=_estimator_docstring,
    scoring=_scoring_docstring,
    threshold=_threshold_docstring,
    cv=_cv_docstring,
    variables=_variables_numerical_docstring,
    confirm_variables=_confirm_variables_docstring,
    initial_model_performance_=_initial_model_performance_docstring,
    features_to_drop_=_features_to_drop_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit=_fit_docstring,
    transform=_transform_docstring,
    fit_transform=_fit_transform_docstring,
    get_support=_get_support_docstring,
)
class SelectByShuffling(BaseSelector):
    """
    SelectByShuffling() selects features by determining the drop in machine learning
    model performance when each feature's values are randomly shuffled.

    If the variables are important, a random permutation of their values will
    decrease dramatically the machine learning model performance. Contrarily, the
    permutation of the values should have little to no effect on the model performance
    metric we are assessing if the feature is not predictive.

    The SelectByShuffling() first trains a machine learning model utilising all
    features. Next, it shuffles the values of 1 feature, obtains a prediction with the
    pre-trained model, and determines the performance drop (if any). If the drop in
    performance is bigger than a threshold then the feature is retained, otherwise
    removed. It continues until all features have been shuffled and examined.

    The user can determine the model for which performance drop after feature shuffling
    should be assessed. The user also determines the threshold in performance under
    which a feature will be removed, and the performance metric to evaluate.

    Model training and performance calculation are done with cross-validation.

    More details in the :ref:`User Guide <feature_shuffling>`.

    Parameters
    ----------
    {estimator}

    {variables}

    {scoring}

    {threshold}

    {cv}

    random_state: int, default=None
        Controls the randomness when shuffling features.

    {confirm_variables}

    Attributes
    ----------
    {initial_model_performance_}

    performance_drifts_:
        Dictionary with the performance drift per shuffled feature.

    performance_drifts_std_:
        Dictionary with the standard deviation of performance drift per shuffled
        feature.

    {features_to_drop_}

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    {fit}

    {fit_transform}

    {get_support}

    {transform}

    Notes
    -----
    This transformer is a similar concept to the `permutation_importance` from
    Scikit-learn. The function in Scikit-learn is used to evaluate feature importance
    instead of to select features.

    See Also
    --------
    sklearn.inspection.permutation_importance

    Examples
    --------

    >>> import pandas as pd
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from feature_engine.selection import SelectByShuffling
    >>> X = pd.DataFrame(dict(x1 = [1000,2000,1000,1000,2000,3000],
    ...                      x2 = [2,4,3,1,2,2],
    ...                      x3 = [1,1,1,0,0,0],
    ...                      x4 = [1,2,1,1,0,1],
    ...                      x5 = [1,1,1,1,1,1]))
    >>> y = pd.Series([1,0,0,1,1,0])
    >>> sbs = SelectByShuffling(
    ...     RandomForestClassifier(random_state=42),
    ...     cv=2,
    ...     random_state=42,
    ... )
    >>> sbs.fit_transform(X, y)
       x2  x4  x5
    0   2   1   1
    1   4   2   1
    2   3   1   1
    3   1   1   1
    4   2   0   1
    5   2   1   1
    """

    def __init__(
        self,
        estimator,
        scoring: str = "roc_auc",
        cv=3,
        threshold: Union[float, int, None] = None,
        variables: Variables = None,
        random_state: Union[int, None] = None,
        confirm_variables: bool = False,
    ):
        # Compare against None explicitly: a truthiness test would let falsy
        # non-numeric values (e.g. "" or []) slip through the type check.
        if threshold is not None and not isinstance(threshold, (int, float)):
            raise ValueError("threshold can only be integer or float or None")

        super().__init__(confirm_variables)
        self.variables = _check_variables_input_value(variables)
        self.estimator = estimator
        self.scoring = scoring
        self.threshold = threshold
        self.cv = cv
        self.random_state = random_state

    def fit(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        sample_weight: Union[MutableSequence, None] = None,
    ):
        """
        Find the important features.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The input dataframe.

        y: array-like of shape (n_samples)
            Target variable. Required to train the estimator.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted. When
            provided, the weights are forwarded to the estimator's fit through
            the cross-validation routine.

        Returns
        -------
        self
            The fitted selector. Features whose performance drift is below the
            threshold are stored in `features_to_drop_`. If `threshold` is None,
            the mean drift across the examined features is used as the threshold.
        """
        X, y = check_X_y(X, y)

        # reset the index: the shuffled column is re-indexed from 0 and the
        # validation folds are applied positionally with .iloc
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        self.variables_ = _select_numerical_variables(
            X, self.variables, self.confirm_variables
        )

        # check that there are more than 1 variable to select from
        self._check_variable_number()

        # a generator of splits can only be consumed once; materialise it
        cv = list(self.cv) if isinstance(self.cv, GeneratorType) else self.cv

        # Only forward sample_weight when the user supplied it, so estimators
        # whose fit() does not take sample_weight are not handed the keyword.
        params = None if sample_weight is None else {"sample_weight": sample_weight}

        X_selected = X[self.variables_]

        # train model with all features and cross-validation
        model = cross_validate(
            estimator=self.estimator,
            X=X_selected,
            y=y,
            cv=cv,
            return_estimator=True,
            return_indices=True,
            scoring=self.scoring,
            params=params,
        )

        # store initial model performance
        self.initial_model_performance_ = model["test_score"].mean()

        # Use the exact validation folds that cross_validate used. Re-splitting
        # with the cv object could yield different folds for splitters that
        # shuffle without a fixed seed, and each fitted estimator would then be
        # scored on rows it was trained on.
        validation_indices = model["indices"]["test"]

        # get performance metric
        scorer = get_scorer(self.scoring)

        # seed
        random_state = check_random_state(self.random_state)

        # dicts to collect each feature's performance drift after shuffling
        self.performance_drifts_ = {}
        self.performance_drifts_std_ = {}

        # shuffle features and save feature performance drift into a dict
        for feature in self.variables_:
            X_shuffled = X_selected.copy()

            # shuffle individual feature
            X_shuffled[feature] = (
                X_shuffled[feature]
                .sample(frac=1, random_state=random_state)
                .reset_index(drop=True)
            )

            # determine the performance with the shuffled feature, scoring
            # each fold's fitted estimator on that fold's validation rows
            fold_scores = [
                scorer(m, X_shuffled.iloc[idx], y.iloc[idx])
                for m, idx in zip(model["estimator"], validation_indices)
            ]

            # determine drift in performance
            # Note, sklearn negates the log and error scores, so no need to
            # manually do the inversion
            # https://scikit-learn.org/stable/modules/model_evaluation.html
            # #the-scoring-parameter-defining-model-evaluation-rules
            performance_drift = self.initial_model_performance_ - np.mean(fold_scores)

            # Save feature and performance drift
            self.performance_drifts_[feature] = performance_drift
            self.performance_drifts_std_[feature] = np.std(fold_scores)

        # select features
        # `is None` (not truthiness) so that an explicit threshold of 0 is
        # honoured instead of being replaced by the mean drift
        if self.threshold is None:
            threshold = pd.Series(self.performance_drifts_).mean()
        else:
            threshold = self.threshold

        self.features_to_drop_ = [
            f for f, drift in self.performance_drifts_.items() if drift < threshold
        ]

        # save input features
        self._get_feature_names_in(X)

        return self

    def _more_tags(self):
        """Return the tags dictionary from `_return_tags()` with the
        selector-specific entries and expected-failure checks added."""
        tags_dict = _return_tags()
        tags_dict["variables"] = "numerical"
        tags_dict["requires_y"] = True
        # add additional test that fails
        tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA"
        tags_dict["_xfail_checks"][
            "check_parameters_default_constructible"
        ] = "transformer has 1 mandatory parameter"

        msg = "transformers need more than 1 feature to work"
        tags_dict["_xfail_checks"]["check_fit2d_1feature"] = msg

        return tags_dict