Source code for feature_engine.selection.single_feature_performance

import warnings
from typing import List, Union

import pandas as pd

from feature_engine._check_init_parameters.check_variables import (
    _check_variables_input_value,
)
from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
)
from feature_engine._docstrings.init_parameters.selection import (
    _confirm_variables_docstring,
)
from feature_engine._docstrings.methods import _fit_transform_docstring
from feature_engine._docstrings.selection._docstring import (
    _cv_docstring,
    _estimator_docstring,
    _features_to_drop_docstring,
    _fit_docstring,
    _get_support_docstring,
    _groups_docstring,
    _initial_model_performance_docstring,
    _scoring_docstring,
    _threshold_docstring,
    _transform_docstring,
    _variables_attribute_docstring,
    _variables_numerical_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X_y
from feature_engine.selection.base_selector import BaseSelector
from feature_engine.tags import _return_tags

from .base_selection_functions import (
    _select_numerical_variables,
    single_feature_performance,
)

Variables = Union[None, int, str, List[Union[str, int]]]


@Substitution(
    estimator=_estimator_docstring,
    scoring=_scoring_docstring,
    threshold=_threshold_docstring,
    cv=_cv_docstring,
    groups=_groups_docstring,
    variables=_variables_numerical_docstring,
    confirm_variables=_confirm_variables_docstring,
    initial_model_performance_=_initial_model_performance_docstring,
    features_to_drop_=_features_to_drop_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit=_fit_docstring,
    transform=_transform_docstring,
    fit_transform=_fit_transform_docstring,
    get_support=_get_support_docstring,
)
class SelectBySingleFeaturePerformance(BaseSelector):
    """
    SelectBySingleFeaturePerformance() selects features based on the performance of
    a machine learning model trained utilising a single feature. In other words, it
    trains a machine learning model for every single feature, then determines each
    model's performance. If the performance of the model is greater than a
    user-specified threshold, then the feature is retained, otherwise removed.

    The models are trained on each individual feature using cross-validation.
    The performance metric to evaluate and the machine learning model to train are
    specified by the user.

    More details in the :ref:`User Guide <single_feat_performance>`.

    Parameters
    ----------
    {estimator}

    {variables}

    {scoring}

    {threshold}

    {cv}

    {groups}

    {confirm_variables}

    Attributes
    ----------
    {features_to_drop_}

    feature_performance_:
        Dictionary with the single feature model performance per feature.

    feature_performance_std_:
        Dictionary with the standard deviation of the single feature model
        performance.

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    {fit}

    {fit_transform}

    {get_support}

    {transform}

    References
    ----------
    Selection based on single feature performance was used in Credit Risk modelling
    as discussed in the following talk at PyData London 2017:

    .. [1] Galli S. "Machine Learning in Financial Risk Assessment".
        https://www.youtube.com/watch?v=KHGGlozsRtA

    Examples
    --------
    >>> import pandas as pd
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from feature_engine.selection import SelectBySingleFeaturePerformance
    >>> X = pd.DataFrame(dict(x1 = [1000,2000,1000,1000,2000,3000],
    >>>                       x2 = [2,4,3,1,2,2],
    >>>                       x3 = [1,1,1,0,0,0],
    >>>                       x4 = [1,2,1,1,0,1],
    >>>                       x5 = [1,1,1,1,1,1]))
    >>> y = pd.Series([1,0,0,1,1,0])
    >>> sfp = SelectBySingleFeaturePerformance(
    >>>             RandomForestClassifier(random_state=42),
    >>>             cv=2)
    >>> sfp.fit_transform(X, y)
       x2  x3
    0   2   1
    1   4   1
    2   3   1
    3   1   0
    4   2   0
    5   2   0
    """

    def __init__(
        self,
        estimator,
        scoring: str = "roc_auc",
        cv=3,
        groups=None,
        threshold: Union[int, float, None] = None,
        variables: Variables = None,
        confirm_variables: bool = False,
    ):
        if threshold:
            if not isinstance(threshold, (int, float)):
                raise ValueError(
                    "`threshold` can only be integer, float or None. "
                    f"Got {threshold} instead."
                )

            if scoring == "roc_auc" and (threshold < 0.5 or threshold > 1):
                raise ValueError(
                    "`threshold` for roc-auc score should be between 0.5 and 1. "
                    f"Got {threshold} instead."
                )

            if scoring == "r2" and (threshold < 0 or threshold > 1):
                raise ValueError(
                    "`threshold` for r2 score should be between 0 and 1. "
                    f"Got {threshold} instead."
                )

        super().__init__(confirm_variables)

        self.variables = _check_variables_input_value(variables)
        self.estimator = estimator
        self.scoring = scoring
        self.threshold = threshold
        self.cv = cv
        self.groups = groups
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Determines model performance based on single features. Selects features
        whose performance is above the threshold.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The input dataframe.

        y: array-like of shape (n_samples)
            Target variable. Required to train the estimator.
        """
        # check input dataframe
        X, y = check_X_y(X, y)

        self.variables_ = _select_numerical_variables(
            X, self.variables, self.confirm_variables
        )

        if len(self.variables_) == 1 and self.threshold is None:
            raise ValueError(
                "When evaluating a single feature you need to manually set a value "
                "for the threshold. "
                f"The transformer is evaluating the performance of {self.variables_} "
                f"and the threshold was left to {self.threshold} when initializing "
                f"the transformer."
            )

        self.feature_performance_, self.feature_performance_std_ = (
            single_feature_performance(
                X=X,
                y=y,
                variables=self.variables_,
                estimator=self.estimator,
                cv=self.cv,
                groups=self.groups,
                scoring=self.scoring,
            )
        )

        # select features
        if not self.threshold:
            threshold = pd.Series(self.feature_performance_).mean()
        else:
            threshold = self.threshold

        self.features_to_drop_ = [
            f
            for f in self.feature_performance_.keys()
            if self.feature_performance_[f] < threshold
        ]

        # check we are not dropping all the columns in the df
        if len(self.features_to_drop_) == len(X.columns):
            warnings.warn("All features will be dropped, try changing the threshold.")

        # save input features
        self._get_feature_names_in(X)

        return self
    def _more_tags(self):
        tags_dict = _return_tags()
        tags_dict["variables"] = "numerical"
        tags_dict["requires_y"] = True
        # add additional test that fails
        tags_dict["_xfail_checks"][
            "check_parameters_default_constructible"
        ] = "transformer has 1 mandatory parameter"

        tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA"

        msg = "transformers need more than 1 feature to work"
        tags_dict["_xfail_checks"]["check_fit2d_1feature"] = msg

        return tags_dict
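
# A minimal usage sketch of the selector defined above. It shows how one model is
# fitted per feature with cross-validation and how features scoring below the
# threshold end up in `features_to_drop_`. The LogisticRegression estimator, the
# 0.6 threshold, and the toy data (X_demo, y_demo, columns x1-x4) are illustrative
# assumptions, not values prescribed by this module.
if __name__ == "__main__":
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    X_demo = pd.DataFrame(
        {
            "x1": rng.normal(size=100),
            "x2": rng.normal(size=100),
            "x3": rng.normal(size=100),
            "x4": rng.normal(size=100),
        }
    )
    # make the target depend mostly on x1, so x1 scores well on its own
    y_demo = pd.Series((X_demo["x1"] + 0.1 * rng.normal(size=100) > 0).astype(int))

    selector = SelectBySingleFeaturePerformance(
        estimator=LogisticRegression(),
        scoring="roc_auc",
        cv=3,
        threshold=0.6,
    )
    X_reduced = selector.fit_transform(X_demo, y_demo)

    print(selector.feature_performance_)  # cross-validated roc-auc per feature
    print(selector.features_to_drop_)     # features scoring below the threshold
    print(X_reduced.columns.tolist())     # features retained in the output frame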