Source code for feature_engine.selection.smart_correlation_selection

from types import GeneratorType
from typing import List, Union

import pandas as pd

from feature_engine._check_init_parameters.check_variables import (
    _check_variables_input_value,
)
from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
)
from feature_engine._docstrings.init_parameters.selection import (
    _confirm_variables_docstring,
)
from feature_engine._docstrings.methods import _fit_transform_docstring
from feature_engine._docstrings.selection._docstring import (
    _cv_docstring,
    _groups_docstring,
    _estimator_docstring,
    _get_support_docstring,
    _missing_values_docstring,
    _scoring_docstring,
    _variables_attribute_docstring,
    _variables_numerical_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import (
    _check_contains_inf,
    _check_contains_na,
    check_X,
)
from feature_engine.selection.base_selector import BaseSelector

from .base_selection_functions import (
    _select_numerical_variables,
    find_correlated_features,
    single_feature_performance,
)

Variables = Union[None, int, str, List[Union[str, int]]]
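# Illustrative note on the alias above: `variables` accepts None (select all
# numerical variables), a single variable name or position such as "age" or 0,
# or a list of names/positions such as ["age", "fare"].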


@Substitution(
    estimator=_estimator_docstring,
    scoring=_scoring_docstring,
    cv=_cv_docstring,
    groups=_groups_docstring,
    confirm_variables=_confirm_variables_docstring,
    variables=_variables_numerical_docstring,
    missing_values=_missing_values_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit_transform=_fit_transform_docstring,
    get_support=_get_support_docstring,
)
class SmartCorrelatedSelection(BaseSelector):
    """
    SmartCorrelatedSelection() finds groups of correlated features and then selects,
    from each group, a feature following certain criteria:

    - Feature with the least missing values.
    - Feature with the highest cardinality (greatest number of unique values).
    - Feature with the highest variance.
    - Feature with the highest importance according to an estimator.

    SmartCorrelatedSelection() returns a dataframe containing, from each group of
    correlated features, the selected variable, plus all the features that were not
    correlated to any other. Correlation is calculated with `pandas.corr()`.

    SmartCorrelatedSelection() works only with numerical variables. Categorical
    variables will need to be encoded to numerical or will be excluded from the
    analysis.

    More details in the :ref:`User Guide <smart_correlation>`.

    Parameters
    ----------
    {variables}

    method: string or callable, default='pearson'
        Can take 'pearson', 'spearman', 'kendall' or a callable. It refers to the
        correlation method to be used to identify the correlated features.

        - 'pearson': standard correlation coefficient
        - 'kendall': Kendall Tau correlation coefficient
        - 'spearman': Spearman rank correlation
        - callable: callable with input two 1d ndarrays and returning a float.

        For more details on this parameter visit the `pandas.corr()` documentation.

    threshold: float, default=0.8
        The correlation threshold above which a feature will be deemed correlated
        with another one and removed from the dataset.

    {missing_values}

    selection_method: str, default="missing_values"
        Takes the values "missing_values", "cardinality", "variance" and
        "model_performance".

        **"missing_values"**: keeps the feature from the correlated group with the
        least missing observations.

        **"cardinality"**: keeps the feature from the correlated group with the
        highest cardinality.

        **"variance"**: keeps the feature from the correlated group with the
        highest variance.

        **"model_performance"**: trains a machine learning model using each of the
        features in a correlated group and retains the feature with the highest
        importance.

    {estimator}

    {scoring}

    {cv}

    {groups}

    {confirm_variables}

    Attributes
    ----------
    correlated_feature_sets_:
        Groups of correlated features. Each list is a group of correlated features.

    correlated_feature_dict_: dict
        Dictionary containing the correlated feature groups. The key is the feature
        against which all other features were evaluated. The values are the features
        correlated with the key. Key + values should be the same as the set found in
        `correlated_feature_sets_`. We introduced this attribute in version 1.17.0
        because, from the set, it is not easy to see which feature will be retained
        and which ones will be removed. The key is retained, the values will be
        dropped.

    features_to_drop_:
        The correlated features to remove from the dataset.

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    fit:
        Find best feature from each correlated group.

    {fit_transform}

    {get_support}

    transform:
        Return selected features.
    Notes
    -----
    For brute-force correlation selection, check Feature-engine's
    DropCorrelatedFeatures().

    See Also
    --------
    pandas.corr
    feature_engine.selection.DropCorrelatedFeatures

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.selection import SmartCorrelatedSelection
    >>> X = pd.DataFrame(dict(x1 = [1,2,1,1],
    ...                       x2 = [2,4,3,1],
    ...                       x3 = [1, 0, 0, 0]))
    >>> scs = SmartCorrelatedSelection(threshold=0.7)
    >>> scs.fit_transform(X)
       x2  x3
    0   2   1
    1   4   0
    2   3   0
    3   1   0

    It is also possible to use alternative selection methods. Here, we select the
    feature with the highest variance:

    >>> X = pd.DataFrame(dict(x1 = [2,4,3,1],
    ...                       x2 = [1000,2000,1500,500],
    ...                       x3 = [1, 0, 0, 0]))
    >>> scs = SmartCorrelatedSelection(threshold=0.7, selection_method="variance")
    >>> scs.fit_transform(X)
         x2  x3
    0  1000   1
    1  2000   0
    2  1500   0
    3   500   0
    """

    def __init__(
        self,
        variables: Variables = None,
        method: str = "pearson",
        threshold: float = 0.8,
        missing_values: str = "ignore",
        selection_method: str = "missing_values",
        estimator=None,
        scoring: str = "roc_auc",
        cv=3,
        groups=None,
        confirm_variables: bool = False,
    ):
        if not isinstance(threshold, float) or threshold < 0 or threshold > 1:
            raise ValueError(
                f"`threshold` must be a float between 0 and 1. "
                f"Got {threshold} instead."
            )

        if missing_values not in ["raise", "ignore"]:
            raise ValueError(
                "missing_values takes only values 'raise' or 'ignore'. "
                f"Got {missing_values} instead."
            )

        if selection_method not in [
            "missing_values",
            "cardinality",
            "variance",
            "model_performance",
        ]:
            raise ValueError(
                "selection_method takes only values 'missing_values', 'cardinality', "
                f"'variance' or 'model_performance'. Got {selection_method} instead."
            )

        if selection_method == "model_performance" and estimator is None:
            raise ValueError(
                "Please provide an estimator, e.g., "
                "RandomForestClassifier or select another "
                "selection_method."
            )

        if selection_method == "missing_values" and missing_values == "raise":
            raise ValueError(
                "When `selection_method = 'missing_values'`, you need to set "
                f"`missing_values` to `'ignore'`. Got {missing_values} instead."
            )

        super().__init__(confirm_variables)

        self.variables = _check_variables_input_value(variables)
        self.method = method
        self.threshold = threshold
        self.missing_values = missing_values
        self.selection_method = selection_method
        self.estimator = estimator
        self.scoring = scoring
        self.cv = cv
        self.groups = groups
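    # Illustrative note, not part of the upstream source: `fit` sorts the
    # candidate features before handing them to `find_correlated_features`, and
    # within each correlated group the feature examined first becomes the
    # group's key, which is the feature that is retained. For example, with
    # selection_method="variance", features are ordered by descending std(), so
    # the highest-variance member of each correlated group is kept.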
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find the correlated feature groups. Determine which feature should be
        selected from each group.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas series. Default = None
            y is needed if selection_method == 'model_performance'.
        """
        # check input dataframe
        X = check_X(X)

        self.variables_ = _select_numerical_variables(
            X, self.variables, self.confirm_variables
        )

        # check that there is more than one variable to select from
        self._check_variable_number()

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        if self.selection_method == "model_performance" and y is None:
            raise ValueError(
                "When `selection_method = 'model_performance'` y is needed to "
                "fit the transformer."
            )

        if self.selection_method == "missing_values":
            features = (
                X[self.variables_]
                .isnull()
                .sum()
                .sort_values(ascending=True)
                .index.to_list()
            )

        elif self.selection_method == "variance":
            features = (
                X[self.variables_].std().sort_values(ascending=False).index.to_list()
            )

        elif self.selection_method == "cardinality":
            features = (
                X[self.variables_]
                .nunique()
                .sort_values(ascending=False)
                .index.to_list()
            )

        else:
            features = sorted(self.variables_)

        correlated_groups, features_to_drop, correlated_dict = (
            find_correlated_features(X, features, self.method, self.threshold)
        )

        # select best performing feature according to estimator
        if self.selection_method == "model_performance":
            correlated_dict = dict()
            cv = list(self.cv) if isinstance(self.cv, GeneratorType) else self.cv
            for feature_group in correlated_groups:
                feature_performance, _ = single_feature_performance(
                    X=X,
                    y=y,
                    variables=feature_group,
                    estimator=self.estimator,
                    cv=cv,
                    groups=self.groups,
                    scoring=self.scoring,
                )
                # get most important feature
                f_i = (
                    pd.Series(feature_performance)
                    .sort_values(ascending=False)
                    .index[0]
                )
                correlated_dict[f_i] = feature_group.difference({f_i})

            # convoluted way to pick up the variables from the sets in the
            # order shown in the dictionary. Helps make transformer deterministic
            features_to_drop = [
                variable
                for set_ in correlated_dict.values()
                for variable in sorted(set_)
            ]

        self.features_to_drop_ = features_to_drop
        self.correlated_feature_sets_ = correlated_groups
        self.correlated_feature_dict_ = correlated_dict

        # save input features
        self._get_feature_names_in(X)

        return self
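A minimal usage sketch of the `selection_method="model_performance"` path, which
the docstring examples above do not cover. The toy dataset, the
`RandomForestClassifier` estimator, and the `cv=2` setting are illustrative
assumptions, not part of the module:

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from feature_engine.selection import SmartCorrelatedSelection

    # Toy data: x1 and x2 are strongly correlated; x3 is not correlated with
    # either above the 0.7 threshold.
    X = pd.DataFrame({
        "x1": [1, 2, 1, 1, 3, 2, 1, 3],
        "x2": [2, 4, 3, 1, 6, 4, 2, 5],
        "x3": [1, 0, 0, 0, 1, 1, 0, 1],
    })
    y = pd.Series([0, 1, 0, 0, 1, 1, 0, 1])

    scs = SmartCorrelatedSelection(
        threshold=0.7,
        selection_method="model_performance",
        estimator=RandomForestClassifier(random_state=0),
        scoring="roc_auc",
        cv=2,  # small cv for the tiny illustrative dataset
    )
    Xt = scs.fit_transform(X, y)

    print(scs.correlated_feature_dict_)  # retained feature -> dropped features
    print(Xt.columns.to_list())

With this data, `x1` and `x2` form the only correlated group, so one of them is
retained according to single-feature model performance, while `x3` passes
through untouched.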