Source code for feature_engine.selection.smart_correlation_selection

from types import GeneratorType
from typing import List, Union

import pandas as pd

from feature_engine._check_init_parameters.check_variables import (
    _check_variables_input_value,
)
from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
)
from feature_engine._docstrings.init_parameters.selection import (
    _confirm_variables_docstring,
)
from feature_engine._docstrings.methods import _fit_transform_docstring
from feature_engine._docstrings.selection._docstring import (
    _cv_docstring,
    _groups_docstring,
    _estimator_docstring,
    _get_support_docstring,
    _missing_values_docstring,
    _scoring_docstring,
    _variables_attribute_docstring,
    _variables_numerical_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import (
    _check_contains_inf,
    _check_contains_na,
    check_X,
)
from feature_engine.selection.base_selector import BaseSelector

from .base_selection_functions import (
    _select_numerical_variables,
    find_correlated_features,
    single_feature_performance,
)

Variables = Union[None, int, str, List[Union[str, int]]]
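# Illustrative note on the alias above: `variables` accepts None (select all
# numerical variables), a single variable name or position such as "age" or 0,
# or a list of names/positions such as ["age", "fare"].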


@Substitution(
    estimator=_estimator_docstring,
    scoring=_scoring_docstring,
    cv=_cv_docstring,
    groups=_groups_docstring,
    confirm_variables=_confirm_variables_docstring,
    variables=_variables_numerical_docstring,
    missing_values=_missing_values_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit_transform=_fit_transform_docstring,
    get_support=_get_support_docstring,
)
class SmartCorrelatedSelection(BaseSelector):
    """
    SmartCorrelatedSelection() finds groups of correlated features and then selects,
    from each group, a feature following certain criteria:

    - Feature with the least missing values.
    - Feature with the highest cardinality (greatest number of unique values).
    - Feature with the highest variance.
    - Feature with the highest importance according to an estimator.

    SmartCorrelatedSelection() returns a dataframe containing, from each group of
    correlated features, the selected variable, plus all the features that were not
    correlated to any other. Correlation is calculated with `pandas.corr()`.

    SmartCorrelatedSelection() works only with numerical variables. Categorical
    variables will need to be encoded to numerical or will be excluded from the
    analysis.

    More details in the :ref:`User Guide <smart_correlation>`.

    Parameters
    ----------
    {variables}

    method: string or callable, default='pearson'
        Can take 'pearson', 'spearman', 'kendall' or a callable. It refers to the
        correlation method to be used to identify the correlated features.

        - 'pearson': standard correlation coefficient
        - 'kendall': Kendall Tau correlation coefficient
        - 'spearman': Spearman rank correlation
        - callable: callable with input two 1d ndarrays and returning a float.

        For more details on this parameter visit the `pandas.corr()` documentation.

    threshold: float, default=0.8
        The correlation threshold above which a feature will be deemed correlated
        with another one and removed from the dataset.

    {missing_values}

    selection_method: str, default="missing_values"
        Takes the values "missing_values", "cardinality", "variance" and
        "model_performance".

        **"missing_values"**: keeps the feature from the correlated group with the
        least missing observations.

        **"cardinality"**: keeps the feature from the correlated group with the
        highest cardinality.

        **"variance"**: keeps the feature from the correlated group with the
        highest variance.

        **"model_performance"**: trains a machine learning model using each of the
        features in a correlated group and retains the feature with the highest
        importance.

    {estimator}

    {scoring}

    {cv}

    {groups}

    {confirm_variables}

    Attributes
    ----------
    correlated_feature_sets_:
        Groups of correlated features. Each list is a group of correlated features.

    correlated_feature_dict_: dict
        Dictionary containing the correlated feature groups. The key is the feature
        against which all other features were evaluated. The values are the features
        correlated with the key. Key + values should be the same as the set found in
        `correlated_feature_sets_`. We introduced this attribute in version 1.17.0
        because, from the set, it is not easy to see which feature will be retained
        and which ones will be removed. The key is retained, the values will be
        dropped.

    features_to_drop_:
        The correlated features to remove from the dataset.

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    fit:
        Find best feature from each correlated group.

    {fit_transform}

    {get_support}

    transform:
        Return selected features.
    Notes
    -----
    For brute-force correlation selection, check Feature-engine's
    DropCorrelatedFeatures().

    See Also
    --------
    pandas.corr
    feature_engine.selection.DropCorrelatedFeatures

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.selection import SmartCorrelatedSelection
    >>> X = pd.DataFrame(dict(x1 = [1,2,1,1],
    ...                       x2 = [2,4,3,1],
    ...                       x3 = [1, 0, 0, 0]))
    >>> scs = SmartCorrelatedSelection(threshold=0.7)
    >>> scs.fit_transform(X)
       x2  x3
    0   2   1
    1   4   0
    2   3   0
    3   1   0

    It is also possible to use alternative selection methods. Here, we select the
    feature with the highest variance:

    >>> X = pd.DataFrame(dict(x1 = [2,4,3,1],
    ...                       x2 = [1000,2000,1500,500],
    ...                       x3 = [1, 0, 0, 0]))
    >>> scs = SmartCorrelatedSelection(threshold=0.7, selection_method="variance")
    >>> scs.fit_transform(X)
         x2  x3
    0  1000   1
    1  2000   0
    2  1500   0
    3   500   0
    """

    def __init__(
        self,
        variables: Variables = None,
        method: str = "pearson",
        threshold: float = 0.8,
        missing_values: str = "ignore",
        selection_method: str = "missing_values",
        estimator=None,
        scoring: str = "roc_auc",
        cv=3,
        groups=None,
        confirm_variables: bool = False,
    ):
        if not isinstance(threshold, float) or threshold < 0 or threshold > 1:
            raise ValueError(
                f"`threshold` must be a float between 0 and 1. "
                f"Got {threshold} instead."
            )

        if missing_values not in ["raise", "ignore"]:
            raise ValueError(
                "missing_values takes only values 'raise' or 'ignore'. "
                f"Got {missing_values} instead."
            )

        if selection_method not in [
            "missing_values",
            "cardinality",
            "variance",
            "model_performance",
        ]:
            raise ValueError(
                "selection_method takes only values 'missing_values', 'cardinality', "
                f"'variance' or 'model_performance'. Got {selection_method} instead."
            )

        if selection_method == "model_performance" and estimator is None:
            raise ValueError(
                "Please provide an estimator, e.g., "
                "RandomForestClassifier or select another "
                "selection_method."
            )

        if selection_method == "missing_values" and missing_values == "raise":
            raise ValueError(
                "When `selection_method = 'missing_values'`, you need to set "
                f"`missing_values` to `'ignore'`. Got {missing_values} instead."
            )

        super().__init__(confirm_variables)

        self.variables = _check_variables_input_value(variables)
        self.method = method
        self.threshold = threshold
        self.missing_values = missing_values
        self.selection_method = selection_method
        self.estimator = estimator
        self.scoring = scoring
        self.cv = cv
        self.groups = groups
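    # Illustrative note, not part of the upstream source: `fit` sorts the
    # candidate features before handing them to `find_correlated_features`, and
    # within each correlated group the feature examined first becomes the
    # group's key, which is the feature that is retained. For example, with
    # selection_method="variance", features are ordered by descending std(), so
    # the highest-variance member of each correlated group is kept.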
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find the correlated feature groups. Determine which feature should be
        selected from each group.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas series. Default = None
            y is needed if selection_method == 'model_performance'.
        """
        # check input dataframe
        X = check_X(X)

        self.variables_ = _select_numerical_variables(
            X, self.variables, self.confirm_variables
        )

        # check that there is more than one variable to select from
        self._check_variable_number()

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        if self.selection_method == "model_performance" and y is None:
            raise ValueError(
                "When `selection_method = 'model_performance'` y is needed to "
                "fit the transformer."
            )

        if self.selection_method == "missing_values":
            features = (
                X[self.variables_]
                .isnull()
                .sum()
                .sort_values(ascending=True)
                .index.to_list()
            )

        elif self.selection_method == "variance":
            features = (
                X[self.variables_].std().sort_values(ascending=False).index.to_list()
            )

        elif self.selection_method == "cardinality":
            features = (
                X[self.variables_]
                .nunique()
                .sort_values(ascending=False)
                .index.to_list()
            )

        else:
            features = sorted(self.variables_)

        correlated_groups, features_to_drop, correlated_dict = (
            find_correlated_features(X, features, self.method, self.threshold)
        )

        # select best performing feature according to estimator
        if self.selection_method == "model_performance":
            correlated_dict = dict()
            cv = list(self.cv) if isinstance(self.cv, GeneratorType) else self.cv
            for feature_group in correlated_groups:
                feature_performance, _ = single_feature_performance(
                    X=X,
                    y=y,
                    variables=feature_group,
                    estimator=self.estimator,
                    cv=cv,
                    groups=self.groups,
                    scoring=self.scoring,
                )
                # get most important feature
                f_i = (
                    pd.Series(feature_performance)
                    .sort_values(ascending=False)
                    .index[0]
                )
                correlated_dict[f_i] = feature_group.difference({f_i})

            # convoluted way to pick up the variables from the sets in the
            # order shown in the dictionary. Helps make transformer deterministic
            features_to_drop = [
                variable
                for set_ in correlated_dict.values()
                for variable in sorted(set_)
            ]

        self.features_to_drop_ = features_to_drop
        self.correlated_feature_sets_ = correlated_groups
        self.correlated_feature_dict_ = correlated_dict

        # save input features
        self._get_feature_names_in(X)

        return self
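A minimal usage sketch of the `selection_method="model_performance"` path, which
the docstring examples above do not cover. The toy dataset, the
`RandomForestClassifier` estimator, and the `cv=2` setting are illustrative
assumptions, not part of the module:

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from feature_engine.selection import SmartCorrelatedSelection

    # Toy data: x1 and x2 are strongly correlated; x3 is not correlated with
    # either above the 0.7 threshold.
    X = pd.DataFrame({
        "x1": [1, 2, 1, 1, 3, 2, 1, 3],
        "x2": [2, 4, 3, 1, 6, 4, 2, 5],
        "x3": [1, 0, 0, 0, 1, 1, 0, 1],
    })
    y = pd.Series([0, 1, 0, 0, 1, 1, 0, 1])

    scs = SmartCorrelatedSelection(
        threshold=0.7,
        selection_method="model_performance",
        estimator=RandomForestClassifier(random_state=0),
        scoring="roc_auc",
        cv=2,  # small cv for the tiny illustrative dataset
    )
    Xt = scs.fit_transform(X, y)

    print(scs.correlated_feature_dict_)  # retained feature -> dropped features
    print(Xt.columns.to_list())

With this data, `x1` and `x2` form the only correlated group, so one of them is
retained according to single-feature model performance, while `x3` passes
through untouched.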