feature_engine.selection.probe_feature_selection 源代码

from typing import List, Union

import numpy as np
import pandas as pd

from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
)
from feature_engine._docstrings.init_parameters.selection import (
    _confirm_variables_docstring,
    _estimator_docstring,
)
from feature_engine._docstrings.methods import _fit_transform_docstring
from feature_engine._docstrings.selection._docstring import (
    _cv_docstring,
    _groups_docstring,
    _features_to_drop_docstring,
    _fit_docstring,
    _get_support_docstring,
    _scoring_docstring,
    _transform_docstring,
    _variables_attribute_docstring,
    _variables_numerical_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X_y
from feature_engine.selection.base_selector import BaseSelector
from feature_engine.tags import _return_tags

from .base_selection_functions import (
    _select_numerical_variables,
    find_feature_importance,
    single_feature_performance,
)

# Accepted forms of the `variables` init parameter: None, a single column
# label (int or str), or a list of column labels.
Variables = Union[None, int, str, List[Union[str, int]]]


@Substitution(
    estimator=_estimator_docstring,
    scoring=_scoring_docstring,
    cv=_cv_docstring,
    groups=_groups_docstring,
    confirm_variables=_confirm_variables_docstring,
    variables=_variables_numerical_docstring,
    feature_names_in_=_feature_names_in_docstring,
    features_to_drop_=_features_to_drop_docstring,
    variables_=_variables_attribute_docstring,
    n_features_in_=_n_features_in_docstring,
    fit=_fit_docstring,
    transform=_transform_docstring,
    fit_transform=_fit_transform_docstring,
    get_support=_get_support_docstring,
)
class ProbeFeatureSelection(BaseSelector):
    """
    ProbeFeatureSelection() generates one or more probe (i.e., random) features based
    on a user-selected distribution. The distribution options are 'normal', 'binary',
    'uniform', or 'all'. 'all' creates at least one feature for each of the
    three aforementioned distributions.

    Using cross validation, ProbeFeatureSelection() fits a Scikit-learn estimator
    to the provided variables plus the probe features. Next, it derives the
    feature importance for each variable and probe feature from the fitted model.

    Alternatively, ProbeFeatureSelection() fits a Scikit-learn estimator per feature
    and probe feature (single feature models), and then determines the performance
    returned by that model, using a metric of choice.

    Finally, ProbeFeatureSelection() selects the features whose importance is greater
    than those of the probes. In the case of there being more than one probe feature,
    ProbeFeatureSelection() takes the average feature importance of all the probe
    features.

    The variables whose feature importance is smaller than the feature importance of
    the probe feature(s) are dropped from the dataset.

    More details in the :ref:`User Guide <probe_features>`.

    Parameters
    ----------
    estimator: object
        A Scikit-learn estimator for regression or classification. If `collective=True`,
        the estimator must have either a `feature_importances_` or a `coef_` attribute
        after fitting.

    {variables}

    collective: bool, default=True
         Whether the feature importance should be derived from an estimator trained on
         the entire dataset (True), or trained using individual features (False).

    {scoring}

    n_probes: int, default=1
        Number of probe features to be created. Must be a positive integer. If
        distribution is 'all', n_probes must be a multiple of 3.

    distribution: str, default='normal'
        The distribution used to create the probe features. The options are
        'normal', 'binary', 'uniform', and 'all'. 'all' creates at least 1 or more
        probe features comprised of each distribution type, i.e., normal, binary,
        and uniform. The remaining options create `n_probes` features of the selected
        distribution.

    {cv}

    {groups}

    random_state: int, default=0
        Seed of the pseudo-random number generator used to create the probe
        features.

    {confirm_variables}

    Attributes
    ----------
    probe_features_:
        A dataframe comprised of the pseudo-randomly generated features based
        on the selected distribution.

    feature_importances_:
        Pandas Series with the feature importance. If `collective=True`, the feature
        importance is given by the coefficients of linear models or the importance
        derived from tree-based models. If `collective=False`, the feature importance
        is given by a performance metric returned by a model trained using that
        individual feature.

    feature_importances_std_:
        Pandas Series with the standard deviation of the feature importance.

    {features_to_drop_}

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    {fit}

    {fit_transform}

    {get_support}

    {transform}

    References
    ----------
    .. [1] Stoppiglia, et al. "Ranking a Random Feature for Variable and Feature
        Selection". JMLR: 1399-1414, 2003
        https://jmlr.org/papers/volume3/stoppiglia03a/stoppiglia03a.pdf

    Examples
    --------

    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.linear_model import LogisticRegression
    >>> from feature_engine.selection import ProbeFeatureSelection
    >>> X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    >>> sel = ProbeFeatureSelection(
    ...     estimator=LogisticRegression(),
    ...     scoring="roc_auc",
    ...     n_probes=3,
    ...     distribution="normal",
    ...     cv=3,
    ...     random_state=150,
    ... )
    >>> X_tr = sel.fit_transform(X, y)
    >>> print(X.shape, X_tr.shape)
    (569, 30) (569, 9)
    """

    def __init__(
        self,
        estimator,
        variables: Variables = None,
        collective: bool = True,
        scoring: str = "roc_auc",
        n_probes: int = 1,
        distribution: str = "normal",
        cv=5,
        groups=None,
        random_state: int = 0,
        confirm_variables: bool = False,
    ):
        """
        Validate and store the selector's hyperparameters.

        Raises
        ------
        ValueError
            If `collective` is not a bool, if `distribution` is not one of the
            supported options, if `n_probes` is not a positive integer, or if
            `distribution='all'` and `n_probes` is not a multiple of 3.
        """
        if not isinstance(collective, bool):
            raise ValueError(
                f"collective takes values True or False. Got {collective} instead."
            )

        if distribution not in ("normal", "binary", "uniform", "all"):
            raise ValueError(
                "distribution takes values 'normal', 'binary', 'uniform', or 'all'. "
                f"Got {distribution} instead."
            )

        # The type check must come before any arithmetic on n_probes; otherwise
        # a non-numeric value fails with a TypeError from the modulo operator
        # instead of the intended ValueError.
        if not isinstance(n_probes, int):
            raise ValueError(f"n_probes must be an integer. Got {n_probes} instead.")

        # Without at least one probe there is no threshold to compare against.
        if n_probes < 1:
            raise ValueError(
                f"n_probes must be a positive integer. Got {n_probes} instead."
            )

        if distribution == "all" and n_probes % 3 != 0:
            raise ValueError(
                "If distribution is 'all' the n_probes must be a multiple of 3. "
                f"Got {n_probes} instead."
            )

        super().__init__(confirm_variables)
        self.estimator = estimator
        self.variables = variables
        self.collective = collective
        self.scoring = scoring
        self.distribution = distribution
        self.cv = cv
        self.groups = groups
        self.n_probes = n_probes
        self.random_state = random_state

    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]

        y: array-like of shape (n_samples)
            Target variable. Required to train the estimator.

        Returns
        -------
        self
            The fitted selector.
        """
        # check input dataframe
        X, y = check_X_y(X, y)

        self.variables_ = _select_numerical_variables(
            X, self.variables, self.confirm_variables
        )

        # save input features
        self._get_feature_names_in(X)

        # create probe feature distributions
        self.probe_features_ = self._generate_probe_features(X.shape[0])

        # The probe features carry a default 0..n-1 index. X needs the same
        # index so that the column-wise concat pairs rows by position (e.g.
        # when X comes from a train/test split). Rebinding instead of
        # resetting in place keeps the object returned by check_X_y untouched.
        X = X.reset_index(drop=True)

        X_new = pd.concat([X[self.variables_], self.probe_features_], axis=1)

        if self.collective is True:
            # train model using entire dataset and derive feature importance
            f_importance_mean, f_importance_std = find_feature_importance(
                X=X_new,
                y=y,
                estimator=self.estimator,
                cv=self.cv,
                groups=self.groups,
                scoring=self.scoring,
            )
            self.feature_importances_ = f_importance_mean
            self.feature_importances_std_ = f_importance_std

        else:
            # trains a model per feature (single feature models)
            f_importance_mean, f_importance_std = single_feature_performance(
                X=X_new,
                y=y,
                variables=X_new.columns,
                estimator=self.estimator,
                cv=self.cv,
                groups=self.groups,
                scoring=self.scoring,
            )
            self.feature_importances_ = pd.Series(f_importance_mean)
            self.feature_importances_std_ = pd.Series(f_importance_std)

        # get features with lower importance than the probe features
        self.features_to_drop_ = self._get_features_to_drop()

        return self

    def _generate_probe_features(self, n_obs: int) -> pd.DataFrame:
        """
        Return a dataframe with the probe features drawn from the selected
        distribution.

        Parameters
        ----------
        n_obs: int
            Number of rows (observations) each probe feature must have.

        Returns
        -------
        pandas dataframe with one column per probe feature, named
        `gaussian_probe_<i>`, `binary_probe_<i>` or `uniform_probe_<i>`.
        """
        # A local generator yields the same stream as seeding the global NumPy
        # RNG with the same seed, but does not alter the process-wide random
        # state as a side effect of calling fit().
        rng = np.random.RandomState(self.random_state)

        probes = {}

        if self.distribution == "all":
            # one probe of each distribution per round; the draw order within
            # a round is fixed so results are reproducible for a given seed
            for i in range(self.n_probes // 3):
                probes[f"gaussian_probe_{i}"] = rng.normal(0, 3, n_obs)
                probes[f"binary_probe_{i}"] = rng.randint(0, 2, n_obs)
                probes[f"uniform_probe_{i}"] = rng.uniform(0, 1, n_obs)

        # when distribution is normal, binary, or uniform
        else:
            for i in range(self.n_probes):
                if self.distribution == "normal":
                    probes[f"gaussian_probe_{i}"] = rng.normal(0, 3, n_obs)

                elif self.distribution == "binary":
                    probes[f"binary_probe_{i}"] = rng.randint(0, 2, n_obs)

                else:
                    probes[f"uniform_probe_{i}"] = rng.uniform(0, 1, n_obs)

        return pd.DataFrame(probes)

    def _get_features_to_drop(self) -> List[Union[str, int]]:
        """
        Identify the variables that have a lower feature importance than the average
        feature importance of all the probe features.

        Returns
        -------
        List with the names of the variables in `variables_` whose importance is
        strictly smaller than the mean importance of the probe features.
        """
        # Taking the mean for any number of probes yields a scalar threshold;
        # with a single probe the mean is simply that probe's importance.
        probe_features_avg_importance = self.feature_importances_[
            self.probe_features_.columns
        ].values.mean()

        return [
            var
            for var in self.variables_
            if self.feature_importances_.loc[var] < probe_features_avg_importance
        ]

    def _more_tags(self):
        """Return the estimator tags, including the checks expected to fail."""
        tags_dict = _return_tags()
        tags_dict["variables"] = "numerical"
        tags_dict["requires_y"] = True
        # add additional test that fails
        tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA"
        tags_dict["_xfail_checks"][
            "check_parameters_default_constructible"
        ] = "transformer has 1 mandatory parameter"

        return tags_dict
This site uses cookies

feature_engine.selection.probe_feature_selection 源代码