# Source code for feature_engine.selection.drop_duplicate_features

from collections import defaultdict
from typing import List, Union

import pandas as pd

from feature_engine._check_init_parameters.check_variables import (
    _check_variables_input_value,
)
from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
)
from feature_engine._docstrings.init_parameters.selection import (
    _confirm_variables_docstring,
)
from feature_engine._docstrings.methods import _fit_transform_docstring
from feature_engine._docstrings.selection._docstring import (
    _get_support_docstring,
    _missing_values_docstring,
    _variables_all_docstring,
    _variables_attribute_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import _check_contains_na, check_X
from feature_engine.selection.base_selector import BaseSelector
from feature_engine.tags import _return_tags

from .base_selection_functions import _select_all_variables

# Accepted types for the ``variables`` init argument: None, a single int or
# str, or a list whose items are str or int.
Variables = Union[None, int, str, List[Union[str, int]]]


@Substitution(
    confirm_variables=_confirm_variables_docstring,
    variables=_variables_all_docstring,
    missing_values=_missing_values_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit_transform=_fit_transform_docstring,
    get_support=_get_support_docstring,
)
class DropDuplicateFeatures(BaseSelector):
    """
    DropDuplicateFeatures() finds and removes duplicated features in a dataframe.

    Duplicated features are identical features, regardless of the variable or column
    name. If they show the same values for every observation, then they are considered
    duplicated.

    This transformer works with numerical and categorical variables. The user can
    indicate a list of variables to examine. Alternatively, the transformer will
    evaluate all the variables in the dataset.

    The transformer will first identify and store the duplicated variables. Next, the
    transformer will drop these variables from a dataframe.

    More details in the :ref:`User Guide <drop_duplicate>`.

    Parameters
    ----------
    {variables}

    {missing_values}

    {confirm_variables}

    Attributes
    ----------
    features_to_drop_:
        Set with the duplicated features that will be dropped.

    duplicated_feature_sets_:
        Groups of duplicated features. Each set is a group of duplicated features.

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    fit:
        Find duplicated features.

    {fit_transform}

    {get_support}

    transform:
        Remove duplicated features.

    Examples
    --------

    >>> import pandas as pd
    >>> from feature_engine.selection import DropDuplicateFeatures
    >>> X = pd.DataFrame(dict(x1 = [1,1,1,1],
    ...                     x2 = [1,1,1,1],
    ...                     x3 = [True, False, False, False]))
    >>> ddf = DropDuplicateFeatures()
    >>> ddf.fit_transform(X)
        x1     x3
    0   1   True
    1   1  False
    2   1  False
    3   1  False
    """

    def __init__(
        self,
        variables: Variables = None,
        missing_values: str = "ignore",
        confirm_variables: bool = False,
    ):
        # Reject unsupported options up front, before any state is stored.
        if missing_values not in ["raise", "ignore"]:
            raise ValueError("missing_values takes only values 'raise' or 'ignore'.")

        super().__init__(confirm_variables)

        self.variables = _check_variables_input_value(variables)
        self.missing_values = missing_values

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find duplicated features.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The input dataframe.
        y: None
            y is not needed for this transformer. You can pass y or None.

        Returns
        -------
        self:
            The fitted transformer, with `duplicated_feature_sets_` and
            `features_to_drop_` populated.
        """

        # check input dataframe
        X = check_X(X)

        self.variables_ = _select_all_variables(
            X, self.variables, self.confirm_variables
        )

        # check that there are more than 1 variable to select from
        self._check_variable_number()

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)

        # hash the features: after transposing, every row is one feature, so
        # the result holds one hash per feature, indexed by feature name
        _X_hash = pd.util.hash_pandas_object(X[self.variables_].T, index=False)

        # group the features by hash, preserving the order of self.variables_
        # NOTE(review): grouping relies on hash equality alone; no element-wise
        # comparison of the features is performed afterwards.
        _features_hashmap = defaultdict(list)
        for feature, feature_hash in _X_hash.items():
            _features_hashmap[feature_hash].append(feature)

        # only groups holding more than one feature contain duplicates
        _duplicated_groups = [
            group for group in _features_hashmap.values() if len(group) > 1
        ]

        # sets of duplicated feature groups
        self.duplicated_feature_sets_ = [set(group) for group in _duplicated_groups]

        # the first feature of each group is retained; the rest are dropped
        self.features_to_drop_ = {
            feature for group in _duplicated_groups for feature in group[1:]
        }

        # save input features
        self._get_feature_names_in(X)

        return self

    def _more_tags(self):
        """
        Return the estimator tags for this transformer.

        Starts from the dictionary returned by `_return_tags()`, sets
        `allow_nan` to True and `variables` to "all", and registers
        `check_fit2d_1feature` under `_xfail_checks` with an explanatory message.
        """
        tags_dict = _return_tags()
        tags_dict["allow_nan"] = True
        tags_dict["variables"] = "all"

        msg = "transformers need more than 1 feature to work"
        tags_dict["_xfail_checks"]["check_fit2d_1feature"] = msg

        return tags_dict
# This site uses cookies

# Source code for feature_engine.selection.drop_duplicate_features