feature_engine.selection.drop_duplicate_features 源代码

from collections import defaultdict
from typing import List, Union

import pandas as pd

from feature_engine._check_init_parameters.check_variables import (
    _check_variables_input_value,
)
from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
)
from feature_engine._docstrings.init_parameters.selection import (
    _confirm_variables_docstring,
)
from feature_engine._docstrings.methods import _fit_transform_docstring
from feature_engine._docstrings.selection._docstring import (
    _get_support_docstring,
    _missing_values_docstring,
    _variables_all_docstring,
    _variables_attribute_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import _check_contains_na, check_X
from feature_engine.selection.base_selector import BaseSelector
from feature_engine.tags import _return_tags

from .base_selection_functions import _select_all_variables

Variables = Union[None, int, str, List[Union[str, int]]]


[文档]@Substitution( confirm_variables=_confirm_variables_docstring, variables=_variables_all_docstring, missing_values=_missing_values_docstring, variables_=_variables_attribute_docstring, feature_names_in_=_feature_names_in_docstring, n_features_in_=_n_features_in_docstring, fit_transform=_fit_transform_docstring, get_support=_get_support_docstring, ) class DropDuplicateFeatures(BaseSelector): """ DropDuplicateFeatures() finds and removes duplicated features in a dataframe. Duplicated features are identical features, regardless of the variable or column name. If they show the same values for every observation, then they are considered duplicated. This transformer works with numerical and categorical variables. The user can indicate a list of variables to examine. Alternatively, the transformer will evaluate all the variables in the dataset. The transformer will first identify and store the duplicated variables. Next, the transformer will drop these variables from a dataframe. More details in the :ref:`User Guide <drop_duplicate>`. Parameters ---------- {variables} {missing_values} {confirm_variables} Attributes ---------- features_to_drop_: Set with the duplicated features that will be dropped. duplicated_feature_sets_: Groups of duplicated features. Each list is a group of duplicated features. {variables_} {feature_names_in_} {n_features_in_} Methods ------- fit: Find duplicated features. {fit_transform} {get_support} transform: Remove duplicated features. Examples -------- >>> import pandas as pd >>> from feature_engine.selection import DropDuplicateFeatures >>> X = pd.DataFrame(dict(x1 = [1,1,1,1], >>> x2 = [1,1,1,1], >>> x3 = [True, False, False, False])) >>> ddf = DropDuplicateFeatures() >>> ddf.fit_transform(X) x1 x3 0 1 True 1 1 False 2 1 False 3 1 False """ def __init__( self, variables: Variables = None, missing_values: str = "ignore", confirm_variables: bool = False, ): if missing_values not in ["raise", "ignore"]: raise ValueError("missing_values takes only values 'raise' or 'ignore'.") super().__init__(confirm_variables) self.variables = _check_variables_input_value(variables) self.missing_values = missing_values
[文档] def fit(self, X: pd.DataFrame, y: pd.Series = None): """ Find duplicated features. Parameters ---------- X: pandas dataframe of shape = [n_samples, n_features] The input dataframe. y: None y is not needed for this transformer. You can pass y or None. """ # check input dataframe X = check_X(X) self.variables_ = _select_all_variables( X, self.variables, self.confirm_variables ) # check that there are more than 1 variable to select from self._check_variable_number() if self.missing_values == "raise": # check if dataset contains na _check_contains_na(X, self.variables_) # collect duplicate features _features_hashmap = defaultdict(list) # hash the features _X_hash = pd.util.hash_pandas_object(X[self.variables_].T, index=False) # group the features by hash for feature, feature_hash in _X_hash.items(): _features_hashmap[feature_hash].append(feature) # create tuples of duplicated feature groups self.duplicated_feature_sets_ = [ set(duplicate) for duplicate in _features_hashmap.values() if len(duplicate) > 1 ] # set to collect features that are duplicated self.features_to_drop_ = { item for duplicates in _features_hashmap.values() for item in duplicates[1:] if duplicates and len(duplicates) > 1 } # save input features self._get_feature_names_in(X) return self
def _more_tags(self): tags_dict = _return_tags() tags_dict["allow_nan"] = True tags_dict["variables"] = "all" msg = "transformers need more than 1 feature to work" tags_dict["_xfail_checks"]["check_fit2d_1feature"] = msg return tags_dict