feature_engine.preprocessing.match_categories 源代码

import warnings
from typing import List, Optional, Union

import pandas as pd

from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin
from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
    _variables_attribute_docstring,
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
    _missing_values_docstring,
    _variables_categorical_docstring,
)
from feature_engine._docstrings.init_parameters.encoders import _ignore_format_docstring
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import _check_optional_contains_na, check_X
from feature_engine.encoding.base_encoder import (
    CategoricalInitMixinNA,
    CategoricalMethodsMixin,
)


@Substitution(
    ignore_format=_ignore_format_docstring,
    missing_values=_missing_values_docstring,
    variables=_variables_categorical_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
)
class MatchCategories(
    CategoricalInitMixinNA, CategoricalMethodsMixin, GetFeatureNamesOutMixin
):
    """
    MatchCategories() ensures that categorical variables are encoded as pandas
    `'categorical'` dtype, instead of generic python `'object'` or other dtypes.

    Under the hood, `'categorical'` dtype is a representation that maps each
    category to an integer, thus providing a more memory-efficient object
    structure than, e.g., 'str', and allowing faster grouping, mapping, and similar
    operations on the resulting object.

    MatchCategories() remembers the encodings or levels that represent each
    category, and can thus be used to ensure that the correct encoding gets
    applied when passing categorical data to modeling packages that support this
    dtype, or to prevent unseen categories from reaching a further transformer
    or estimator in a pipeline, for example.

    More details in the :ref:`User Guide <match_categories>`.

    Parameters
    ----------
    {variables}

    {ignore_format}

    {missing_values}

    Attributes
    ----------
    category_dict_:
        Dictionary with the category encodings assigned to each variable.

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    fit:
        Learn the encodings or levels to use for each variable.

    fit_transform:
        Fit to the data. Then transform it.

    get_feature_names_out:
        Get output feature names for transformation.

    get_params:
        Get parameters for this estimator.

    set_params:
        Set the parameters of this estimator.

    transform:
        Enforce the type of categorical variables as dtype `categorical`.

    Examples
    --------

    >>> import pandas as pd
    >>> from feature_engine.preprocessing import MatchCategories
    >>> X_train = pd.DataFrame(dict(x1 = ["a","b","c"], x2 = [4,5,6]))
    >>> X_test = pd.DataFrame(dict(x1 = ["c","b","a","d"], x2 = [5,6,4,7]))
    >>> mc = MatchCategories(missing_values="ignore")
    >>> mc.fit(X_train)
    >>> mc.transform(X_train)
      x1  x2
    0  a   4
    1  b   5
    2  c   6
    >>> mc.transform(X_test)
        x1  x2
    0    c   5
    1    b   6
    2    a   4
    3  NaN   7
    """

    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        ignore_format: bool = False,
        missing_values: str = "raise",
    ) -> None:

        # Note the argument order expected by the parent initialiser:
        # missing_values comes before ignore_format.
        super().__init__(variables, missing_values, ignore_format)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the encodings or levels to use for representing categorical variables.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset. Can be the entire dataframe, not just the
            variables to be transformed.

        y: pandas Series, default = None
            y is not needed in this encoder. You can pass y or None.

        Returns
        -------
        self:
            The fitted transformer.
        """
        X = check_X(X)
        variables_ = self._check_or_select_variables(X)

        if self.missing_values == "raise":
            _check_optional_contains_na(X, variables_)

        # Store, per variable, the category levels pandas infers from the
        # training data; transform() re-applies exactly these levels.
        self.category_dict_ = {
            var: pd.Categorical(X[var]).categories for var in variables_
        }

        self.variables_ = variables_
        self._get_feature_names_in(X)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Encode categorical variables as pandas categorical dtype.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features].
            The dataset to encode.

        Returns
        -------
        X_new: pandas dataframe of shape = [n_samples, n_features].
            The dataframe with the variables encoded as pandas categorical dtype.
        """
        X = self._check_transform_input_and_state(X)

        if self.missing_values == "raise":
            _check_optional_contains_na(X, self.variables_)

        # Values that are not among the levels learned in fit() become NaN
        # when cast with an explicit list of categories.
        for feature, levels in self.category_dict_.items():
            X[feature] = pd.Categorical(X[feature], levels)

        self._check_nas_in_result(X)
        return X

    def _check_nas_in_result(self, X: pd.DataFrame) -> None:
        """
        Warn or raise if any encoded variable contains NaN after the encoding.

        Parameters
        ----------
        X: pandas dataframe
            The dataframe after the variables were cast to categorical dtype.

        Raises
        ------
        ValueError
            If NaN values are found in the encoded variables and
            `missing_values` is `'raise'`.

        Warns
        -----
        UserWarning
            If NaN values are found in the encoded variables and
            `missing_values` is `'ignore'`.
        """
        # Compute the per-column null flags once and reuse them.
        encoded = X[list(self.category_dict_)]
        has_nan = encoded.isnull().any()

        if not has_nan.any():
            return

        # obtain the name(s) of the columns that have null values
        nan_columns = encoded.columns[has_nan].tolist()

        # Column names may be integers, so convert them before joining;
        # str.join raises TypeError on non-string items.
        nan_columns_str = ", ".join(str(column) for column in nan_columns)

        if self.missing_values == "ignore":
            warnings.warn(
                "During the encoding, NaN values were introduced in the feature(s) "
                f"{nan_columns_str}."
            )
        elif self.missing_values == "raise":
            raise ValueError(
                "During the encoding, NaN values were introduced in the feature(s) "
                f"{nan_columns_str}."
            )
This site uses cookies

feature_engine.preprocessing.match_categories 源代码