# Source code for feature_engine.encoding.mean_encoding

# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause
from typing import List, Union

import pandas as pd

from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
    _variables_attribute_docstring,
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
    _missing_values_docstring,
    _variables_categorical_docstring,
)
from feature_engine._docstrings.init_parameters.encoders import (
    _ignore_format_docstring,
    _unseen_docstring,
)
from feature_engine._docstrings.methods import (
    _fit_transform_docstring,
    _inverse_transform_docstring,
    _transform_encoders_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X_y
from feature_engine.encoding._helper_functions import check_parameter_unseen
from feature_engine.encoding.base_encoder import (
    CategoricalInitMixinNA,
    CategoricalMethodsMixin,
)

# Extend the shared `unseen` parameter description with the 'encode' option
# before it is substituted into the MeanEncoder class docstring below.
_unseen_docstring = (
    _unseen_docstring
    + """ If `'encode'`, unseen categories will be encoded with the prior."""
)


@Substitution(
    missing_values=_missing_values_docstring,
    ignore_format=_ignore_format_docstring,
    variables=_variables_categorical_docstring,
    unseen=_unseen_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit_transform=_fit_transform_docstring,
    transform=_transform_encoders_docstring,
    inverse_transform=_inverse_transform_docstring,
)
class MeanEncoder(CategoricalInitMixinNA, CategoricalMethodsMixin):
    """
    The MeanEncoder() replaces categories by the mean value of the target for each
    category.

    For example in the variable colour, if the mean of the target for blue, red
    and grey is 0.5, 0.8 and 0.1 respectively, blue is replaced by 0.5, red by 0.8
    and grey by 0.1.

    For rare categories, i.e., those with few observations, the mean target value
    might be less reliable. To mitigate poor estimates returned for rare categories,
    the mean target value can be determined as a mixture of the target mean value
    for the entire data set (also called the prior) and the mean target value for
    the category (the posterior), weighted by the number of observations:

    .. math::

        mapping = (w_i) posterior + (1-w_i) prior

    where the weight is calculated as:

    .. math::

        w_i = n_i t / (s + n_i t)

    In the previous equation, t is the target variance in the entire dataset, s is
    the target variance within the category and n is the number of observations for
    the category.

    The encoder will encode only categorical variables by default (type 'object' or
    'categorical'). You can pass a list of variables to encode. Alternatively, the
    encoder will find and encode all categorical variables (type 'object' or
    'categorical').

    With `ignore_format=True` you have the option to encode numerical variables as
    well. The procedure is identical, you can either enter the list of variables to
    encode, or the transformer will automatically select all variables.

    The encoder first maps the categories to the numbers for each variable (fit).
    The encoder then replaces the categories with those numbers (transform).

    More details in the :ref:`User Guide <mean_encoder>`.

    Parameters
    ----------
    {variables}

    {missing_values}

    {ignore_format}

    {unseen}

    smoothing: int, float, str, default=0.0
        Smoothing factor. Should be >= 0. If 0 then no smoothing is applied, and the
        mean target value per category is returned without modification. If 'auto'
        then wi is calculated as described above and the category is encoded as the
        blended values of the prior and the posterior. If int or float, then the wi
        is calculated as ni / (ni+smoothing). Higher values lead to stronger
        smoothing (higher weight of prior).

    Attributes
    ----------
    encoder_dict_:
        Dictionary with the target mean value per category per variable.

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    fit:
        Learn the target mean value per category, per variable.

    {fit_transform}

    {inverse_transform}

    {transform}

    Notes
    -----
    NAN are introduced when encoding categories that were not present in the
    training dataset. If this happens, try grouping infrequent categories using the
    RareLabelEncoder().

    Check also the related transformers in the open-source package
    `Category encoders <https://contrib.scikit-learn.org/category_encoders/>`_

    See Also
    --------
    feature_engine.encoding.RareLabelEncoder
    category_encoders.target_encoder.TargetEncoder
    category_encoders.m_estimate.MEstimateEncoder

    References
    ----------
    .. [1] Micci-Barreca D. "A Preprocessing Scheme for High-Cardinality Categorical
       Attributes in Classification and Prediction Problems". ACM SIGKDD Explorations
       Newsletter, 2001. https://dl.acm.org/citation.cfm?id=507538

    Examples
    --------

    >>> import pandas as pd
    >>> from feature_engine.encoding import MeanEncoder
    >>> X = pd.DataFrame(dict(x1 = [1,2,3,4,5], x2 = ["c", "c", "c", "b", "a"]))
    >>> y = pd.Series([0,1,1,1,0])
    >>> me = MeanEncoder()
    >>> me.fit(X,y)
    >>> me.transform(X)
       x1        x2
    0   1  0.666667
    1   2  0.666667
    2   3  0.666667
    3   4  1.000000
    4   5  0.000000
    """

    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        missing_values: str = "raise",
        ignore_format: bool = False,
        unseen: str = "ignore",
        smoothing: Union[int, float, str] = 0.0,
    ) -> None:
        super().__init__(variables, missing_values, ignore_format)

        # Accepted values: the literal string 'auto', or a number that is not
        # negative. Spelled out as explicit branches so the rule does not rely
        # on `and` / `or` operator precedence.
        if isinstance(smoothing, str):
            smoothing_is_valid = smoothing == "auto"
        elif isinstance(smoothing, (int, float)):
            smoothing_is_valid = not smoothing < 0
        else:
            smoothing_is_valid = False

        if not smoothing_is_valid:
            raise ValueError(
                f"smoothing must be greater than 0 or 'auto'. "
                f"Got {smoothing} instead."
            )
        self.smoothing = smoothing

        check_parameter_unseen(unseen, ["ignore", "raise", "encode"])
        self.unseen = unseen

    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Learn the mean value of the target for each category of the variable.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to be encoded.

        y: pandas series
            The target.

        Returns
        -------
        self:
            The fitted encoder.
        """
        X, y = check_X_y(X, y)
        variables_ = self._check_or_select_variables(X)
        self._check_na(X, variables_)

        # The prior: mean of the target over the whole training set.
        y_prior = y.mean()

        auto_smoothing = self.smoothing == "auto"
        if auto_smoothing:
            # Target variance over the whole training set (population variance).
            y_var = y.var(ddof=0)

        encoder_dict = {}
        for var in variables_:
            if not auto_smoothing:
                damping = self.smoothing
            elif y_var == 0:
                # Constant target: the variance ratio below would be 0 / 0 and
                # turn every encoding into NaN. Every category mean already
                # equals the prior, so no damping is needed.
                damping = 0
            else:
                # Within-category variance relative to the global variance.
                # `observed=False` matches the group-by used for the mean below.
                damping = y.groupby(X[var], observed=False).var(ddof=0) / y_var

            counts = X[var].value_counts()
            counts.index = counts.index.infer_objects()

            # Weight of the category mean (posterior); 1 - weight goes to the
            # prior.
            _lambda = counts / (counts + damping)
            posterior = y.groupby(X[var], observed=False).mean()
            encoder_dict[var] = (
                _lambda * posterior + (1.0 - _lambda) * y_prior
            ).to_dict()

        # assign underscore parameters at the end in case code above fails
        self.encoder_dict_ = encoder_dict
        if self.unseen == "encode":
            # Value used for unseen categories: the prior.
            self._unseen = y_prior
        self.variables_ = variables_
        self._get_feature_names_in(X)
        return self

    def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Convert the encoded variable back to the original values.

        Note that if unseen was set to 'encode', then this method is not
        implemented.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features].
            The transformed dataframe.

        Returns
        -------
        X_tr: pandas dataframe of shape = [n_samples, n_features].
            The un-transformed dataframe, with the categorical variables containing
            the original values.

        Raises
        ------
        NotImplementedError
            If the encoder was initialised with `unseen='encode'`.
        """
        if self.unseen == "encode":
            raise NotImplementedError(
                "inverse_transform is not implemented for this transformer when "
                "`unseen='encode'`."
            )
        return super().inverse_transform(X)

    def _more_tags(self):
        """Return the parent estimator tags with `requires_y` set to True."""
        tags_dict = super()._more_tags()
        tags_dict["requires_y"] = True
        return tags_dict