# Source code for feature_engine.encoding.ordinal

# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

from typing import List, Optional, Union

import pandas as pd

from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
    _variables_attribute_docstring,
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
    _missing_values_docstring,
    _variables_categorical_docstring,
)
from feature_engine._docstrings.init_parameters.encoders import (
    _ignore_format_docstring,
    _unseen_docstring,
)
from feature_engine._docstrings.methods import (
    _fit_transform_docstring,
    _inverse_transform_docstring,
    _transform_encoders_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X, check_X_y
from feature_engine.encoding._helper_functions import check_parameter_unseen
from feature_engine.encoding.base_encoder import (
    CategoricalInitMixinNA,
    CategoricalMethodsMixin,
)

# Append the description of the 'encode' option to the imported `unseen`
# parameter docstring before it is substituted into the class docstring.
_unseen_docstring += " If `'encode'`, unseen categories will be encoded as -1."


@Substitution(
    missing_values=_missing_values_docstring,
    ignore_format=_ignore_format_docstring,
    variables=_variables_categorical_docstring,
    unseen=_unseen_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit_transform=_fit_transform_docstring,
    transform=_transform_encoders_docstring,
    inverse_transform=_inverse_transform_docstring,
)
class OrdinalEncoder(CategoricalInitMixinNA, CategoricalMethodsMixin):
    """
    The OrdinalEncoder() replaces categories by ordinal numbers
    (0, 1, 2, 3, etc). The numbers can be ordered based on the mean of the target
    per category, or assigned arbitrarily.

    The encoder will encode only categorical variables by default (type 'object' or
    'categorical'). You can pass a list of variables to encode. Alternatively, the
    encoder will find and encode all categorical variables (type 'object' or
    'categorical').

    With `ignore_format=True` you have the option to encode numerical variables as
    well. The procedure is identical, you can either enter the list of variables to
    encode, or the transformer will automatically select all variables.

    The encoder first maps the categories to the numbers for each variable (fit). The
    encoder then transforms the categories to the mapped numbers (transform).

    More details in the :ref:`User Guide <ordinal_encoder>`.

    Parameters
    ----------
    encoding_method: str, default='ordered'
        Desired method of encoding.

        **'ordered'**: the categories are numbered in ascending order according to
        the target mean value per category.

        **'arbitrary'**: categories are numbered arbitrarily.

    {variables}

    {missing_values}

    {ignore_format}

    {unseen}

    Attributes
    ----------
    encoder_dict_:
        Dictionary with the ordinal number per category, per variable.

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    fit:
        Find the integer to replace each category in each variable.

    {fit_transform}

    {inverse_transform}

    {transform}

    Notes
    -----
    NAN are introduced when encoding categories that were not present in the training
    dataset. If this happens, try grouping infrequent categories using the
    RareLabelEncoder().

    There is a similar implementation in the open-source package
    `Category encoders <https://contrib.scikit-learn.org/category_encoders/>`_

    See Also
    --------
    feature_engine.encoding.RareLabelEncoder
    category_encoders.ordinal.OrdinalEncoder

    References
    ----------
    Encoding into integers ordered following target mean was discussed in the
    following talk at PyData London 2017:

    .. [1] Galli S. "Machine Learning in Financial Risk Assessment".
        https://www.youtube.com/watch?v=KHGGlozsRtA

    Examples
    --------

    >>> import pandas as pd
    >>> from feature_engine.encoding import OrdinalEncoder
    >>> X = pd.DataFrame(dict(x1 = [1,2,3,4], x2 = ["c", "a", "b", "c"]))
    >>> y = pd.Series([0,1,1,0])
    >>> od = OrdinalEncoder(encoding_method='arbitrary')
    >>> od.fit(X)
    >>> od.transform(X)
       x1  x2
    0   1   0
    1   2   1
    2   3   2
    3   4   0

    You can also consider the order of the target variable:

    >>> y = pd.Series([1,0,1,1])
    >>> od = OrdinalEncoder(encoding_method='ordered')
    >>> od.fit(X, y)
    >>> od.transform(X)
       x1  x2
    0   1   2
    1   2   0
    2   3   1
    3   4   2
    """

    def __init__(
        self,
        encoding_method: str = "ordered",
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        missing_values: str = "raise",
        ignore_format: bool = False,
        unseen: str = "ignore",
    ) -> None:
        # Fail fast on an unsupported encoding method, before any state is set.
        if encoding_method not in ["ordered", "arbitrary"]:
            raise ValueError(
                "encoding_method takes only values 'ordered' and 'arbitrary'"
            )

        # This encoder accepts 'encode' in addition to 'ignore' and 'raise'.
        check_parameter_unseen(unseen, ["ignore", "raise", "encode"])
        super().__init__(variables, missing_values, ignore_format)
        self.encoding_method = encoding_method
        self.unseen = unseen

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """Learn the numbers to be used to replace the categories in each
        variable.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to be encoded.

        y: pandas series, default=None
            The Target. Can be None if `encoding_method='arbitrary'`.
            Otherwise, y needs to be passed when fitting the transformer.

        Returns
        -------
        self:
            The fitted encoder.

        Raises
        ------
        ValueError
            If `encoding_method` is neither 'ordered' nor 'arbitrary'.
        """
        # Only the 'ordered' method needs the target, so y is validated only then.
        if self.encoding_method == "ordered":
            X, y = check_X_y(X, y)
        else:
            X = check_X(X)

        variables_ = self._check_or_select_variables(X)
        self._check_na(X, variables_)

        self.encoder_dict_ = {}

        for var in variables_:
            if self.encoding_method == "ordered":
                # Categories sorted by ascending target mean.
                t = y.groupby(X[var], observed=False).mean()  # type: ignore
                t = t.sort_values(ascending=True).index

            elif self.encoding_method == "arbitrary":
                # Categories in the order returned by unique(); NaN is left out
                # of the mapping when missing values are to be ignored.
                if self.missing_values == "ignore":
                    t = X[var].dropna().unique()
                else:
                    t = X[var].unique()

            else:
                # encoding_method is a public attribute and can be reassigned
                # after construction, so it is checked again here.
                raise ValueError(
                    "Unrecognized value for encoding_method. It should be 'ordered' "
                    f"or 'arbitrary'. Got {self.encoding_method} instead."
                )

            # Map each category to its position (0, 1, 2, ...) in `t`.
            self.encoder_dict_[var] = {k: i for i, k in enumerate(t, 0)}

        if self.unseen == "encode":
            self._unseen = -1

        # assign underscore parameters at the end in case code above fails
        self.variables_ = variables_
        self._get_feature_names_in(X)
        return self