# Source code for feature_engine.outliers.trimmer

# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

import pandas as pd

from feature_engine._base_transformers.mixins import TransformXyMixin
from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _left_tail_caps_docstring,
    _n_features_in_docstring,
    _right_tail_caps_docstring,
    _variables_attribute_docstring,
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
    _missing_values_docstring,
    _variables_numerical_docstring,
)
from feature_engine._docstrings.init_parameters.outliers import (
    _capping_method_docstring,
    _fold_docstring,
    _tail_docstring,
)
from feature_engine._docstrings.methods import _fit_transform_docstring
from feature_engine._docstrings.substitute import Substitution
from feature_engine.outliers.base_outlier import WinsorizerBase


@Substitution(
    intro_docstring=WinsorizerBase._intro_docstring,
    capping_method=_capping_method_docstring,
    tail=_tail_docstring,
    fold=_fold_docstring,
    variables=_variables_numerical_docstring,
    missing_values=_missing_values_docstring,
    right_tail_caps_=_right_tail_caps_docstring,
    left_tail_caps_=_left_tail_caps_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit_transform=_fit_transform_docstring,
)
class OutlierTrimmer(WinsorizerBase, TransformXyMixin):
    """The OutlierTrimmer() removes observations with outliers from the dataset.

    The OutlierTrimmer() first calculates the maximum and/or minimum values
    beyond which a value will be considered an outlier, and thus removed.

    {intro_docstring}

    The OutlierTrimmer() works only with numerical variables. A list of variables can
    be indicated. Alternatively, it will select all numerical variables.

    The transformer first finds the values at one or both tails of the distributions
    (fit). The transformer then removes observations with outliers from the dataframe
    (transform).

    More details in the :ref:`User Guide <outlier_trimmer>`.

    Parameters
    ----------
    {capping_method}

    {tail}

    {fold}

    {variables}

    {missing_values}

    Attributes
    ----------
    {right_tail_caps_}

    {left_tail_caps_}

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    fold_:
        Factor multiplying the std, mad, iqr or alternative the percentile. Only
        different from `fold` when `fold="auto"`.

    Methods
    -------
    fit:
        Find maximum and minimum values.

    {fit_transform}

    transform:
        Remove outliers.

    transform_x_y:
        Remove rows with outliers from X set and y.

    References
    ----------
    .. [1] Rousseeuw, Croux. "Alternatives to the mean absolute deviation". Journal of
       the American Statistical Association, 1993. http://www.jstor.org/stable/2291267 .

    .. [2] Leys, et. al. "Do not use standard deviation around the mean, use absolute
       deviation around the median". Journal of Experimental Social Psychology, 2013.
       http://dx.doi.org/10.1016/j.jesp.2013.03.013.

    .. [3] Thériault, et. al. Check your outliers! An introduction to identifying
       statistical outliers in R with easystats. Behavior Research Methods, 2024.
       https://doi.org/10.3758/s13428-024-02356-w

    .. [4] Dixon. Simplified Estimation from Censored Normal Samples. The Annals of
       Mathematical Statistics, 1960. http://www.jstor.org/stable/2237953

    Examples
    --------

    With the gaussian method the extreme value inflates the standard deviation,
    so no observation falls below the left limit and nothing is removed:

    >>> import pandas as pd
    >>> from feature_engine.outliers import OutlierTrimmer
    >>> X = pd.DataFrame(dict(x = [0.49671,
    ...                         -0.1382,
    ...                          0.64768,
    ...                          1.52302,
    ...                         -0.2341,
    ...                         -17.2341,
    ...                          1.57921,
    ...                          0.76743,
    ...                         -0.4694,
    ...                          0.54256]))
    >>> ot = OutlierTrimmer(capping_method='gaussian', tail='left', fold=3)
    >>> ot.fit(X)
    >>> ot.transform(X)
              x
    0   0.49671
    1  -0.13820
    2   0.64768
    3   1.52302
    4  -0.23410
    5 -17.23410
    6   1.57921
    7   0.76743
    8  -0.46940
    9   0.54256

    With the mad method the same observation is detected and removed. Note that
    the remaining rows keep their original index labels:

    >>> import pandas as pd
    >>> from feature_engine.outliers import OutlierTrimmer
    >>> X = pd.DataFrame(dict(x = [0.49671,
    ...                         -0.1382,
    ...                          0.64768,
    ...                          1.52302,
    ...                         -0.2341,
    ...                         -17.2341,
    ...                          1.57921,
    ...                          0.76743,
    ...                         -0.4694,
    ...                          0.54256]))
    >>> ot = OutlierTrimmer(capping_method='mad', tail='left', fold=3)
    >>> ot.fit(X)
    >>> ot.transform(X)
             x
    0  0.49671
    1 -0.13820
    2  0.64768
    3  1.52302
    4 -0.23410
    6  1.57921
    7  0.76743
    8 -0.46940
    9  0.54256
    """

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove observations with outliers from the dataframe.

        A row is kept only if, for every variable in `right_tail_caps_`, its value
        is lower than or equal to the cap, and, for every variable in
        `left_tail_caps_`, its value is greater than or equal to the cap.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The data to be transformed.

        Returns
        -------
        X_new: pandas dataframe of shape = [n_retained_samples, n_features]
            The dataframe without outlier observations. The retained rows keep
            their original index labels; the index is not reset.
        """

        X = self._check_transform_input_and_state(X)

        # Accumulate one boolean mask over all variables and both tails, and
        # slice the dataframe a single time at the end instead of once per
        # variable. Row-wise conditions are independent, so the selected rows
        # are the same as with successive filtering.
        inliers = pd.Series(True, index=X.index)

        for feature, upper_cap in self.right_tail_caps_.items():
            # NOTE(review): a NaN value compares as False here, so if missing
            # values reach this point the row is dropped as well - confirm
            # this is the intended behaviour when NaN is not rejected upstream.
            inliers = inliers & X[feature].le(upper_cap)

        for feature, lower_cap in self.left_tail_caps_.items():
            inliers = inliers & X[feature].ge(lower_cap)

        return X.loc[inliers]
# This site uses cookies

# Source code for feature_engine.outliers.trimmer