# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause
import pandas as pd
from feature_engine._base_transformers.mixins import TransformXyMixin
from feature_engine._docstrings.fit_attributes import (
_feature_names_in_docstring,
_left_tail_caps_docstring,
_n_features_in_docstring,
_right_tail_caps_docstring,
_variables_attribute_docstring,
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_missing_values_docstring,
_variables_numerical_docstring,
)
from feature_engine._docstrings.init_parameters.outliers import (
_capping_method_docstring,
_fold_docstring,
_tail_docstring,
)
from feature_engine._docstrings.methods import _fit_transform_docstring
from feature_engine._docstrings.substitute import Substitution
from feature_engine.outliers.base_outlier import WinsorizerBase
# The keyword names passed to Substitution match the {placeholder} fields in the
# class docstring below; presumably the decorator fills them in with the shared
# docstring fragments imported above. Do not add literal braces to the docstring.
@Substitution(
    intro_docstring=WinsorizerBase._intro_docstring,
    capping_method=_capping_method_docstring,
    tail=_tail_docstring,
    fold=_fold_docstring,
    variables=_variables_numerical_docstring,
    missing_values=_missing_values_docstring,
    right_tail_caps_=_right_tail_caps_docstring,
    left_tail_caps_=_left_tail_caps_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit_transform=_fit_transform_docstring,
)
class OutlierTrimmer(WinsorizerBase, TransformXyMixin):
    """The OutlierTrimmer() removes observations with outliers from the dataset.

    The OutlierTrimmer() first calculates the maximum and/or minimum values
    beyond which a value will be considered an outlier, and thus removed.

    {intro_docstring}

    The OutlierTrimmer() works only with numerical variables. A list of variables can
    be indicated. Alternatively, it will select all numerical variables.

    The transformer first finds the values at one or both tails of the distributions
    (fit). The transformer then removes observations with outliers from the dataframe
    (transform).

    More details in the :ref:`User Guide <outlier_trimmer>`.

    Parameters
    ----------
    {capping_method}

    {tail}

    {fold}

    {variables}

    {missing_values}

    Attributes
    ----------
    {right_tail_caps_}

    {left_tail_caps_}

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    fold_:
        Factor multiplying the std, mad, iqr or, alternatively, the percentile. Only
        different from `fold` when `fold="auto"`.

    Methods
    -------
    fit:
        Find maximum and minimum values.

    {fit_transform}

    transform:
        Remove outliers.

    transform_x_y:
        Remove rows with outliers from X set and y.

    References
    ----------
    .. [1] Rousseeuw, Croux. "Alternatives to the mean absolute deviation". Journal of
       the American Statistical Association, 1993. http://www.jstor.org/stable/2291267 .

    .. [2] Leys, et. al. "Do not use standard deviation around the mean, use absolute
       deviation around the median". Journal of Experimental Social Psychology, 2013.
       http://dx.doi.org/10.1016/j.jesp.2013.03.013.

    .. [3] Thériault, et. al. Check your outliers! An introduction to identifying
       statistical outliers in R with easystats. Behavior Research Methods, 2024.
       https://doi.org/10.3758/s13428-024-02356-w

    .. [4] Dixon. Simplified Estimation from Censored Normal Samples. The Annals of
       Mathematical Statistics, 1960. http://www.jstor.org/stable/2237953

    Examples
    --------

    >>> import pandas as pd
    >>> from feature_engine.outliers import OutlierTrimmer
    >>> X = pd.DataFrame(dict(x = [0.49671,
    ...                     -0.1382,
    ...                     0.64768,
    ...                     1.52302,
    ...                     -0.2341,
    ...                     -17.2341,
    ...                     1.57921,
    ...                     0.76743,
    ...                     -0.4694,
    ...                     0.54256]))
    >>> ot = OutlierTrimmer(capping_method='gaussian', tail='left', fold=3)
    >>> ot.fit(X)
    >>> ot.transform(X)
              x
    0   0.49671
    1  -0.13820
    2   0.64768
    3   1.52302
    4  -0.23410
    5 -17.23410
    6   1.57921
    7   0.76743
    8  -0.46940
    9   0.54256

    >>> import pandas as pd
    >>> from feature_engine.outliers import OutlierTrimmer
    >>> X = pd.DataFrame(dict(x = [0.49671,
    ...                     -0.1382,
    ...                     0.64768,
    ...                     1.52302,
    ...                     -0.2341,
    ...                     -17.2341,
    ...                     1.57921,
    ...                     0.76743,
    ...                     -0.4694,
    ...                     0.54256]))
    >>> ot = OutlierTrimmer(capping_method='mad', tail='left', fold=3)
    >>> ot.fit(X)
    >>> ot.transform(X)
             x
    0  0.49671
    1 -0.13820
    2  0.64768
    3  1.52302
    4 -0.23410
    6  1.57921
    7  0.76743
    8 -0.46940
    9  0.54256
    """