# Source code for feature_engine.transformation.yeojohnson
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause
from typing import List, Optional, Union
import numpy as np
import pandas as pd
import scipy.stats as stats
from feature_engine._base_transformers.base_numerical import BaseNumericalTransformer
from feature_engine._check_init_parameters.check_variables import (
_check_variables_input_value,
)
from feature_engine._docstrings.fit_attributes import (
_feature_names_in_docstring,
_n_features_in_docstring,
_variables_attribute_docstring,
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_variables_numerical_docstring,
)
from feature_engine._docstrings.methods import (
_fit_transform_docstring,
_inverse_transform_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.tags import _return_tags
@Substitution(
    variables=_variables_numerical_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit_transform=_fit_transform_docstring,
    inverse_transform=_inverse_transform_docstring,
)
class YeoJohnsonTransformer(BaseNumericalTransformer):
    """
    The YeoJohnsonTransformer() applies the Yeo-Johnson transformation to the
    numerical variables.

    The Yeo-Johnson transformation implemented by this transformer is that of
    SciPy.stats:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.yeojohnson.html

    The YeoJohnsonTransformer() works only with numerical variables.

    A list of variables can be passed as an argument. Alternatively, the
    transformer will automatically select and transform all numerical
    variables.

    More details in the :ref:`User Guide <yeojohnson>`.

    Parameters
    ----------
    {variables}

    Attributes
    ----------
    lambda_dict_
        Dictionary containing the best lambda for the Yeo-Johnson per variable.

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    fit:
        Learn the optimal lambda for the Yeo-Johnson transformation.

    {fit_transform}

    {inverse_transform}

    transform:
        Apply the Yeo-Johnson transformation.

    References
    ----------
    .. [1] Yeo, In-Kwon and Johnson, Richard (2000).
        A new family of power transformations to improve normality or symmetry.
        Biometrika, 87, 954-959.

    .. [2] Weisberg S. "Yeo-Johnson Power Transformations".
        https://www.stat.umn.edu/arc/yjpower.pdf

    Examples
    --------

    >>> import numpy as np
    >>> import pandas as pd
    >>> from feature_engine.transformation import YeoJohnsonTransformer
    >>> np.random.seed(42)
    >>> X = pd.DataFrame(dict(x = np.random.lognormal(size = 100) - 10))
    >>> yjt = YeoJohnsonTransformer()
    >>> yjt.fit(X)
    >>> X = yjt.transform(X)
    >>> X.head()
                   x
    0 -267042.906453
    1 -444357.138990
    2 -221626.115742
    3  -23647.632651
    4 -467264.993249
    """

    def __init__(
        self, variables: Union[None, int, str, List[Union[str, int]]] = None
    ) -> None:
        self.variables = _check_variables_input_value(variables)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the optimal lambda for the Yeo-Johnson transformation.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, default=None
            It is not needed in this transformer. You can pass y or None.

        Returns
        -------
        self
            The fitted transformer, with ``lambda_dict_`` populated.
        """
        # check input dataframe
        X = super().fit(X)

        self.lambda_dict_ = {}

        for var in self.variables_:
            # Without ``lmbda``, scipy returns (transformed data, fitted lambda);
            # only the fitted lambda is stored.
            _, self.lambda_dict_[var] = stats.yeojohnson(X[var])

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Apply the Yeo-Johnson transformation.

        Parameters
        ----------
        X: Pandas DataFrame of shape = [n_samples, n_features]
            The data to be transformed.

        Returns
        -------
        X: pandas dataframe
            The dataframe with the transformed variables.
        """
        # check input dataframe and if class was fitted
        X = self._check_transform_input_and_state(X)

        for feature in self.variables_:
            # Reuse the lambda learned in fit() instead of re-estimating it.
            X[feature] = stats.yeojohnson(X[feature], lmbda=self.lambda_dict_[feature])

        return X

    def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Convert the data back to the original representation.

        Parameters
        ----------
        X: Pandas DataFrame of shape = [n_samples, n_features]
            The data to be transformed.

        Returns
        -------
        X_tr: pandas dataframe
            The dataframe with the variables mapped back through the inverse
            Yeo-Johnson transformation.
        """
        # check input dataframe and if class was fitted
        X = self._check_transform_input_and_state(X)

        for feature in self.variables_:
            X[feature] = self._inverse_transform_series(
                X[feature], lmbda=self.lambda_dict_[feature]
            )

        return X

    def _inverse_transform_series(self, X: pd.Series, lmbda: float) -> pd.Series:
        """
        Invert the Yeo-Johnson transformation for a single variable.

        Parameters
        ----------
        X: pandas Series
            Transformed values of one variable.

        lmbda: float
            The lambda that was used in the forward transformation.

        Returns
        -------
        x_inv: pandas Series
            Float Series with the same index as ``X`` holding the inverted values.
        """
        # Work on a float array: an integer-typed input must not dictate the
        # dtype of the output buffer, and positional masking avoids any
        # index-alignment issues (e.g. duplicated index labels).
        values = X.to_numpy(dtype=float)
        inverted = np.zeros_like(values)

        non_negative = values >= 0
        negative = ~non_negative

        # when x >= 0
        if lmbda == 0:
            # expm1 is the exact inverse of the log1p used in the forward
            # transform and is accurate for values close to zero.
            inverted[non_negative] = np.expm1(values[non_negative])
        else:  # lmbda != 0
            inverted[non_negative] = (
                np.power(values[non_negative] * lmbda + 1, 1 / lmbda) - 1
            )

        # when x < 0
        if lmbda != 2:
            inverted[negative] = 1 - np.power(
                -(2 - lmbda) * values[negative] + 1, 1 / (2 - lmbda)
            )
        else:  # lmbda == 2
            inverted[negative] = -np.expm1(-values[negative])

        return pd.Series(inverted, index=X.index)

    def _more_tags(self):
        """Return the estimator tags, marking the variables as numerical and
        registering the estimator check that is expected to fail."""
        tags_dict = _return_tags()
        tags_dict["variables"] = "numerical"
        # This check is expected to fail because of the data created and used
        # by the check, not because of the check logic itself.
        msg = (
            "Transformer raises error when it can't find the optimal lambda for "
            "the transformation, thus this check fails."
        )
        tags_dict["_xfail_checks"]["check_fit2d_1sample"] = msg
        return tags_dict