feature_engine.imputation.mean_median 源代码
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause
from typing import List, Optional, Union
import pandas as pd
from feature_engine._check_init_parameters.check_variables import (
_check_variables_input_value,
)
from feature_engine._docstrings.fit_attributes import (
_feature_names_in_docstring,
_imputer_dict_docstring,
_n_features_in_docstring,
_variables_attribute_docstring,
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_variables_numerical_docstring,
)
from feature_engine._docstrings.methods import (
_fit_transform_docstring,
_transform_imputers_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X
from feature_engine.imputation.base_imputer import BaseImputer
from feature_engine.variable_handling import (
check_numerical_variables,
find_numerical_variables,
)
@Substitution(
    variables=_variables_numerical_docstring,
    imputer_dict_=_imputer_dict_docstring,
    variables_=_variables_attribute_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    transform=_transform_imputers_docstring,
    fit_transform=_fit_transform_docstring,
)
class MeanMedianImputer(BaseImputer):
    """
    The MeanMedianImputer() replaces missing data by the mean or median value of the
    variable. It works only with numerical variables.

    You can pass a list of variables to impute. Alternatively, the
    MeanMedianImputer() will automatically select all variables of type numeric in the
    training set.

    More details in the :ref:`User Guide <mean_median_imputer>`.

    Parameters
    ----------
    imputation_method: str, default='median'
        Desired method of imputation. Can take 'mean' or 'median'.

    {variables}

    Attributes
    ----------
    {imputer_dict_}

    {variables_}

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    fit:
        Learn the mean or median values.

    {fit_transform}

    {transform}

    Examples
    --------

    >>> import pandas as pd
    >>> import numpy as np
    >>> from feature_engine.imputation import MeanMedianImputer
    >>> X = pd.DataFrame(dict(
    ...     x1 = [np.nan,1,1,0,np.nan],
    ...     x2 = ["a", np.nan, "b", np.nan, "a"],
    ... ))
    >>> mmi = MeanMedianImputer(imputation_method='median')
    >>> mmi.fit(X)
    >>> mmi.transform(X)
        x1   x2
    0  1.0    a
    1  1.0  NaN
    2  1.0    b
    3  0.0  NaN
    4  1.0    a
    """

    # NOTE: the class docstring above is passed through the Substitution
    # decorator, which fills the brace-delimited placeholders. Keep literal
    # braces out of it (the example uses dict(...) for that reason).

    def __init__(
        self,
        imputation_method: str = "median",
        variables: Union[None, int, str, List[Union[str, int]]] = None,
    ) -> None:
        # Reject unsupported methods early so a misconfiguration is reported
        # at construction time.
        if imputation_method not in ["median", "mean"]:
            raise ValueError("imputation_method takes only values 'median' or 'mean'")

        self.imputation_method = imputation_method
        self.variables = _check_variables_input_value(variables)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the mean or median values.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas series or None, default=None
            y is not needed in this imputation. You can pass None or y.

        Returns
        -------
        self:
            The fitted imputer, with `variables_` and `imputer_dict_` set.

        Raises
        ------
        ValueError
            If `imputation_method` is neither 'mean' nor 'median' at fit time,
            for example because the attribute was modified after
            initialisation.
        """
        # check input dataframe
        X = check_X(X)

        # find or check for numerical variables
        if self.variables is None:
            self.variables_ = find_numerical_variables(X)
        else:
            self.variables_ = check_numerical_variables(X, self.variables)

        # find imputation parameters: mean or median
        if self.imputation_method == "mean":
            self.imputer_dict_ = X[self.variables_].mean().to_dict()
        elif self.imputation_method == "median":
            self.imputer_dict_ = X[self.variables_].median().to_dict()
        else:
            # The attribute can be reassigned after __init__, which bypasses
            # the constructor check. Fail here instead of returning an imputer
            # that has no imputer_dict_.
            raise ValueError(
                "imputation_method takes only values 'median' or 'mean'. "
                f"Got {self.imputation_method!r} instead."
            )

        # Helper not defined in this class (presumably inherited from
        # BaseImputer); it is expected to record the input feature names and
        # count -- confirm against the base class.
        self._get_feature_names_in(X)

        return self