feature_engine.imputation.drop_missing_data 源代码

# Authors: Pradumna Suryawanshi <pradumnasuryawanshi@gmail.com>
# License: BSD 3 clause

from typing import List, Optional, Union

import pandas as pd

from feature_engine._base_transformers.mixins import TransformXyMixin
from feature_engine._check_init_parameters.check_variables import (
    _check_variables_input_value,
)
from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
)
from feature_engine._docstrings.methods import _fit_transform_docstring
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X
from feature_engine.imputation.base_imputer import BaseImputer
from feature_engine.tags import _return_tags
from feature_engine.variable_handling import check_all_variables, find_all_variables


@Substitution(
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit_transform=_fit_transform_docstring,
)
class DropMissingData(BaseImputer, TransformXyMixin):
    """
    DropMissingData() deletes rows containing missing values. It provides
    similar functionality to `pandas.DataFrame.dropna()`, but within the `fit` and
    `transform` framework.

    It works for numerical and categorical variables. You can enter the list of
    variables for which missing values should be removed. Alternatively, the imputer
    will find and remove missing data in all dataframe variables.

    More details in the :ref:`User Guide <drop_missing_data>`.

    Parameters
    ----------
    variables: list, default=None
        The list of variables to consider for the imputation. If `None`, the imputer
        will check missing data in all variables in the dataframe. Alternatively, the
        imputer will evaluate missing data only in the variables in the list.

        Note that if `missing_only=True`, missing data will be removed from variables
        that had missing data in the train set. These might be a subset of the
        variables indicated in the list.

    missing_only: bool, default=True
        If `True`, rows will be dropped when they show missing data in variables that
        had missing data during `fit()`. If `False`, rows will be dropped if there is
        missing data in any of the variables. This parameter only works when
        `threshold=None`, otherwise it is ignored.

    threshold: int or float, default=None
        Require that percentage of non-NA values in a row to keep it. If
        `threshold=1`, all variables need to have data to keep the row. If
        `threshold=0.5`, 50% of the variables need to have data to keep the row.
        If `threshold=0.1`, 10% of the variables need to have data to keep the row.
        If `threshold=None`, rows with NA in any of the variables will be dropped.

    Attributes
    ----------
    variables_:
        The variables for which missing data will be examined to decide if a row is
        dropped. The attribute `variables_` is different from the parameter `variables`
        when the latter is `None`, or when only a subset of the indicated variables
        show NA in the train set if `missing_only=True`.

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    fit:
        Find the variables for which missing data should be evaluated.

    {fit_transform}

    return_na_data:
        Returns a dataframe with the rows that contain missing data.

    transform:
        Remove rows with missing data.

    transform_x_y:
        Remove rows with missing data from X and y.

    Examples
    --------

    >>> import pandas as pd
    >>> import numpy as np
    >>> from feature_engine.imputation import DropMissingData
    >>> X = pd.DataFrame(dict(
    ...        x1 = [np.nan,1,1,0,np.nan],
    ...        x2 = ["a", np.nan, "b", np.nan, "a"],
    ...        ))
    >>> dmd = DropMissingData()
    >>> dmd.fit(X)
    >>> dmd.transform(X)
        x1 x2
    2  1.0  b
    """

    def __init__(
        self,
        missing_only: bool = True,
        threshold: Union[None, int, float] = None,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
    ) -> None:

        if not isinstance(missing_only, bool):
            raise ValueError(
                "missing_only takes values True or False. "
                f"Got {missing_only} instead."
            )

        # A threshold, when given, must be a fraction in the interval (0, 1].
        if threshold is not None:
            if not isinstance(threshold, (int, float)) or not (0 < threshold <= 1):
                raise ValueError(
                    "threshold must be a value between 0 < x <= 1. "
                    f"Got {threshold} instead."
                )

        self.variables = _check_variables_input_value(variables)
        self.missing_only = missing_only
        self.threshold = threshold

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Find the variables for which missing data should be evaluated to decide if a
        row should be dropped.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training data set.

        y: pandas Series or dataframe, default=None
            y is not needed in this imputation. You can pass None or y.

        Returns
        -------
        self:
            The fitted transformer, with `variables_` set.
        """

        # check input dataframe
        X = check_X(X)

        # find the variables in which missing data will be examined
        if self.variables is None:
            self.variables_ = find_all_variables(X)
        else:
            self.variables_ = check_all_variables(X, self.variables)

        # If user passes a threshold, then missing_only is ignored. Otherwise,
        # retain only the variables that show NA in the train set.
        if self.threshold is None and self.missing_only is True:
            self.variables_ = [
                var for var in self.variables_ if X[var].isnull().any()
            ]

        self._get_feature_names_in(X)

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove rows with missing data.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The dataframe to be transformed.

        Returns
        -------
        X_new: pandas dataframe
            The complete case dataframe for the selected variables, of shape
            [n_samples - n_samples_with_na, n_features]
        """

        X = self._transform(X)

        if self.threshold:
            # Keep a row only if its count of non-NA values across `variables_`
            # reaches the requested fraction of those variables.
            return X.dropna(
                thresh=len(self.variables_) * self.threshold,
                subset=self.variables_,
                axis=0,
            )

        # No threshold: a single NA in any of `variables_` removes the row.
        return X.dropna(axis=0, how="any", subset=self.variables_)

    def return_na_data(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Returns the subset of the dataframe with the rows with missing values. That is,
        the subset of the dataframe that would be removed with the `transform()` method.
        This method may be useful in production, for example if we want to store or log
        the removed observations, that is, rows that will not be fed into the model.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The dataframe from which the rows with missing data are extracted.

        Returns
        -------
        X_na: pandas dataframe of shape = [n_samples_with_na, features]
            The subset of the dataframe with the rows with missing data.
        """

        X = self._transform(X)

        examined = X[self.variables_]

        if self.threshold:
            # Same criterion as `transform()`: a row is dropped when its count of
            # non-NA values falls below the required number.
            min_non_na = len(self.variables_) * self.threshold
            to_drop = examined.count(axis=1) < min_non_na
        else:
            to_drop = examined.isnull().any(axis=1)

        # Select positionally with a boolean array so that repeated index labels
        # do not duplicate rows.
        return X.loc[to_drop.to_numpy(), :]

    def _more_tags(self):
        # Start from the library's default tags and declare that this
        # transformer accepts NaN and operates on all variable types.
        tags_dict = _return_tags()
        tags_dict["allow_nan"] = True
        tags_dict["variables"] = "all"
        return tags_dict
This site uses cookies

feature_engine.imputation.drop_missing_data 源代码