# Source code for feature_engine.variable_handling.check_variables

"""Functions to check that the variables in a list are of a certain type."""

from typing import List, Union

import pandas as pd
from pandas.api.types import is_numeric_dtype as is_numeric

from feature_engine.variable_handling._variable_type_checks import (
    _is_categorical_and_is_datetime,
)
from feature_engine.variable_handling.dtypes import DATETIME_TYPES

# Accepted forms for a variable selection: a single column label (int or str)
# or a list of such labels.
Variables = Union[int, str, List[Union[str, int]]]


def check_numerical_variables(
    X: pd.DataFrame, variables: Variables
) -> List[Union[str, int]]:
    """
    Check that the variables in the list are of type numerical.

    More details in the :ref:`User Guide <check_num_vars>`.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The dataset.

    variables : int, str or list
        The name of the variable, or the list with the names of the
        variables, to check.

    Returns
    -------
    variables: List
        The names of the numerical variables. A single name passed as int or
        str is returned wrapped in a list.

    Raises
    ------
    TypeError
        If at least one of the variables is not numerical.

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.variable_handling import check_numerical_variables
    >>> X = pd.DataFrame({
    ...     "var_num": [1, 2, 3],
    ...     "var_cat": ["A", "B", "C"],
    ...     "var_date": pd.date_range("2020-02-24", periods=3, freq="min")
    ... })
    >>> var_ = check_numerical_variables(X, variables=["var_num"])
    >>> var_
    ['var_num']
    """
    if isinstance(variables, (str, int)):
        variables = [variables]

    # Whatever is left after dropping the numeric dtypes from the selection
    # is, by construction, a non-numerical column.
    non_numerical = X[variables].select_dtypes(exclude="number")
    if not non_numerical.columns.empty:
        raise TypeError(
            "Some of the variables are not numerical. Please cast them as "
            "numerical before using this transformer."
        )

    return variables
def check_categorical_variables(
    X: pd.DataFrame, variables: Variables
) -> List[Union[str, int]]:
    """
    Check that the variables in the list are of type object or categorical.

    More details in the :ref:`User Guide <check_cat_vars>`.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The dataset.

    variables : int, str or list
        The name of the variable, or the list with the names of the
        variables, to check.

    Returns
    -------
    variables: List
        The names of the categorical variables. A single name passed as int
        or str is returned wrapped in a list.

    Raises
    ------
    TypeError
        If at least one of the variables is neither of type object nor
        categorical.

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.variable_handling import check_categorical_variables
    >>> X = pd.DataFrame({
    ...     "var_num": [1, 2, 3],
    ...     "var_cat": ["A", "B", "C"],
    ...     "var_date": pd.date_range("2020-02-24", periods=3, freq="min")
    ... })
    >>> var_ = check_categorical_variables(X, "var_cat")
    >>> var_
    ['var_cat']
    """
    if isinstance(variables, (str, int)):
        variables = [variables]

    # Whatever is left after dropping object and category dtypes from the
    # selection is a column that does not qualify as categorical.
    non_categorical = X[variables].select_dtypes(exclude=["O", "category"])
    if not non_categorical.columns.empty:
        raise TypeError(
            "Some of the variables are not categorical. Please cast them as "
            "object or categorical before using this transformer."
        )

    return variables
def check_datetime_variables(
    X: pd.DataFrame,
    variables: Variables,
) -> List[Union[str, int]]:
    """
    Check that the variables in the list are, or can be parsed as, datetime.

    More details in the :ref:`User Guide <check_datetime_vars>`.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The dataset.

    variables : int, str or list
        The name of the variable, or the list with the names of the
        variables, to check.

    Returns
    -------
    variables: List
        The names of the datetime variables. A single name passed as int or
        str is returned wrapped in a list.

    Raises
    ------
    TypeError
        If at least one of the variables is not, or cannot be parsed as,
        datetime.

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.variable_handling import check_datetime_variables
    >>> X = pd.DataFrame({
    ...     "var_num": [1, 2, 3],
    ...     "var_cat": ["A", "B", "C"],
    ...     "var_date": pd.date_range("2020-02-24", periods=3, freq="min")
    ... })
    >>> var_date = check_datetime_variables(X, "var_date")
    >>> var_date
    ['var_date']
    """
    if isinstance(variables, (str, int)):
        variables = [variables]

    # Columns whose dtype is already one of DATETIME_TYPES pass as they are;
    # only the remaining columns need to be inspected.
    candidates = X[variables].select_dtypes(exclude=DATETIME_TYPES)

    for column in candidates:
        series = X[column]
        # Numeric columns are rejected outright. For anything else the
        # decision is delegated to _is_categorical_and_is_datetime, which is
        # defined elsewhere in the package.
        if is_numeric(series) or not _is_categorical_and_is_datetime(series):
            # One offending column is enough: the error does not list them.
            raise TypeError(
                "Some of the variables are not or cannot be parsed as datetime."
            )

    return variables
def check_all_variables(
    X: pd.DataFrame,
    variables: Variables,
) -> List[Union[str, int]]:
    """
    Check that the variables in the list are in the dataframe.

    More details in the :ref:`User Guide <check_all_vars>`.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The dataset.

    variables : int, str or list
        The name of the variable, or the list with the names of the
        variables, to check.

    Returns
    -------
    variables: List
        The names of the variables. A single name passed as int or str is
        returned wrapped in a list.

    Raises
    ------
    KeyError
        If the variable, or any of the variables in the list, is not a column
        of the dataframe.

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.variable_handling import check_all_variables
    >>> X = pd.DataFrame({
    ...     "var_num": [1, 2, 3],
    ...     "var_cat": ["A", "B", "C"],
    ...     "var_date": pd.date_range("2020-02-24", periods=3, freq="min")
    ... })
    >>> vars_all = check_all_variables(X, ['var_num', 'var_cat', 'var_date'])
    >>> vars_all
    ['var_num', 'var_cat', 'var_date']
    """
    # A single label: report it by name if it is missing.
    if isinstance(variables, (str, int)):
        if variables not in X.columns.to_list():
            raise KeyError(f"The variable {variables} is not in the dataframe.")
        return [variables]

    # A collection of labels: every one of them must be a column of X.
    if not set(variables).issubset(X.columns):
        raise KeyError("Some of the variables are not in the dataframe.")

    return variables