# Source code for feature_engine.variable_handling.find_variables

"""Functions to select certain types of variables."""

from typing import List, Tuple, Union

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.core.dtypes.common import is_numeric_dtype as is_numeric
from pandas.core.dtypes.common import is_object_dtype as is_object

from feature_engine.variable_handling._variable_type_checks import (
    _is_categorical_and_is_datetime,
    _is_categorical_and_is_not_datetime,
)
from feature_engine.variable_handling.dtypes import DATETIME_TYPES


def find_numerical_variables(X: pd.DataFrame) -> List[Union[str, int]]:
    """
    Return the names of every numerical variable in a dataframe.

    More details in the :ref:`User Guide <find_num_vars>`.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The dataset.

    Returns
    -------
    variables: List
        The names of the numerical variables.

    Raises
    ------
    TypeError
        If the dataframe contains no column of numeric dtype.

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.variable_handling import find_numerical_variables
    >>> X = pd.DataFrame({
    ...     "var_num": [1, 2, 3],
    ...     "var_cat": ["A", "B", "C"],
    ...     "var_date": pd.date_range("2020-02-24", periods=3, freq="min")
    ... })
    >>> var_ = find_numerical_variables(X)
    >>> var_
    ['var_num']
    """
    # Let pandas decide what counts as a number, then keep only the labels.
    numeric_frame = X.select_dtypes(include="number")
    variables = list(numeric_frame.columns)

    if not variables:
        raise TypeError(
            "No numerical variables found in this dataframe. Please check "
            "variable format with pandas dtypes."
        )

    return variables
def find_categorical_variables(X: pd.DataFrame) -> List[Union[str, int]]:
    """
    Return the names of every categorical variable in a dataframe.

    Columns of object or categorical dtype are considered. Note that variables
    cast as object that can be parsed to datetime will be excluded.

    More details in the :ref:`User Guide <find_cat_vars>`.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The dataset

    Returns
    -------
    variables: List
        The names of the categorical variables.

    Raises
    ------
    TypeError
        If no categorical variable is found in the dataframe.

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.variable_handling import find_categorical_variables
    >>> X = pd.DataFrame({
    ...     "var_num": [1, 2, 3],
    ...     "var_cat": ["A", "B", "C"],
    ...     "var_date": pd.date_range("2020-02-24", periods=3, freq="min")
    ... })
    >>> var_ = find_categorical_variables(X)
    >>> var_
    ['var_cat']
    """
    # Candidates are the object / category columns; each one is then vetted
    # so that datetime-like columns do not slip through.
    candidate_columns = X.select_dtypes(include=["O", "category"]).columns

    variables = []
    for column in candidate_columns:
        if _is_categorical_and_is_not_datetime(X[column]):
            variables.append(column)

    if not variables:
        raise TypeError(
            "No categorical variables found in this dataframe. Please check "
            "variable format with pandas dtypes."
        )

    return variables
def find_datetime_variables(X: pd.DataFrame) -> List[Union[str, int]]:
    """
    Return the names of the variables that are, or can be parsed as, datetime.

    Note that this function will select variables cast as object if they can
    be cast as datetime as well.

    More details in the :ref:`User Guide <find_datetime_vars>`.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The dataset.

    Returns
    -------
    variables: List
        The names of the datetime variables.

    Raises
    ------
    ValueError
        If no datetime variable is found in the dataframe.

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.variable_handling import find_datetime_variables
    >>> X = pd.DataFrame({
    ...     "var_num": [1, 2, 3],
    ...     "var_cat": ["A", "B", "C"],
    ...     "var_date": pd.date_range("2020-02-24", periods=3, freq="min")
    ... })
    >>> var_date = find_datetime_variables(X)
    >>> var_date
    ['var_date']
    """
    # Numeric columns are never treated as datetime, so drop them up front.
    non_numeric_columns = X.select_dtypes(exclude="number").columns

    variables = []
    for column in non_numeric_columns:
        series = X[column]
        # Native datetime dtypes are accepted directly; anything else has to
        # pass the categorical-that-parses-as-datetime check.
        if is_datetime(series) or _is_categorical_and_is_datetime(series):
            variables.append(column)

    if not variables:
        raise ValueError("No datetime variables found in this dataframe.")

    return variables
def find_all_variables(
    X: pd.DataFrame,
    exclude_datetime: bool = False,
) -> List[Union[str, int]]:
    """
    Return the names of all the variables in the dataframe.

    It has the option to exclude variables that can be parsed as datetime or
    datetimetz.

    More details in the :ref:`User Guide <find_all_vars>`.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The dataset

    exclude_datetime: bool, default=False
        Whether to exclude datetime variables.

    Returns
    -------
    variables: List
        The names of the variables.

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.variable_handling import find_all_variables
    >>> X = pd.DataFrame({
    ...     "var_num": [1, 2, 3],
    ...     "var_cat": ["A", "B", "C"],
    ...     "var_date": pd.date_range("2020-02-24", periods=3, freq="min")
    ... })
    >>> vars_all = find_all_variables(X)
    >>> vars_all
    ['var_num', 'var_cat', 'var_date']
    """
    # Only the literal ``True`` switches the datetime filter on.
    if exclude_datetime is not True:
        return X.columns.to_list()

    # First remove columns with a datetime dtype ...
    remaining = X.select_dtypes(exclude=DATETIME_TYPES).columns.to_list()

    # ... then remove non-numeric columns that parse as datetime.
    variables = []
    for var in remaining:
        if is_numeric(X[var]) or not _is_categorical_and_is_datetime(X[var]):
            variables.append(var)

    return variables
def find_categorical_and_numerical_variables(
    X: pd.DataFrame,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
) -> Tuple[List[Union[str, int]], List[Union[str, int]]]:
    """
    Find numerical and categorical variables in a dataframe or from a list.

    The function returns two lists; the first one with the names of the
    variables of type object or categorical and the second list with the names
    of the numerical variables.

    More details in the :ref:`User Guide <find_cat_and_num_vars>`.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The dataset

    variables : list, default=None
        If `None`, the function will find all categorical and numerical
        variables in X. Alternatively, it will find categorical and numerical
        variables in X, selecting from the given list. A single variable name
        (str or int) is also accepted.

    Returns
    -------
    variables: tuple
        Tuple containing a list with the categorical variables, and a list
        with the numerical variables.

    Raises
    ------
    TypeError
        If a single variable is passed and it is neither numerical nor
        categorical; if `variables` is None and the dataframe holds no
        numerical or categorical variable; or if any variable in the given
        list is neither numerical nor categorical.
    ValueError
        If an empty list of variables is passed.

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.variable_handling import (
    ...     find_categorical_and_numerical_variables
    ... )
    >>> X = pd.DataFrame({
    ...     "var_num": [1, 2, 3],
    ...     "var_cat": ["A", "B", "C"],
    ...     "var_date": pd.date_range("2020-02-24", periods=3, freq="min")
    ... })
    >>> var_cat, var_num = find_categorical_and_numerical_variables(X)
    >>> var_cat, var_num
    (['var_cat'], ['var_num'])
    """
    # If the user passes just 1 variable outside a list.
    if isinstance(variables, (str, int)):
        if X[variables].dtype.name == "category" or is_object(X[variables]):
            return [variables], []
        if is_numeric(X[variables]):
            return [], [variables]
        raise TypeError("The variable entered is neither numerical nor categorical.")

    # If user leaves default None parameter: scan the whole dataframe.
    if variables is None:
        # Object / category columns that parse as datetime are not reported
        # as categorical.
        variables_cat = [
            column
            for column in X.select_dtypes(include=["O", "category"]).columns
            if _is_categorical_and_is_not_datetime(X[column])
        ]
        variables_num = list(X.select_dtypes(include="number").columns)

        if not variables_num and not variables_cat:
            raise TypeError(
                "There are no numerical or categorical variables in the dataframe"
            )
        return variables_cat, variables_num

    # If user passes variable list.
    # ``len`` rather than truthiness: array-like inputs have no unambiguous
    # boolean value.
    if len(variables) == 0:
        raise ValueError("The list of variables is empty.")

    selected = X[variables]
    # In this branch object / category columns are taken as categorical
    # as-is; no datetime check is applied to them.
    variables_cat = list(selected.select_dtypes(include=["O", "category"]).columns)
    variables_num = list(selected.select_dtypes(include="number").columns)

    # Test membership itself, not the truthiness of the offending names: a
    # column labelled ``0`` or ``""`` is falsy and would otherwise go
    # unnoticed.
    recognised = set(variables_cat) | set(variables_num)
    if any(var not in recognised for var in variables):
        raise TypeError("Some of the variables are neither numerical nor categorical.")

    return variables_cat, variables_num