# Source code for feature_engine.datetime.datetime_subtraction

from typing import List, Optional, Union

import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from sklearn.utils.validation import check_is_fitted

from feature_engine._check_init_parameters.check_variables import (
    _check_variables_input_value,
)
from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
    _missing_values_docstring,
)
from feature_engine._docstrings.methods import (
    _fit_not_learn_docstring,
    _fit_transform_docstring,
    _transform_creation_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.creation.base_creation import BaseCreation
from feature_engine.dataframe_checks import (
    _check_contains_na,
    _check_X_matches_training_df,
    check_X,
)
from feature_engine.variable_handling.check_variables import check_datetime_variables
from feature_engine.variable_handling.find_variables import find_datetime_variables

# Usage example for DatetimeSubtraction. It is handed to the Substitution
# decorator through the `example` keyword, which places it in the class
# docstring. `.rstrip()` removes the whitespace that precedes the closing
# triple quote.
# NOTE(review): the continuation lines of the DataFrame literal use `>>>`
# instead of `...`, and `dtf.fit(X)` shows no output, so the snippet would
# presumably not pass as a doctest as written - confirm whether doctests are
# collected for this module.
_example = """
    >>> import pandas as pd
    >>> from feature_engine.datetime import DatetimeSubtraction
    >>> X = pd.DataFrame({
    >>>     "date1": ["2022-09-18", "2022-10-27", "2022-12-24"],
    >>>     "date2": ["2022-08-18", "2022-08-27", "2022-06-24"]})
    >>> dtf = DatetimeSubtraction(variables=["date1"], reference=["date2"])
    >>> dtf.fit(X)
    >>> dtf.transform(X)
            date1       date2  date1_sub_date2
    0  2022-09-18  2022-08-18             31.0
    1  2022-10-27  2022-08-27             61.0
    2  2022-12-24  2022-06-24            183.0
        """.rstrip()


@Substitution(
    missing_values=_missing_values_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit=_fit_not_learn_docstring,
    transform=_transform_creation_docstring,
    fit_transform=_fit_transform_docstring,
    example=_example,
)
class DatetimeSubtraction(BaseCreation):
    """
    DatetimeSubtraction() applies datetime subtraction between a group of datetime
    variables and one or more datetime features, adding the resulting variables to the
    dataframe.

    DatetimeSubtraction() works with variables cast as datetime or object. It subtracts
    the variables listed in the parameter `reference` from those listed in the
    parameter `variables`.

    More details in the :ref:`User Guide <datetime_subtraction>`.

    Parameters
    ----------
    variables: list
        The list of datetime variables that the reference variables will be subtracted
        from (left side of the subtraction operation).

    reference: list
        The list of datetime reference variables that will be subtracted from
        `variables` (right side of the subtraction operation).

    new_variables_names: list, default=None
        Names of the new variables. You have the option to pass a list with the names
        you'd like to assign to the new variables. If `None`, the transformer will
        assign arbitrary names.

    output_unit: string, default='D'
        The string representation of the output unit of the datetime differences.
        The default is `D` for day. This parameter is passed to `numpy.timedelta64`.
        Other possible values are `Y` for year, `M` for month, `W` for week,
        `h` for hour, `m` for minute, `s` for second, `ms` for millisecond,
        `us` or `μs` for microsecond, `ns` for nanosecond, `ps` for picosecond,
        `fs` for femtosecond and `as` for attosecond.

    {missing_values}

    drop_original: bool, default=False
        If `True`, the variables listed in `variables` and `reference` will be dropped
        from the dataframe after the computation of the new features.

    dayfirst: bool, default=False
        Specify a date parse order if arg is str or is list-like. If True, parses
        dates with the day first, e.g. 10/11/12 is parsed as 2012-11-10. Same as in
        `pandas.to_datetime`.

    yearfirst: bool, default=False
        Specify a date parse order if arg is str or is list-like.
        Same as in `pandas.to_datetime`.

        - If True parses dates with the year first, e.g. 10/11/12 is parsed as
          2010-11-12.
        - If both dayfirst and yearfirst are True, yearfirst takes precedence.

    utc: bool, default=None
        Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime
        objects as well). Same as in `pandas.to_datetime`.

    format: str, default=None
        The strftime to parse time, e.g. "%d/%m/%Y". Check pandas `to_datetime()` for
        more information on choices. If you have variables with different formats
        pass "mixed", to infer the format for each element individually. This is
        risky, and you should probably use it along with dayfirst, according to
        pandas' documentation.

    Attributes
    ----------
    variables_:
        The list with datetime variables from which the variables in `reference` will
        be subtracted. It is created after the transformer corroborates that the
        variables in `variables` are, or can be parsed to datetime.

    reference_:
        The list with the datetime variables that will be subtracted from
        `variables_`. It is created after the transformer corroborates that the
        variables in `reference` are, or can be parsed to datetime.

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    {fit}

    {fit_transform}

    {transform}

    Examples
    --------
    {example}
    """

    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        reference: Union[None, int, str, List[Union[str, int]]] = None,
        new_variables_names: Union[None, List[str], str] = None,
        output_unit: str = "D",
        missing_values: str = "ignore",
        drop_original: bool = False,
        dayfirst: bool = False,
        yearfirst: bool = False,
        utc: Union[None, bool] = None,
        format: Union[None, str] = None,
    ) -> None:

        valid_output_units = {
            "D",
            "Y",
            "M",
            "W",
            "h",
            "m",
            "s",
            "ms",
            "us",
            "μs",
            "ns",
            "ps",
            "fs",
            "as",
        }

        # The isinstance test runs first so that unhashable inputs never reach
        # the set membership test.
        if not isinstance(output_unit, str) or output_unit not in valid_output_units:
            raise ValueError(
                f"output_unit accepts the following values: "
                f"{valid_output_units}. Got {output_unit} instead."
            )

        if new_variables_names is not None:
            if (
                not isinstance(new_variables_names, list)
                or not all(isinstance(var, str) for var in new_variables_names)
                or len(set(new_variables_names)) != len(new_variables_names)
            ):
                raise ValueError(
                    "new_variables_names should be None or a list of unique strings. "
                    f"Got {new_variables_names} instead."
                )

        super().__init__(missing_values, drop_original)
        self.variables = _check_variables_input_value(variables)
        self.reference = _check_variables_input_value(reference)
        self.new_variables_names = new_variables_names
        self.output_unit = output_unit
        self.dayfirst = dayfirst
        self.yearfirst = yearfirst
        self.utc = utc
        self.format = format

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, or np.array. Default=None.
            It is not needed in this transformer. You can pass y or None.
        """
        # Common checks and attributes
        X = check_X(X)

        # check variables are datetime
        if self.variables is None:
            self.variables_ = find_datetime_variables(X)
        else:
            self.variables_ = check_datetime_variables(X, self.variables)

        if self.reference is None:
            self.reference_ = find_datetime_variables(X)
        else:
            self.reference_ = check_datetime_variables(X, self.reference)

        if self.new_variables_names is not None:
            # One new feature is created per (variable, reference) pair.
            n_new_features = len(self.variables_) * len(self.reference_)
            if len(self.new_variables_names) != n_new_features:
                raise ValueError(
                    f"{n_new_features} new variables will "
                    f"be created but only {len(self.new_variables_names)} new variable "
                    f"names were provided. Please check the variables list and try "
                    f"again."
                )

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self._unique_datetime_variables())

        # save input features
        self.feature_names_in_ = X.columns.tolist()

        # save train set shape
        self.n_features_in_ = X.shape[1]

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Add new features.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X_new: Pandas dataframe
            The input dataframe plus the new variables.
        """
        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = check_X(X)

        # Check if input data contains same number of columns as dataframe used to fit.
        _check_X_matches_training_df(X, self.n_features_in_)

        datetime_variables = self._unique_datetime_variables()

        if self.missing_values == "raise":
            _check_contains_na(X, datetime_variables)

        # reorder variables to match train set
        X = X[self.feature_names_in_]

        X_dt = self._to_datetime(X)
        new_features = self._sub(X_dt)

        X = pd.concat([X, new_features], axis=1)

        if self.drop_original:
            X = X.drop(columns=datetime_variables)

        return X

    def _unique_datetime_variables(self) -> List:
        """Return the variables in `variables_` and `reference_` without duplicates.

        The order of first appearance is kept, so the result is deterministic.
        """
        return list(dict.fromkeys(self.variables_ + self.reference_))

    def _to_datetime(self, X: pd.DataFrame) -> pd.DataFrame:
        """Convert the variables in `variables_` and `reference_` to datetime.

        Returns a dataframe that contains only those columns, parsed with
        `pandas.to_datetime` using the options set at initialisation.

        Raises
        ------
        ValueError
            If any of the columns is not of datetime dtype after the conversion.
        """
        # convert datetime variables
        datetime_df = pd.concat(
            [
                pd.to_datetime(
                    X[variable],
                    dayfirst=self.dayfirst,
                    yearfirst=self.yearfirst,
                    utc=self.utc,
                    format=self.format,
                )
                for variable in self._unique_datetime_variables()
            ],
            axis=1,
        )

        non_dt_columns = datetime_df.columns[~datetime_df.apply(is_datetime)].tolist()
        if non_dt_columns:
            raise ValueError(
                "ValueError: variable(s) "
                + "".join(f"{column} " for column in non_dt_columns)
                + "could not be converted to datetime. Try setting utc=True"
            )

        return datetime_df

    def _sub(self, dt_df: pd.DataFrame) -> pd.DataFrame:
        """Subtract every reference variable from every variable in `variables_`.

        The differences are expressed as floats in `output_unit`. Columns are
        created reference by reference, in the same order as the names returned
        by `_get_new_features_name`.
        """
        # Units finer than a nanosecond cannot be expressed as a nanosecond
        # timedelta: casting 1 ps, 1 fs or 1 as to nanoseconds truncates to zero,
        # and dividing by it would not yield usable values. For those units the
        # difference is first expressed in nanoseconds and then scaled.
        sub_nanosecond_units_per_ns = {"ps": 1e3, "fs": 1e6, "as": 1e9}

        scale = sub_nanosecond_units_per_ns.get(self.output_unit)
        if scale is None:
            divisor = np.timedelta64(1, self.output_unit).astype("timedelta64[ns]")
        else:
            divisor = np.timedelta64(1, "ns")

        new_df = pd.DataFrame()
        for reference in self.reference_:
            new_varnames = [f"{var}_sub_{reference}" for var in self.variables_]
            differences = (
                dt_df[self.variables_].sub(dt_df[reference], axis=0).div(divisor)
            )
            if scale is not None:
                differences = differences * scale
            new_df[new_varnames] = differences

        if self.new_variables_names is not None:
            new_df.columns = self.new_variables_names

        return new_df

    def _get_new_features_name(self) -> List:
        """Return names of the created features."""
        if self.new_variables_names is not None:
            return self.new_variables_names

        # Same order as the columns built in `_sub`: grouped by reference.
        return [
            f"{var}_sub_{reference}"
            for reference in self.reference_
            for var in self.variables_
        ]