# Source code for feature_engine.timeseries.forecasting.expanding_window_features

# Author: Kishan Manani
# License: BSD 3 clause

from __future__ import annotations

from typing import List

import pandas as pd

from feature_engine._docstrings.fit_attributes import (
    _feature_names_in_docstring,
    _n_features_in_docstring,
)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
    _drop_original_docstring,
    _missing_values_docstring,
    _variables_numerical_docstring,
)
from feature_engine._docstrings.methods import (
    _fit_not_learn_docstring,
    _fit_transform_docstring,
)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.timeseries.forecasting.base_forecast_transformers import (
    BaseForecastTransformer,
)


@Substitution(
    variables=_variables_numerical_docstring,
    missing_values=_missing_values_docstring,
    drop_original=_drop_original_docstring,
    feature_names_in_=_feature_names_in_docstring,
    n_features_in_=_n_features_in_docstring,
    fit=_fit_not_learn_docstring,
    fit_transform=_fit_transform_docstring,
)
class ExpandingWindowFeatures(BaseForecastTransformer):
    """
    ExpandingWindowFeatures adds new features to a dataframe based on expanding window
    operations. Expanding window operations are operations that perform an
    aggregation over an expanding window of all past values relative to the
    value of interest. An expanding window feature is, in other words, a feature
    created after computing statistics (e.g., mean, min, max, etc.) using a
    window over all the past data. For example, the mean value of all months
    prior to the month of interest is an expanding window feature.

    ExpandingWindowFeatures uses the pandas' functions `expanding()`, `agg()` and
    `shift()`. With `expanding()`, it creates expanding windows. With `agg()` it
    applies multiple functions within those windows. With `shift()` it allocates
    the values to the correct rows.

    For supported aggregation functions, see Expanding Window
    `Functions
    <https://pandas.pydata.org/docs/reference/window.html#expanding-window-functions>`_.

    To be compatible with ExpandingWindowFeatures, the dataframe's index must
    have unique values and no NaN.

    ExpandingWindowFeatures works only with numerical variables. You can pass a
    list of variables to use as input for the expanding window. Alternatively,
    ExpandingWindowFeatures will automatically select all numerical variables
    in the training set.

    More details in the :ref:`User Guide <expanding_window_features>`.

    Parameters
    ----------
    {variables}

    min_periods: int, default None.
        Minimum number of observations in window required to have a value;
        otherwise, result is np.nan. See parameter `min_periods` in the pandas
        `expanding()` documentation for more details.

    functions: str, list of str, default = 'mean'
        The functions to apply within the window. Valid functions can be found
        `here <https://pandas.pydata.org/docs/reference/window.html>`_.
        When a list is passed, it must not contain duplicates.

    periods: int, default=1
        Number of periods to shift. Must be a non-negative integer. See param
        `periods` in pandas `shift`.

    freq: str, default=None
        Offset to use from the tseries module or time rule. See parameter `freq` in
        pandas `shift()`.

    sort_index: bool, default=True
        Whether to order the index of the dataframe before creating the
        expanding window feature.

    {missing_values}

    {drop_original}

    drop_na: bool, default=False.
        Whether the NAN introduced in the created features should be removed.


    Attributes
    ----------
    variables_:
        The group of variables that will be used to create the expanding window
        features.

    {feature_names_in_}

    {n_features_in_}

    Methods
    -------
    {fit}

    transform:
        Add expanding window features.

    transform_x_y:
        Remove rows with missing data from X and y.

    {fit_transform}

    See Also
    --------
    pandas.expanding
    pandas.aggregate
    pandas.shift

    Examples
    --------

    >>> import pandas as pd
    >>> from feature_engine.timeseries.forecasting import ExpandingWindowFeatures
    >>> X = pd.DataFrame(dict(date = ["2022-09-18",
    ...                               "2022-09-19",
    ...                               "2022-09-20",
    ...                               "2022-09-21",
    ...                               "2022-09-22"],
    ...                       x1 = [1,2,3,4,5],
    ...                       x2 = [6,7,8,9,10]
    ...                     ))
    >>> ewf = ExpandingWindowFeatures()
    >>> ewf.fit_transform(X)
             date  x1  x2  x1_expanding_mean  x2_expanding_mean
    0  2022-09-18   1   6                NaN                NaN
    1  2022-09-19   2   7                1.0                6.0
    2  2022-09-20   3   8                1.5                6.5
    3  2022-09-21   4   9                2.0                7.0
    4  2022-09-22   5  10                2.5                7.5
    """

    def __init__(
        self,
        variables: None | int | str | list[str | int] = None,
        min_periods: int | None = None,
        functions: str | list[str] = "mean",
        periods: int = 1,
        freq: str | None = None,
        sort_index: bool = True,
        missing_values: str = "raise",
        drop_original: bool = False,
        drop_na: bool = False,
    ) -> None:

        # `functions` must be a string or a list whose items are all strings.
        # Iterating a plain string yields one-character strings, so the `all`
        # check is trivially satisfied in that case.
        if not isinstance(functions, (str, list)) or not all(
            isinstance(val, str) for val in functions
        ):
            raise ValueError(
                "functions must be a list of strings or a string. "
                f"Got {functions} instead."
            )
        # Duplicated functions would produce duplicated output column names.
        if isinstance(functions, list) and len(functions) != len(set(functions)):
            raise ValueError(f"There are duplicated functions in the list: {functions}")

        if not isinstance(periods, int) or periods < 0:
            raise ValueError(
                f"periods must be a non-negative integer. Got {periods} instead."
            )

        super().__init__(variables, missing_values, drop_original, drop_na)

        self.min_periods = min_periods
        self.functions = functions
        self.periods = periods
        self.freq = freq
        self.sort_index = sort_index

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Adds expanding window features.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X_new: Pandas dataframe, shape = [n_samples, n_features + window_features]
            The dataframe with the original plus the new variables.
        """
        # Common dataframe checks and setting up.
        X = self._check_transform_input_and_state(X)

        # Aggregate over an expanding window, then shift the result so that each
        # row receives the statistics computed `periods` (or `freq`) earlier.
        tmp = (
            X[self.variables_]
            .expanding(min_periods=self.min_periods)
            .agg(self.functions)
            .shift(periods=self.periods, freq=self.freq)
        )

        # Replace the column labels returned by pandas with flat names of the
        # form "<variable>_expanding_<function>".
        tmp.columns = self._get_new_features_name()

        # Left join on the index: every row of X is kept, and rows with no
        # matching window value receive NaN in the new columns.
        X = X.merge(tmp, left_index=True, right_index=True, how="left")

        if self.drop_original:
            X = X.drop(self.variables_, axis=1)

        if self.drop_na:
            # Only the newly created columns decide which rows are dropped.
            X = X.dropna(subset=tmp.columns, axis=0)

        return X

    def _get_new_features_name(self) -> List[str]:
        """Get names of the window features.

        Names are built as "<variable>_expanding_<function>", iterating over
        the variables first and, for each variable, over the functions.
        """
        # Normalise a single function name to a one-element list.
        functions_ = (
            self.functions if isinstance(self.functions, list) else [self.functions]
        )

        return [
            f"{feature}_expanding_{agg}"
            for feature in self.variables_
            for agg in functions_
        ]
# This site uses cookies

# Source code for feature_engine.timeseries.forecasting.expanding_window_features