# Source code for sktime.transformations.series.fourier

# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
"""Fourier features for time series with long/complex seasonality."""

__author__ = ["ltsaprounis", "blazingbhavneek"]

import warnings
from typing import Optional

import numpy as np
import pandas as pd
from numpy.fft import rfft

from sktime.transformations.base import BaseTransformer


class FourierFeatures(BaseTransformer):
    r"""Fourier Features for time series seasonality.

    Fourier Series terms can be used as explanatory variables for the cases of multiple
    seasonal periods and/or complex / long seasonal periods [1]_, [2]_. For every
    seasonal period, :math:`sp` and fourier term :math:`k` pair there are 2 fourier
    terms sin_sp_k and cos_sp_k:
        - sin_sp_k = :math:`sin(\frac{2 \pi k t}{sp})`
        - cos_sp_k = :math:`cos(\frac{2 \pi k t}{sp})`

    Where :math:`t` is the elapsed time since the beginning of the seasonal period and
    :math:`sp` the total time of the seasonal period.

    The transformed output is a series that contains all requested Fourier terms.

    Warning: the output will contain only the Fourier terms under default settings,
    and discard the original columns of the input data, to avoid multiplication
    of the original data in a pipeline or ``FeatureUnion``.
    To keep the original columns, set ``keep_original_columns=True``.

    The generated columns follow the naming convention stated above
    (sin_sp_k and cos_sp_k).
    The numbers of Fourier terms :math:`K` in the fourier_terms_list
    determines the number of Fourier terms that will be used for each seasonal period,
    i.e., Fourier terms :math:`k = 1\dots K` (integers), sine and cos, will be generated
    for the seasonality :math:`sp` at the same list index.
    For example, consider sp_list = [12, "Y"] and fourier_terms_list = [2, 1].
    This says that we compute 2 (2 sine, 2 cos) Fourier terms for
    seasonality 12 periods, and 1 Fourier term (1 sine and 1 cos)
    for seasonality 1 year.
    The transformed series will then have columns with the following names:
    "sin_12_1", "cos_12_1", "sin_12_2", "cos_12_2", "sin_Y_1", "cos_Y_1"

    Terms whose coefficient :math:`k / sp` duplicates one that was already generated
    from another numeric (sp, k) pair are skipped in ``fit``, with a warning.

    The implementation is based on the fourier function from the R forecast package [3]_

    Parameters
    ----------
    sp_list : List[float and/or str]
        List of seasonal periods. Can be defined with the following options:

        * | float : Periodicity defined as number of timesteps since the beginning of
            the data seen in ``fit``.

        * | string : Periodicity defined as a column name in X that contains the
            :math:`t/sp` values. A string is first looked up among the columns of X.

        * | string : Periodicity defined as a pandas period alias, used when the
            string is not a column of X:
            https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#period-aliases

    fourier_terms_list : List[int]
        List of number of fourier terms (:math:`K`) per corresponding (:math:`sp`); each
        :math:`K` matches to one :math:`sp` of the sp_list. For example, if sp_list =
        [7, "Y"] and fourier_terms_list = [3, 9], the seasonality of 7 timesteps will
        have 3 sin_sp_k and 3 cos_sp_k fourier terms and the yearly seasonality "Y" will
        have 9 sin_sp_k and 9 cos_sp_k fourier terms.
    freq : str, optional, default = None
        Only used when X has a pd.DatetimeIndex without a specified frequency.
        Specifies the frequency of the index of your data. The string should
        match a pandas offset alias:

        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
    keep_original_columns : boolean, optional, default=False
        Keep original columns in X passed to ``.transform()``

    References
    ----------
    .. [1] Hyndsight - Forecasting with long seasonal periods:
        https://robjhyndman.com/hyndsight/longseasonality/
    .. [2] Hyndman, R.J., & Athanasopoulos, G. (2021) Forecasting: principles and
        practice, 3rd edition, OTexts: Melbourne, Australia. OTexts.com/fpp3.
        Accessed on August 14th 2022.
    .. [3] https://pkg.robjhyndman.com/forecast/reference/fourier.html

    Examples
    --------
    >>> from sktime.transformations.series.fourier import FourierFeatures
    >>> from sktime.datasets import load_airline
    >>> y = load_airline()
    >>> transformer = FourierFeatures(sp_list=[12, "Y"], fourier_terms_list=[4, 1])
    >>> y_hat = transformer.fit_transform(y)
    """

    _tags = {
        # packaging info
        # --------------
        "authors": ["ltsaprounis", "blazingbhavneek"],
        # estimator type
        # --------------
        "scitype:transform-input": "Series",
        # what is the scitype of X: Series, or Panel
        "scitype:transform-output": "Series",
        # what scitype is returned: Primitives, Series, Panel
        "scitype:transform-labels": "None",
        # what is the scitype of y: None (not needed), Primitives, Series, Panel
        "scitype:instancewise": True,  # is this an instance-wise transform?
        "capability:inverse_transform": False,  # can the transformer inverse transform?
        "univariate-only": False,  # can the transformer handle multivariate X?
        "X_inner_mtype": "pd.DataFrame",  # which mtypes do _fit/_predict support for X?
        # this can be a Panel mtype even if transform-input is Series, vectorized
        "y_inner_mtype": "None",  # which mtypes do _fit/_predict support for y?
        "requires_y": False,  # does y need to be passed in fit?
        "enforce_index_type": [
            pd.PeriodIndex,
            pd.DatetimeIndex,
        ],  # index type that needs to be enforced
        # in X/y
        "fit_is_empty": False,  # is fit empty and can be skipped? Yes = True
        "X-y-must-have-same-index": False,  # can estimator handle different X/y index?
        "transform-returns-same-time-index": True,
        # does transform return have the same time index as input X
        "skip-inverse-transform": True,  # is inverse-transform skipped when called?
        "capability:unequal_length": False,
        # can the transformer handle unequal length time series (if passed Panel)?
        "capability:unequal_length:removes": False,
        # is transform result always guaranteed to be equal length (and series)?
        #   not relevant for transformers that return Primitives in transform-output
        "handles-missing-data": False,  # can estimator handle missing data?
        # todo: rename to capability:missing_values
        "capability:missing_values:removes": False,
        # is transform result always guaranteed to contain no missing values?
    }

    def __init__(
        self,
        sp_list: list[float],
        fourier_terms_list: list[int],
        freq: Optional[str] = None,
        keep_original_columns: Optional[bool] = False,
    ):
        """Store the parameters and validate the sp / number-of-terms pairing.

        Raises
        ------
        ValueError
            If ``sp_list`` and ``fourier_terms_list`` differ in length, or if for a
            numeric seasonal period ``sp / K < 1`` for its number of terms ``K``.
        """
        self.sp_list = sp_list
        self.fourier_terms_list = fourier_terms_list
        self.freq = freq
        self.keep_original_columns = keep_original_columns

        if len(self.sp_list) != len(self.fourier_terms_list):
            raise ValueError(
                "In FourierFeatures the length of the sp_list needs to be equal "
                "to the length of fourier_terms_list."
            )

        # lengths are equal at this point, so zip pairs every sp with its K;
        # string periodicities are not checked against their number of terms
        for sp, n_terms in zip(sp_list, fourier_terms_list):
            if not isinstance(sp, str) and sp / n_terms < 1:
                raise ValueError(
                    "In FourierFeatures the number of each element of "
                    "fourier_terms_list needs to be lower from the corresponding "
                    "element of the sp_list"
                )

        super().__init__()

    def _fit(self, X, y=None):
        """Fit transformer to X and y.

        private _fit containing the core logic, called from fit

        Builds ``sp_k_pairs_list_`` (the (sp, k) pairs to generate), resolves the
        frequency ``freq_`` for a ``pd.DatetimeIndex`` and stores ``min_t_``, the
        integer form of the earliest period in X, as the origin of time t.

        Parameters
        ----------
        X : Series or Panel of mtype X_inner_mtype
            if X_inner_mtype is list, _fit must support all types in it
            Data to fit transform to
        y : Series or Panel of mtype y_inner_mtype, default=None
            Additional data, e.g., labels for transformation

        Returns
        -------
        self: reference to self

        Raises
        ------
        ValueError
            If X has a ``pd.DatetimeIndex`` whose frequency is neither set on the
            index, nor given via ``freq``, nor inferable.
        """
        # Create the sp, k pairs
        # Don't add pairs where the coefficient k/sp already exists
        self.sp_k_pairs_list_ = []
        seen_coefficients = set()
        for sp, n_terms in zip(self.sp_list, self.fourier_terms_list):
            for k in range(1, n_terms + 1):
                if isinstance(sp, str):
                    # periodicity sp from offset string or X column:
                    # no coefficient to de-duplicate on
                    self.sp_k_pairs_list_.append((sp, k))
                    continue

                # periodicity sp relative to start
                coef = k / sp
                if coef in seen_coefficients:
                    warnings.warn(
                        f"The terms sin_{sp}_{k} and cos_{sp}_{k} from "
                        "FourierFeatures will be skipped because the resulting "
                        "coefficient already exists from other seasonal period, "
                        "fourier term pairs.",
                        stacklevel=2,
                    )
                    continue
                seen_coefficients.add(coef)
                self.sp_k_pairs_list_.append((sp, k))

        time_index = X.index

        if isinstance(time_index, pd.DatetimeIndex):
            # Chooses first non None value
            self.freq_ = time_index.freq or self.freq or pd.infer_freq(time_index)
            if self.freq_ is None:
                raise ValueError("X has no known frequency and none is supplied")
            # only warn about a mismatch when the user actually supplied a freq;
            # with the default freq=None there is nothing to disagree with
            if (
                self.freq is not None
                and self.freq_ == time_index.freq
                and self.freq_ != self.freq
            ):
                warnings.warn(
                    f"Using frequency from index: {time_index.freq}, which "
                    f"does not match the frequency given:{self.freq}.",
                    stacklevel=2,
                )
            time_index = time_index.to_period(self.freq_)
        # this is used to make sure that time t is calculated with reference to
        # the data passed on fit
        # store the integer form of the minimum date in the period index
        self.min_t_ = np.min(time_index.astype("int64"))

        return self

    def _transform(self, X, y=None):
        """Transform X and return a transformed version.

        private _transform containing core logic, called from transform

        For every (sp, k) pair stored in ``fit``, adds the columns ``sin_{sp}_{k}``
        and ``cos_{sp}_{k}`` (in that order) to the output.

        Parameters
        ----------
        X : Series or Panel of mtype X_inner_mtype
            if X_inner_mtype is list, _transform must support all types in it
            Data to be transformed
        y : Series or Panel of mtype y_inner_mtype, default=None
            Additional data, e.g., labels for transformation

        Returns
        -------
        transformed version of X
        """
        X_transformed = pd.DataFrame(index=X.index)
        X_df = pd.DataFrame(X)

        if isinstance(X.index, pd.DatetimeIndex):
            time_index = X.index.to_period(self.freq_)
        else:
            time_index = X.index

        # get the integer form of the PeriodIndex, relative to the start seen in fit
        int_index = time_index.astype("int64") - self.min_t_

        # fraction-of-period values per offset string; they do not depend on k,
        # so they are computed once per sp and reused for all of its terms
        offset_fracs = {}

        for sp, k in self.sp_k_pairs_list_:
            if not isinstance(sp, str):  # periodicity sp relative to start
                angle = int_index * 2 * k * np.pi / sp

            elif sp in X_df.columns:  # periodicity sp from X column
                angle = X_df[sp].values * 2 * k * np.pi

            else:  # periodicity sp from offset string
                if sp not in offset_fracs:
                    if isinstance(X.index, pd.PeriodIndex):
                        datetime_index = X.index.to_timestamp()
                    else:
                        datetime_index = X.index
                    offset_fracs[sp] = self._offset_frac_since_prev_offset(
                        datetime_index=datetime_index,
                        period_str=sp,
                    )
                angle = offset_fracs[sp] * 2 * k * np.pi

            X_transformed[f"sin_{sp}_{k}"] = np.sin(angle)
            X_transformed[f"cos_{sp}_{k}"] = np.cos(angle)

        if self.keep_original_columns:
            # NOTE(review): the explicit copy=True was dropped; it is understood to be
            # the pandas default where the keyword is honoured and to be deprecated
            # in newer pandas - confirm against the supported pandas range
            X_transformed = pd.concat([X, X_transformed], axis=1)

        return X_transformed

    def _offset_frac_since_prev_offset(self, datetime_index, period_str):
        """Get time passed as fraction of the current period.

        Parameters
        ----------
        datetime_index : pandas DatetimeIndex
        period_str : pandas period str
            Cannot contain digits

        Returns
        -------
        numpy array containing the time passed between [previous offset, next offset)
        as fraction in the interval [0, 1) for every datetime in datetimes
        """
        offset = pd.tseries.frequencies.to_offset(period_str)
        # pad by one offset on both sides so that every datetime has a boundary
        # before it and a boundary after it
        offset_boundaries = pd.date_range(
            start=np.amin(datetime_index) - offset,
            end=np.amax(datetime_index) + offset,
            freq=period_str,
            tz=datetime_index.tz,
        )

        # date_range created with offsets <= 1day have boundaries on the first
        # moment of the new period, but date_range created with offsets > 1day
        # have boundaries on the last day of the period rather than the desired
        # first day of new period. workaround: shift by 1 day
        # offsets that cannot be converted to a timedelta are coerced to NaT,
        # for which the comparison below is False, so they are shifted as well
        offset_td = pd.to_timedelta(offset, errors="coerce")
        if not offset_td <= pd.Timedelta(days=1):
            offset_boundaries = offset_boundaries + pd.Timedelta(days=1)

        # position of the first boundary strictly after each datetime,
        # looked up for the whole index in one call
        next_pos = offset_boundaries.searchsorted(datetime_index, side="right")
        prev_boundary = offset_boundaries[next_pos - 1]
        next_boundary = offset_boundaries[next_pos]

        period_timedelta = next_boundary - prev_boundary
        since_prev_timedelta = datetime_index - prev_boundary

        return np.asarray(since_prev_timedelta / period_timedelta)

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return ``"default"`` set.
            There are currently no reserved values for transformers.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict are parameters to construct an "interesting" test instance, i.e.,
            ``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test
            instance.
            ``create_test_instance`` uses the first (or only) dictionary in ``params``
        """
        params = [
            {"sp_list": [12], "fourier_terms_list": [4]},
            {"sp_list": [12, 6.2], "fourier_terms_list": [3, 4]},
            {"sp_list": ["Y"], "fourier_terms_list": [4]},
            {"sp_list": ["Y", "Q"], "fourier_terms_list": [3, 4]},
        ]
        return params


class FourierTransform(BaseTransformer):
    r"""Simple Fourier transform for time series.

    Computes the real fast Fourier transform of the series with ``numpy.fft.rfft``
    and returns the amplitudes (absolute values) as a ``pd.Series``. The first
    coefficient of the transform is dropped from the output, and the result
    carries a default integer index rather than the index of the input.
    Even-Sampling of data is assumed and frequency range converted to integer.

    Examples
    --------
    >>> from sktime.transformations.series.fourier import FourierTransform
    >>> from sktime.datasets import load_airline
    >>> X = load_airline()
    >>> transformer = FourierTransform()
    >>> X_ft = transformer.fit_transform(X)
    """

    _tags = {
        "scitype:transform-input": "Series",
        "scitype:transform-output": "Series",
        "scitype:instancewise": True,
        "scitype:transform-labels": "None",
        "X_inner_mtype": "pd.Series",
        "y_inner_mtype": "None",
        "univariate-only": True,
        "requires_y": False,
        "fit_is_empty": True,
        "capability:inverse_transform": False,
        "capability:unequal_length": True,
        "handles-missing-data": False,
    }

    def __init__(self):
        super().__init__()

    def _transform(self, X, y=None):
        """Return the Fourier amplitudes of X.

        private _transform containing core logic, called from transform

        Parameters
        ----------
        X : Series mtype X_inner_mtype
            Series to take the real FFT of.
        y : ignored, default=None

        Returns
        -------
        pd.Series of the absolute values of the rfft coefficients of X,
        without the first coefficient
        """
        amplitudes = np.abs(rfft(X))

        # skip the first coefficient and wrap the remainder as a pandas Series
        return pd.Series(amplitudes[1:])