特征工程

%load_ext autoreload
%autoreload 2

对外生回归变量计算变换

from typing import Optional

from utilsforecast.compat import DataFrame
from utilsforecast.processing import (
    drop_index_if_pandas,
    horizontal_concat,
    process_df,
    take_rows,
)
from utilsforecast.validation import validate_format

from mlforecast.core import _parse_transforms, Lags, LagTransforms
from mlforecast.grouped_array import GroupedArray
def transform_exog(
    df: DataFrame,
    lags: Optional[Lags] = None,
    lag_transforms: Optional[LagTransforms] = None,
    id_col: str = 'unique_id',
    time_col: str = 'ds',
    num_threads: int = 1,
) -> DataFrame:
    """Compute lag features for dynamic exogenous regressors.

    Every column of `df` other than `id_col` and `time_col` is treated as an
    exogenous regressor and receives the same lags and lag transformations.
    The new columns are named `<column>_<transform name>`.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Dataframe with ids, times and values for the exogenous regressors.
    lags : list of int, optional (default=None)
        Lags of each exogenous column to use as features.
    lag_transforms : dict of int to list of functions, optional (default=None)
        Mapping of lags to the transformations applied on them.
    id_col : str (default='unique_id')
        Column that identifies each serie.
    time_col : str (default='ds')
        Column that identifies each timestep, its values can be timestamps or integers.
    num_threads : int (default=1)
        Number of threads to use when computing the features.

    Returns
    -------
    pandas or polars DataFrame
        Original DataFrame with the computed features appended as new columns.
        When `process_df` returns sorting indices the rows are reordered
        accordingly.

    Raises
    ------
    ValueError
        If neither `lags` nor `lag_transforms` is provided, or if `df` has no
        columns besides `id_col` and `time_col`.
    """
    if lags is None and lag_transforms is None:
        raise ValueError('At least one of `lags` or `lag_transforms` is required.')
    targets = [c for c in df.columns if c not in (id_col, time_col)]
    if not targets:
        # Fail early with a clear message instead of an IndexError further down.
        raise ValueError(
            '`df` must contain at least one exogenous column besides '
            f"'{id_col}' and '{time_col}'."
        )
    if lags is None:
        lags = []
    if lag_transforms is None:
        lag_transforms = {}
    tfms = _parse_transforms(lags, lag_transforms)
    # This is just a dummy target because process_df requires one.
    target_col = targets[0]
    validate_format(df, id_col, time_col, target_col)
    _, _, data, indptr, sort_idxs = process_df(df, id_col, time_col, target_col)
    results = {}
    cols = []
    for j, target in enumerate(targets):
        # NOTE(review): assumes column j of `data` holds the values of
        # targets[j] -- confirm against process_df's column ordering.
        ga = GroupedArray(data[:, j], indptr)
        named_tfms = {f'{target}_{k}': v for k, v in tfms.items()}
        # A single transform gains nothing from the multithreaded path.
        if num_threads == 1 or len(named_tfms) == 1:
            computed_tfms = ga.apply_transforms(transforms=named_tfms, updates_only=False)
        else:
            computed_tfms = ga.apply_multithreaded_transforms(
                transforms=named_tfms, num_threads=num_threads, updates_only=False
            )
        results.update(computed_tfms)
        # Record the requested column order; the output is selected by it below.
        cols.extend(named_tfms)
    if sort_idxs is not None:
        base_df = take_rows(df, sort_idxs)
    else:
        base_df = df
    base_df = drop_index_if_pandas(base_df)
    return horizontal_concat([base_df, type(df)(results)[cols]])
import numpy as np
import pandas as pd
from nbdev import show_doc
from window_ops.expanding import expanding_mean

from mlforecast.utils import generate_daily_series

设置

# Fixed seed so the random prices shown below are reproducible.
rng = np.random.RandomState(0)
series = generate_daily_series(100, equal_ends=True)
# First and last timestamp of every series. The aggregations are given by name
# because passing the `min`/`max` builtins to `agg` is deprecated in pandas.
starts_ends = series.groupby('unique_id', as_index=False)['ds'].agg(['min', 'max'])
prices = []
for r in starts_ends.itertuples():
    # Cover the series' own date range plus 14 extra days after its last date.
    dates = pd.date_range(r.min, r.max + 14 * pd.offsets.Day())
    df = pd.DataFrame({'ds': dates, 'price': rng.rand(dates.size)})
    df['unique_id'] = r.Index
    prices.append(df)
prices = pd.concat(prices)
# Second exogenous column, derived from the first one.
prices['price2'] = prices['price'] * rng.rand(prices.shape[0])
prices.head()
ds price unique_id price2
0 2000-10-05 0.548814 id_00 0.345011
1 2000-10-06 0.715189 id_00 0.445598
2 2000-10-07 0.602763 id_00 0.165147
3 2000-10-08 0.544883 id_00 0.041373
4 2000-10-09 0.423655 id_00 0.391577
# Render the API documentation of `transform_exog` with a level-2 title.
show_doc(transform_exog, title_level=2)

transform_exog

 transform_exog
                 (df:Union[pandas.core.frame.DataFrame,polars.dataframe.fr
                 ame.DataFrame], lags:Optional[Iterable[int]]=None, lag_tr
                 ansforms:Optional[Dict[int,List[Union[Callable,Tuple[Call
                 able,Any]]]]]=None, id_col:str='unique_id',
                 time_col:str='ds', num_threads:int=1)

Compute lag features for dynamic exogenous regressors.

Type Default Details
df Union Dataframe with ids, times and values for the exogenous regressors.
lags Optional None Lags of the target to use as features.
lag_transforms Optional None Mapping of target lags to their transformations.
id_col str unique_id Column that identifies each serie.
time_col str ds Column that identifies each timestep, its values can be timestamps or integers.
num_threads int 1 Number of threads to use when computing the features.
Returns Union Original DataFrame with the computed features
# Lags 1 and 2 of every exogenous column, plus an expanding mean over lag 1.
transformed = transform_exog(
    df=prices,
    lags=[1, 2],
    lag_transforms={1: [expanding_mean]},
)
transformed.head()
ds price unique_id price2 price_lag1 price_lag2 price_expanding_mean_lag1 price2_lag1 price2_lag2 price2_expanding_mean_lag1
0 2000-10-05 0.548814 id_00 0.345011 NaN NaN NaN NaN NaN NaN
1 2000-10-06 0.715189 id_00 0.445598 0.548814 NaN 0.548814 0.345011 NaN 0.345011
2 2000-10-07 0.602763 id_00 0.165147 0.715189 0.548814 0.632001 0.445598 0.345011 0.395304
3 2000-10-08 0.544883 id_00 0.041373 0.602763 0.715189 0.622255 0.165147 0.445598 0.318585
4 2000-10-09 0.423655 id_00 0.391577 0.544883 0.602763 0.602912 0.041373 0.165147 0.249282

::: {#dff16198-b8c3-4905-92d7-537710c87282 .cell 0='p' 1='o' 2='l' 3='a' 4='r' 5='s'}

import polars as pl

:::

::: {#00f3a7df-cbdd-4f11-a489-6eccf059154e .cell 0='p' 1='o' 2='l' 3='a' 4='r' 5='s'}

# Same features as the pandas example, computed on a polars frame with two threads.
prices_pl = pl.from_pandas(prices)
transformed_pl = transform_exog(
    df=prices_pl,
    lags=[1, 2],
    lag_transforms={1: [expanding_mean]},
    num_threads=2,
)
transformed_pl.head()
shape: (5, 10)
ds price unique_id price2 price_lag1 price_lag2 price_expanding_mean_lag1 price2_lag1 price2_lag2 price2_expanding_mean_lag1
datetime[ns] f64 str f64 f64 f64 f64 f64 f64 f64
2000-10-05 00:00:00 0.548814 "id_00" 0.345011 NaN NaN NaN NaN NaN NaN
2000-10-06 00:00:00 0.715189 "id_00" 0.445598 0.548814 NaN 0.548814 0.345011 NaN 0.345011
2000-10-07 00:00:00 0.602763 "id_00" 0.165147 0.715189 0.548814 0.632001 0.445598 0.345011 0.395304
2000-10-08 00:00:00 0.544883 "id_00" 0.041373 0.602763 0.715189 0.622255 0.165147 0.445598 0.318585
2000-10-09 00:00:00 0.423655 "id_00" 0.391577 0.544883 0.602763 0.602912 0.041373 0.165147 0.249282

:::

#| polars
# The polars result, converted to pandas, must equal the pandas result.
polars_as_pandas = transformed_pl.to_pandas()
pd.testing.assert_frame_equal(transformed, polars_as_pandas)
# hide
from mlforecast.lag_transforms import ExpandingMean

# The class-based `ExpandingMean` must yield the same features as the
# `expanding_mean` function used in the pandas example.
transformed_core = transform_exog(
    df=prices,
    lags=[1, 2],
    lag_transforms={1: [ExpandingMean()]},
)
pd.testing.assert_frame_equal(transformed, transformed_core)

Give us a ⭐ on GitHub