%load_ext autoreload
%autoreload 2
特征工程
对外生回归变量计算变换
from typing import Optional

from utilsforecast.compat import DataFrame
from utilsforecast.processing import (
    drop_index_if_pandas,
    horizontal_concat,
    process_df,
    take_rows,
)
from utilsforecast.validation import validate_format

from mlforecast.core import _parse_transforms, Lags, LagTransforms
from mlforecast.grouped_array import GroupedArray
def transform_exog(
    df: DataFrame,
    lags: Optional[Lags] = None,
    lag_transforms: Optional[LagTransforms] = None,
    id_col: str = 'unique_id',
    time_col: str = 'ds',
    num_threads: int = 1,
) -> DataFrame:
    """Compute lag features for dynamic exogenous regressors.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Dataframe with ids, times and values for the exogenous regressors.
    lags : list of int, optional (default=None)
        Lags of the target to use as features.
    lag_transforms : dict of int to list of functions, optional (default=None)
        Mapping of target lags to their transformations.
    id_col : str (default='unique_id')
        Column that identifies each series.
    time_col : str (default='ds')
        Column that identifies each timestep, its values can be timestamps or integers.
    num_threads : int (default=1)
        Number of threads to use when computing the features.

    Returns
    -------
    pandas or polars DataFrame
        Original DataFrame with the computed features

    Raises
    ------
    ValueError
        If both `lags` and `lag_transforms` are None.
    """
    if lags is None and lag_transforms is None:
        raise ValueError('At least one of `lags` or `lag_transforms` is required.')
    if lags is None:
        lags = []
    if lag_transforms is None:
        lag_transforms = {}
    tfms = _parse_transforms(lags, lag_transforms)
    # Every column other than the id and time columns is treated as a regressor.
    targets = [c for c in df.columns if c not in (id_col, time_col)]
    # This is just a dummy target because process_df requires one.
    target_col = targets[0]
    validate_format(df, id_col, time_col, target_col)
    _, _, data, indptr, sort_idxs = process_df(df, id_col, time_col, target_col)
    results = {}
    cols = []
    for j, target in enumerate(targets):
        # Column j of `data` is paired with the j-th regressor name.
        ga = GroupedArray(data[:, j], indptr)
        # Prefix each transform name with the regressor it is computed from.
        named_tfms = {f'{target}_{k}': v for k, v in tfms.items()}
        if num_threads == 1 or len(named_tfms) == 1:
            computed_tfms = ga.apply_transforms(
                transforms=named_tfms, updates_only=False
            )
        else:
            computed_tfms = ga.apply_multithreaded_transforms(
                transforms=named_tfms, num_threads=num_threads, updates_only=False
            )
        results.update(computed_tfms)
        cols.extend(named_tfms)
    # When process_df returns sort indices, reorder the input rows with them
    # before the computed features are attached.
    if sort_idxs is not None:
        base_df = take_rows(df, sort_idxs)
    else:
        base_df = df
    base_df = drop_index_if_pandas(base_df)
    # Build the feature frame with the same dataframe type as the input and
    # select the columns in the order they were generated.
    return horizontal_concat([base_df, type(df)(results)[cols]])
import numpy as np
import pandas as pd
from nbdev import show_doc
from window_ops.expanding import expanding_mean
from mlforecast.utils import generate_daily_series
设置
# Build a synthetic exogenous dataset: for every series, one random `price`
# per day from the series' first date to 14 days past its last date.
rng = np.random.RandomState(0)
series = generate_daily_series(100, equal_ends=True)
# String aggregation names are used instead of the builtins `min`/`max`,
# which pandas deprecates as `agg` arguments; the resulting columns are
# still called `min` and `max`.
starts_ends = series.groupby('unique_id', as_index=False)['ds'].agg(['min', 'max'])
prices = []
for r in starts_ends.itertuples():
    dates = pd.date_range(r.min, r.max + 14 * pd.offsets.Day())
    df = pd.DataFrame({'ds': dates, 'price': rng.rand(dates.size)})
    df['unique_id'] = r.Index
    prices.append(df)
prices = pd.concat(prices)
# Second regressor: the first one scaled by an independent random factor.
prices['price2'] = prices['price'] * rng.rand(prices.shape[0])
prices.head()
| | ds | price | unique_id | price2 |
---|---|---|---|---|
0 | 2000-10-05 | 0.548814 | id_00 | 0.345011 |
1 | 2000-10-06 | 0.715189 | id_00 | 0.445598 |
2 | 2000-10-07 | 0.602763 | id_00 | 0.165147 |
3 | 2000-10-08 | 0.544883 | id_00 | 0.041373 |
4 | 2000-10-09 | 0.423655 | id_00 | 0.391577 |
# Render the API documentation for transform_exog as a level-2 heading.
show_doc(transform_exog, title_level=2)
transform_exog
transform_exog (df:Union[pandas.core.frame.DataFrame,polars.dataframe.frame.DataFrame], lags:Optional[Iterable[int]]=None, lag_transforms:Optional[Dict[int,List[Union[Callable,Tuple[Callable,Any]]]]]=None, id_col:str='unique_id', time_col:str='ds', num_threads:int=1)
Compute lag features for dynamic exogenous regressors.
| | Type | Default | Details |
---|---|---|---|
df | Union | | Dataframe with ids, times and values for the exogenous regressors. |
lags | Optional | None | Lags of the target to use as features. |
lag_transforms | Optional | None | Mapping of target lags to their transformations. |
id_col | str | unique_id | Column that identifies each serie. |
time_col | str | ds | Column that identifies each timestep, its values can be timestamps or integers. |
num_threads | int | 1 | Number of threads to use when computing the features. |
Returns | Union | | Original DataFrame with the computed features |
# Lags 1 and 2 plus an expanding mean over lag 1, computed for every
# regressor column of the pandas frame.
transformed = transform_exog(
    prices,
    lags=[1, 2],
    lag_transforms={1: [expanding_mean]}
)
transformed.head()
| | ds | price | unique_id | price2 | price_lag1 | price_lag2 | price_expanding_mean_lag1 | price2_lag1 | price2_lag2 | price2_expanding_mean_lag1 |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2000-10-05 | 0.548814 | id_00 | 0.345011 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2000-10-06 | 0.715189 | id_00 | 0.445598 | 0.548814 | NaN | 0.548814 | 0.345011 | NaN | 0.345011 |
2 | 2000-10-07 | 0.602763 | id_00 | 0.165147 | 0.715189 | 0.548814 | 0.632001 | 0.445598 | 0.345011 | 0.395304 |
3 | 2000-10-08 | 0.544883 | id_00 | 0.041373 | 0.602763 | 0.715189 | 0.622255 | 0.165147 | 0.445598 | 0.318585 |
4 | 2000-10-09 | 0.423655 | id_00 | 0.391577 | 0.544883 | 0.602763 | 0.602912 | 0.041373 | 0.165147 | 0.249282 |
::: {#dff16198-b8c3-4905-92d7-537710c87282 .cell 0='p' 1='o' 2='l' 3='a' 4='r' 5='s'}
import polars as pl
:::
::: {#00f3a7df-cbdd-4f11-a489-6eccf059154e .cell 0='p' 1='o' 2='l' 3='a' 4='r' 5='s'}
# Same features as the pandas example, computed on a polars frame and
# using the multithreaded path (num_threads=2).
prices_pl = pl.from_pandas(prices)
transformed_pl = transform_exog(
    prices_pl,
    lags=[1, 2],
    lag_transforms={1: [expanding_mean]},
    num_threads=2,
)
transformed_pl.head()
shape: (5, 10)
ds | price | unique_id | price2 | price_lag1 | price_lag2 | price_expanding_mean_lag1 | price2_lag1 | price2_lag2 | price2_expanding_mean_lag1 |
---|---|---|---|---|---|---|---|---|---|
datetime[ns] | f64 | str | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
2000-10-05 00:00:00 | 0.548814 | "id_00" | 0.345011 | NaN | NaN | NaN | NaN | NaN | NaN |
2000-10-06 00:00:00 | 0.715189 | "id_00" | 0.445598 | 0.548814 | NaN | 0.548814 | 0.345011 | NaN | 0.345011 |
2000-10-07 00:00:00 | 0.602763 | "id_00" | 0.165147 | 0.715189 | 0.548814 | 0.632001 | 0.445598 | 0.345011 | 0.395304 |
2000-10-08 00:00:00 | 0.544883 | "id_00" | 0.041373 | 0.602763 | 0.715189 | 0.622255 | 0.165147 | 0.445598 | 0.318585 |
2000-10-09 00:00:00 | 0.423655 | "id_00" | 0.391577 | 0.544883 | 0.602763 | 0.602912 | 0.041373 | 0.165147 | 0.249282 |
:::
#| polars
# The polars result, converted to pandas, must equal the pandas result.
pd.testing.assert_frame_equal(transformed, transformed_pl.to_pandas())
#| hide
from mlforecast.lag_transforms import ExpandingMean

# The built-in ExpandingMean transform must give the same features as the
# window_ops expanding_mean function used above.
transformed_core = transform_exog(
    prices,
    lags=[1, 2],
    lag_transforms={1: [ExpandingMean()]}
)
pd.testing.assert_frame_equal(transformed, transformed_core)
Give us a ⭐ on Github