```python
%load_ext autoreload
%autoreload 2
```
Utilities
```python
from math import ceil, log10

import numpy as np
import pandas as pd
from utilsforecast.compat import DataFrame, pl
from utilsforecast.data import generate_series

from fastcore.test import test_eq, test_fail
from nbdev import show_doc
```
```python
def generate_daily_series(
    n_series: int,
    min_length: int = 50,
    max_length: int = 500,
    n_static_features: int = 0,
    equal_ends: bool = False,
    static_as_categorical: bool = True,
    with_trend: bool = False,
    seed: int = 0,
    engine: str = 'pandas',
) -> DataFrame:
    """Generate Synthetic Panel Series.

    Parameters
    ----------
    n_series : int
        Number of series for synthetic panel.
    min_length : int (default=50)
        Minimum length of synthetic panel's series.
    max_length : int (default=500)
        Maximum length of synthetic panel's series.
    n_static_features : int (default=0)
        Number of static exogenous variables for synthetic panel's series.
    equal_ends : bool (default=False)
        Series should end in the same date stamp `ds`.
    static_as_categorical : bool (default=True)
        Static features should have a categorical data type.
    with_trend : bool (default=False)
        Series should have a (positive) trend.
    seed : int (default=0)
        Random seed used for generating the data.
    engine : str (default='pandas')
        Output Dataframe type.

    Returns
    -------
    pandas or polars DataFrame
        Synthetic panel with columns [`unique_id`, `ds`, `y`] and exogenous features.
    """
    series = generate_series(
        n_series=n_series,
        freq='D',
        min_length=min_length,
        max_length=max_length,
        n_static_features=n_static_features,
        equal_ends=equal_ends,
        static_as_categorical=static_as_categorical,
        with_trend=with_trend,
        seed=seed,
        engine=engine,
    )
    n_digits = ceil(log10(n_series))
    if engine == 'pandas':
        series['unique_id'] = (
            'id_' + series['unique_id'].astype(str).str.rjust(n_digits, '0')
        ).astype('category')
    else:
        try:
            series = series.with_columns(
                ('id_' + pl.col('unique_id').cast(pl.Utf8).str.pad_start(n_digits, '0'))
                .alias('unique_id')
                .cast(pl.Categorical)
            )
        except AttributeError:
            # older polars versions expose str.rjust instead of str.pad_start
            series = series.with_columns(
                ('id_' + pl.col('unique_id').cast(pl.Utf8).str.rjust(n_digits, '0'))
                .alias('unique_id')
                .cast(pl.Categorical)
            )
    return series
```
```python
show_doc(generate_daily_series)
```
generate_daily_series
generate_daily_series (n_series:int, min_length:int=50, max_length:int=500, n_static_features:int=0, equal_ends:bool=False, static_as_categorical:bool=True, with_trend:bool=False, seed:int=0, engine:str='pandas')
Generate Synthetic Panel Series.
|  | Type | Default | Details |
|---|---|---|---|
| n_series | int |  | Number of series for synthetic panel. |
| min_length | int | 50 | Minimum length of synthetic panel's series. |
| max_length | int | 500 | Maximum length of synthetic panel's series. |
| n_static_features | int | 0 | Number of static exogenous variables for synthetic panel's series. |
| equal_ends | bool | False | Series should end in the same date stamp `ds`. |
| static_as_categorical | bool | True | Static features should have a categorical data type. |
| with_trend | bool | False | Series should have a (positive) trend. |
| seed | int | 0 | Random seed used for generating the data. |
| engine | str | pandas | Output Dataframe type. |
| Returns | Union |  | Synthetic panel with columns [`unique_id`, `ds`, `y`] and exogenous features. |
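The examples below all use the default pandas engine. As a minimal sketch of the `engine` parameter, the same call can return a polars DataFrame by passing `engine='polars'` (assuming polars is installed; the variable name below is just illustrative):

```python
# Minimal sketch: request a polars DataFrame instead of pandas (requires polars).
polars_series = generate_daily_series(n_series=4, min_length=10, max_length=20, engine='polars')
polars_series.head()  # columns: unique_id, ds, y
```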
```python
import numpy as np

# generate 20 series with lengths between 100 and 1000
series = [np.random.rand(np.random.randint(100, 1001)) for _ in range(20)]

# inspect the lengths of the generated series
lengths = [len(s) for s in series]
lengths
```
```python
n_series = 20
min_length = 100
max_length = 1000

series = generate_daily_series(n_series, min_length, max_length)
series
```
|  | unique_id | ds | y |
|---|---|---|---|
| 0 | id_00 | 2000-01-01 | 0.395863 |
| 1 | id_00 | 2000-01-02 | 1.264447 |
| 2 | id_00 | 2000-01-03 | 2.284022 |
| 3 | id_00 | 2000-01-04 | 3.462798 |
| 4 | id_00 | 2000-01-05 | 4.035518 |
| ... | ... | ... | ... |
| 12446 | id_19 | 2002-03-11 | 0.309275 |
| 12447 | id_19 | 2002-03-12 | 1.189464 |
| 12448 | id_19 | 2002-03-13 | 2.325032 |
| 12449 | id_19 | 2002-03-14 | 3.333198 |
| 12450 | id_19 | 2002-03-15 | 4.306117 |

12451 rows × 3 columns
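As a quick sanity check (not part of the original example), every generated series should have a length between `min_length` and `max_length`:

```python
# Hedged sanity check: each series' length should fall within the requested bounds.
sizes = series.groupby('unique_id', observed=True).size()
assert sizes.between(min_length, max_length).all()
```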
We can also add static features to each series (these can be things like a product id or a store id). Only the first static feature (`static_0`) is related to the target.
```python
n_static_features = 2

series_with_statics = generate_daily_series(n_series, min_length, max_length, n_static_features)
series_with_statics
```
|  | unique_id | ds | y | static_0 | static_1 |
|---|---|---|---|---|---|
| 0 | id_00 | 2000-01-01 | 7.521388 | 18 | 10 |
| 1 | id_00 | 2000-01-02 | 24.024502 | 18 | 10 |
| 2 | id_00 | 2000-01-03 | 43.396423 | 18 | 10 |
| 3 | id_00 | 2000-01-04 | 65.793168 | 18 | 10 |
| 4 | id_00 | 2000-01-05 | 76.674843 | 18 | 10 |
| ... | ... | ... | ... | ... | ... |
| 12446 | id_19 | 2002-03-11 | 27.834771 | 89 | 42 |
| 12447 | id_19 | 2002-03-12 | 107.051746 | 89 | 42 |
| 12448 | id_19 | 2002-03-13 | 209.252845 | 89 | 42 |
| 12449 | id_19 | 2002-03-14 | 299.987801 | 89 | 42 |
| 12450 | id_19 | 2002-03-15 | 387.550536 | 89 | 42 |

12451 rows × 5 columns
```python
for i in range(n_static_features):
    assert all(series_with_statics.groupby('unique_id')[f'static_{i}'].nunique() == 1)
```
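Since `static_as_categorical=True` by default, the static columns should also come back with a categorical dtype. A small assumed check:

```python
# Assumed check: with static_as_categorical=True (the default), static features are categorical.
assert isinstance(series_with_statics['static_0'].dtype, pd.CategoricalDtype)
```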
If `equal_ends=False` (the default), then every series has a different end date.
```python
assert series_with_statics.groupby('unique_id')['ds'].max().nunique() > 1
```
We can have all of them end on the same date by specifying `equal_ends=True`.
```python
series_equal_ends = generate_daily_series(n_series, min_length, max_length, equal_ends=True)

assert series_equal_ends.groupby('unique_id')['ds'].max().nunique() == 1
```
```python
def generate_prices_for_series(series: pd.DataFrame, horizon: int = 7, seed: int = 0) -> pd.DataFrame:
    rng = np.random.RandomState(seed)
    unique_last_dates = series.groupby('unique_id', observed=True)['ds'].max().nunique()
    if unique_last_dates > 1:
        raise ValueError('series must have equal ends.')
    day_offset = pd.tseries.frequencies.Day()
    starts_ends = series.groupby('unique_id', observed=True)['ds'].agg(['min', 'max'])
    dfs = []
    for idx, (start, end) in starts_ends.iterrows():
        product_df = pd.DataFrame(
            {
                'unique_id': idx,
                'price': rng.rand((end - start).days + 1 + horizon),
            },
            index=pd.date_range(start, end + horizon * day_offset, name='ds'),
        )
        dfs.append(product_df)
    prices_catalog = pd.concat(dfs).reset_index()
    return prices_catalog
```
```python
series_for_prices = generate_daily_series(20, n_static_features=2, equal_ends=True)
series_for_prices.rename(columns={'static_1': 'product_id'}, inplace=True)
prices_catalog = generate_prices_for_series(series_for_prices, horizon=7)
prices_catalog
```
|  | ds | unique_id | price |
|---|---|---|---|
| 0 | 2000-10-05 | id_00 | 0.548814 |
| 1 | 2000-10-06 | id_00 | 0.715189 |
| 2 | 2000-10-07 | id_00 | 0.602763 |
| 3 | 2000-10-08 | id_00 | 0.544883 |
| 4 | 2000-10-09 | id_00 | 0.423655 |
| ... | ... | ... | ... |
| 5009 | 2001-05-17 | id_19 | 0.288027 |
| 5010 | 2001-05-18 | id_19 | 0.846305 |
| 5011 | 2001-05-19 | id_19 | 0.791284 |
| 5012 | 2001-05-20 | id_19 | 0.578636 |
| 5013 | 2001-05-21 | id_19 | 0.288589 |

5014 rows × 3 columns
```python
test_eq(set(prices_catalog['unique_id']), set(series_for_prices['unique_id']))
test_fail(lambda: generate_prices_for_series(series), contains='equal ends')
```
```python
class PredictionIntervals:
    """Class for storing prediction intervals metadata information."""

    def __init__(
        self,
        n_windows: int = 2,
        h: int = 1,
        method: str = 'conformal_distribution',
    ):
        if n_windows < 2:
            raise ValueError('You need at least two windows to compute conformal intervals')
        allowed_methods = ['conformal_error', 'conformal_distribution']
        if method not in allowed_methods:
            raise ValueError(f'method must be one of {allowed_methods}')
        self.n_windows = n_windows
        self.h = h
        self.method = method

    def __repr__(self):
        return f"PredictionIntervals(n_windows={self.n_windows}, h={self.h}, method='{self.method}')"
```
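A brief usage sketch of the validation this class enforces, using only the arguments defined above:

```python
# Valid configuration: at least two windows and one of the allowed conformal methods.
intervals = PredictionIntervals(n_windows=3, h=7, method='conformal_error')
print(intervals)  # PredictionIntervals(n_windows=3, h=7, method='conformal_error')

# Invalid configurations raise informative errors.
test_fail(lambda: PredictionIntervals(n_windows=1), contains='at least two windows')
test_fail(lambda: PredictionIntervals(method='bogus'), contains='must be one of')
```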
```python
def _ensure_shallow_copy(df: pd.DataFrame) -> pd.DataFrame:
    from packaging.version import Version

    if Version(pd.__version__) < Version("1.4"):
        # https://github.com/pandas-dev/pandas/pull/43406
        df = df.copy()
    return df
```
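A small, hypothetical illustration of the version guard: on pandas 1.4 or newer the frame comes back unchanged, while older versions get a real copy (see the linked pandas PR for the rationale); the variable names here are made up:

```python
# Hypothetical illustration: the helper is a no-op on recent pandas and a copy on pandas < 1.4.
df = pd.DataFrame({'y': [1.0, 2.0]})
same_or_copied = _ensure_shallow_copy(df)
assert same_or_copied.equals(df)
```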
```python
class _ShortSeriesException(Exception):
    def __init__(self, idxs):
        self.idxs = idxs
```
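A hypothetical sketch of how such an exception carries the offending series identifiers to the caller (the ids shown are made up):

```python
# Hypothetical sketch: raise with the ids of series that are too short, then catch and report them.
try:
    raise _ShortSeriesException(idxs=['id_03', 'id_07'])
except _ShortSeriesException as exc:
    print(f'Series too short to use: {exc.idxs}')
```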