工具箱

%load_ext autoreload
%autoreload 2
from math import ceil, log10

import numpy as np
import pandas as pd

from utilsforecast.compat import DataFrame, pl
from utilsforecast.data import generate_series
from fastcore.test import test_eq, test_fail
from nbdev import show_doc
def generate_daily_series(
    n_series: int,
    min_length: int = 50,
    max_length: int = 500,
    n_static_features: int = 0,
    equal_ends: bool = False,
    static_as_categorical: bool = True,
    with_trend: bool = False,
    seed: int = 0,
    engine: str = 'pandas',
) -> DataFrame:
    """Build a synthetic panel of daily series with `id_`-prefixed identifiers.

    Thin wrapper around `generate_series` that fixes the frequency to daily
    (`freq='D'`) and rewrites `unique_id` as a zero-padded, categorical
    string such as `id_00`.

    Parameters
    ----------
    n_series : int
        Number of series in the panel.
    min_length : int (default=50)
        Minimum length of each series.
    max_length : int (default=500)
        Maximum length of each series.
    n_static_features : int (default=0)
        Number of static exogenous features attached to each series.
    equal_ends : bool (default=False)
        Whether all series should end on the same `ds`.
    static_as_categorical : bool (default=True)
        Whether static features should use a categorical data type.
    with_trend : bool (default=False)
        Whether the series should have a (positive) trend.
    seed : int (default=0)
        Random seed forwarded to the generator.
    engine : str (default='pandas')
        Output dataframe type; any value other than 'pandas' takes the
        polars code path.

    Returns
    -------
    pandas or polars DataFrame
        Panel with columns [`unique_id`, `ds`, `y`] plus the static features.
    """
    panel = generate_series(
        n_series=n_series,
        freq='D',
        min_length=min_length,
        max_length=max_length,
        n_static_features=n_static_features,
        equal_ends=equal_ends,
        static_as_categorical=static_as_categorical,
        with_trend=with_trend,
        seed=seed,
        engine=engine,
    )
    # Width of the zero-padded numeric suffix, e.g. 20 series -> 2 digits.
    pad_width = ceil(log10(n_series))
    prefix = 'id_'

    if engine == 'pandas':
        padded_ids = panel['unique_id'].astype(str).str.rjust(pad_width, '0')
        panel['unique_id'] = (prefix + padded_ids).astype('category')
        return panel

    ids_as_text = pl.col('unique_id').cast(pl.Utf8)
    try:
        padded_expr = ids_as_text.str.pad_start(pad_width, '0')
    except AttributeError:
        # Fall back to `rjust` when the string namespace has no `pad_start`.
        padded_expr = ids_as_text.str.rjust(pad_width, '0')
    return panel.with_columns(
        (prefix + padded_expr).alias('unique_id').cast(pl.Categorical)
    )
# Render the API documentation of `generate_daily_series`.
show_doc(generate_daily_series)

source

generate_daily_series

 generate_daily_series (n_series:int, min_length:int=50,
                        max_length:int=500, n_static_features:int=0,
                        equal_ends:bool=False,
                        static_as_categorical:bool=True,
                        with_trend:bool=False, seed:int=0,
                        engine:str='pandas')

Generate Synthetic Panel Series.

Type Default Details
n_series int Number of series for synthetic panel.
min_length int 50 Minimum length of synthetic panel’s series.
max_length int 500 Maximum length of synthetic panel’s series.
n_static_features int 0 Number of static exogenous variables for synthetic panel’s series.
equal_ends bool False Series should end in the same date stamp ds.
static_as_categorical bool True Static features should have a categorical data type.
with_trend bool False Series should have a (positive) trend.
seed int 0 Random seed used for generating the data.
engine str pandas Output Dataframe type.
Returns Union Synthetic panel with columns [unique_id, ds, y] and exogenous features.
import numpy as np

# Generate 20 series with lengths between 100 and 1000
series = [np.random.rand(np.random.randint(100, 1001)) for _ in range(20)]

# Output the lengths of the generated series
lengths = [len(s) for s in series]
lengths
# Panel configuration: number of series and the bounds for their lengths.
n_series = 20
min_length = 100
max_length = 1000

# Build the synthetic daily panel and display it.
series = generate_daily_series(n_series, min_length, max_length)
series
unique_id ds y
0 id_00 2000-01-01 0.395863
1 id_00 2000-01-02 1.264447
2 id_00 2000-01-03 2.284022
3 id_00 2000-01-04 3.462798
4 id_00 2000-01-05 4.035518
... ... ... ...
12446 id_19 2002-03-11 0.309275
12447 id_19 2002-03-12 1.189464
12448 id_19 2002-03-13 2.325032
12449 id_19 2002-03-14 3.333198
12450 id_19 2002-03-15 4.306117

12451 rows × 3 columns

我们还可以为每个序列添加静态特征(例如产品 ID 或商店 ID)。只有第一个静态特征(`static_0`)与目标相关。

# Number of static features to attach to every series.
n_static_features = 2

# Same panel configuration as before, now with static features; display the result.
series_with_statics = generate_daily_series(n_series, min_length, max_length, n_static_features)
series_with_statics
unique_id ds y static_0 static_1
0 id_00 2000-01-01 7.521388 18 10
1 id_00 2000-01-02 24.024502 18 10
2 id_00 2000-01-03 43.396423 18 10
3 id_00 2000-01-04 65.793168 18 10
4 id_00 2000-01-05 76.674843 18 10
... ... ... ... ... ...
12446 id_19 2002-03-11 27.834771 89 42
12447 id_19 2002-03-12 107.051746 89 42
12448 id_19 2002-03-13 209.252845 89 42
12449 id_19 2002-03-14 299.987801 89 42
12450 id_19 2002-03-15 387.550536 89 42

12451 rows × 5 columns

# Every static feature must take exactly one distinct value within each series.
for i in range(n_static_features):
    assert all(series_with_statics.groupby('unique_id')[f'static_{i}'].nunique() == 1)

如果 `equal_ends=False`(默认值),各序列的结束日期不一定相同。

# The panel was generated with the default `equal_ends=False`, so the series'
# last dates must not all coincide.
assert series_with_statics.groupby('unique_id')['ds'].max().nunique() > 1

我们可以通过指定 `equal_ends=True`,使所有序列都在同一日期结束。

# Regenerate the panel forcing a common last date for all series.
series_equal_ends = generate_daily_series(n_series, min_length, max_length, equal_ends=True)

# All series must now share a single last date.
assert series_equal_ends.groupby('unique_id')['ds'].max().nunique() == 1
def generate_prices_for_series(series: pd.DataFrame, horizon: int = 7, seed: int = 0) -> pd.DataFrame:
    """Generate a random daily price catalog for every series in a panel.

    For each `unique_id` the catalog spans from the first date of that series
    up to `horizon` days after its last date.

    Parameters
    ----------
    series : pandas DataFrame
        Panel with at least the columns `unique_id` and `ds`. All series must
        end on the same date.
    horizon : int (default=7)
        Number of days after the last date of each series to generate prices for.
    seed : int (default=0)
        Seed for the random number generator.

    Returns
    -------
    pandas DataFrame
        One row per series and day, with columns [`ds`, `unique_id`, `price`].

    Raises
    ------
    ValueError
        If the series do not all end on the same date.
    """
    rng = np.random.RandomState(seed)
    # A single aggregation yields both the date span of every series and the
    # last dates needed for the equal-ends validation.
    starts_ends = series.groupby('unique_id', observed=True)['ds'].agg(['min', 'max'])
    if starts_ends['max'].nunique() > 1:
        raise ValueError('series must have equal ends.')
    horizon_offset = pd.offsets.Day(horizon)
    dfs = []
    for uid, start, end in starts_ends.itertuples(name=None):
        dates = pd.date_range(start, end + horizon_offset, name='ds')
        # Size the sample from the index itself so both always agree.
        product_df = pd.DataFrame(
            {'unique_id': uid, 'price': rng.rand(len(dates))},
            index=dates,
        )
        dfs.append(product_df)
    return pd.concat(dfs).reset_index()
# The price generator requires series sharing the same last date, hence equal_ends=True.
series_for_prices = generate_daily_series(20, n_static_features=2, equal_ends=True)
# Rename the second static feature in place to `product_id`.
series_for_prices.rename(columns={'static_1': 'product_id'}, inplace=True)
# Price catalog covering every series plus 7 extra days; display it.
prices_catalog = generate_prices_for_series(series_for_prices, horizon=7)
prices_catalog
ds unique_id price
0 2000-10-05 id_00 0.548814
1 2000-10-06 id_00 0.715189
2 2000-10-07 id_00 0.602763
3 2000-10-08 id_00 0.544883
4 2000-10-09 id_00 0.423655
... ... ... ...
5009 2001-05-17 id_19 0.288027
5010 2001-05-18 id_19 0.846305
5011 2001-05-19 id_19 0.791284
5012 2001-05-20 id_19 0.578636
5013 2001-05-21 id_19 0.288589

5014 rows × 3 columns

# The catalog must contain exactly the ids present in the input panel.
test_eq(set(prices_catalog['unique_id']), set(series_for_prices['unique_id']))
# `series` was generated without equal ends, so the call must fail with the matching message.
test_fail(lambda: generate_prices_for_series(series), contains='equal ends')
class PredictionIntervals:
    """Class for storing prediction intervals metadata information."""

    def __init__(
        self,
        n_windows: int = 2,
        h: int = 1,
        method: str = 'conformal_distribution',
    ):
        """Validate and store the prediction-interval settings.

        Parameters
        ----------
        n_windows : int (default=2)
            Number of windows; must be at least 2.
        h : int (default=1)
            Stored as-is; no validation is applied.
        method : str (default='conformal_distribution')
            Either 'conformal_error' or 'conformal_distribution'.

        Raises
        ------
        ValueError
            If `n_windows` is smaller than 2 or `method` is not allowed.
        """
        allowed_methods = ['conformal_error', 'conformal_distribution']
        if n_windows < 2:
            raise ValueError('You need at least two windows to compute conformal intervals')
        if method not in allowed_methods:
            raise ValueError('method must be one of {}'.format(allowed_methods))
        self.n_windows, self.h, self.method = n_windows, h, method

    def __repr__(self):
        """Return a constructor-like representation of the stored settings."""
        settings = f"n_windows={self.n_windows}, h={self.h}, method='{self.method}'"
        return f"PredictionIntervals({settings})"
def _ensure_shallow_copy(df: pd.DataFrame) -> pd.DataFrame:
    from packaging.version import Version
    
    if Version(pd.__version__) < Version("1.4"):
        # https://github.com/pandas-dev/pandas/pull/43406
        df = df.copy()
    return df
class _ShortSeriesException(Exception):
    def __init__(self, idxs):
        self.idxs = idxs

Give us a ⭐ on Github