Slackbot
09/06/2023, 1:50 PMJosé Morales
09/06/2023, 3:07 PMJosé Morales
09/06/2023, 7:03 PMIsaac
09/06/2023, 7:24 PMJosé Morales
09/12/2023, 5:37 PMIsaac
09/12/2023, 6:55 PMIsaac
09/12/2023, 6:56 PMJosé Morales
09/12/2023, 7:31 PMIsaac
09/12/2023, 7:52 PMIsaac
10/03/2023, 7:05 PMforecast_fitted_values
and still having some trouble. If I fit MLForecast with `dropna=True`, I get the error "Found different number of groups in fitted differences." because forecast_fitted_values will have groups that were dropped in training but should be used; but if I use `dropna=False`, I'm unable to fit, since some of the models can't handle NA values. What do you recommend I do?
José Morales
10/03/2023, 7:07 PMIsaac
10/03/2023, 7:09 PMIsaac
10/03/2023, 7:09 PMJosé Morales
10/03/2023, 7:14 PMIsaac
10/03/2023, 7:17 PMJosé Morales
10/03/2023, 7:18 PMIsaac
10/03/2023, 7:19 PMimport matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression
from mlforecast import MLForecast
from mlforecast.utils import generate_daily_series
from mlforecast.target_transforms import Differences, LocalStandardScaler
from mlforecast.target_transforms import GlobalSklearnTransformer
from window_ops.rolling import rolling_mean
horizon = 10

# Create a basic series: a synthetic daily panel (generated with equal_ends=True).
df = generate_daily_series(n_series=10, min_length=10, max_length=25, equal_ends=True)
# First and last timestamp per series (only shown when run as a notebook cell).
df.groupby(['unique_id'])['ds'].agg(['min', 'max'])

# Hold out the last `horizon` days.
# NOTE(review): the fits below are called on `df`, not on `train_df` — confirm
# that this is intended for the reproduction.
test_end = df['ds'].max()
train_end = test_end - pd.Timedelta(days=horizon)
train_df = df.query("ds <= @train_end")

# Linear regression on lags 1-6, rolling means (windows 3 and 4) of each of
# those lags, first differences of the target, and calendar features.
fcst = MLForecast(
    models={'lr': LinearRegression()},
    freq='D',
    lags=range(1, 7),
    lag_transforms={
        lag: [(rolling_mean, 3), (rolling_mean, 4)]
        for lag in range(1, 7)
    },
    target_transforms=[Differences([1])],
    date_features=['day', 'dayofweek', 'week', 'month', 'quarter', 'year'],
)

# THIS WILL FAIL WHEN FITTING THE FORECASTED VALUE
fcst.fit(df, fitted=True, dropna=True)

# THIS WILL FAIL WHEN TRAINING
fcst.fit(df, fitted=True, dropna=False)
Isaac
10/03/2023, 7:21 PMforecast_fitted_values
for `aggregate` in hierarchicalforecast.
José Morales
10/03/2023, 7:26 PMJosé Morales
10/03/2023, 7:27 PMfrom sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
lr = make_pipeline(SimpleImputer(strategy='constant', fill_value=0), LinearRegression())
fcst = MLForecast(
models={'lr': lr},
...
Isaac
10/03/2023, 7:32 PMy
.
# Wrap the regressor so missing feature values are replaced with the constant 0
# before they reach LinearRegression; this lets the fit run with dropna=False.
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)
lr = make_pipeline(zero_imputer, LinearRegression())

fcst = MLForecast(
    models={'lr': lr},
    freq='D',
    lags=range(1, 7),
    lag_transforms={
        lag: [(rolling_mean, 3), (rolling_mean, 4)]
        for lag in range(1, 7)
    },
    target_transforms=[Differences([1])],
    date_features=['day', 'dayofweek', 'week', 'month', 'quarter', 'year'],
)

# Keep the rows with missing lag features and also store the fitted values.
fcst.fit(df, fitted=True, dropna=False)
José Morales
10/03/2023, 7:35 PMJosé Morales
10/03/2023, 7:37 PMJosé Morales
10/03/2023, 7:37 PMMLForecast.ts.predict
but you can also override the `MLForecast.models_` attribute with a dict from str to model.
José Morales
10/03/2023, 7:41 PMIsaac
10/03/2023, 7:42 PMaggregate
filling in the missing series with the fitted values and the predicted values from `statsforecast.HistoricAverage`? It may mess with reconciliation, but since we have so little data about the missing series, it may help nevertheless.
José Morales
10/03/2023, 7:44 PMIsaac
10/03/2023, 7:49 PMJosé Morales
10/03/2023, 7:53 PMIsaac
10/03/2023, 8:04 PMJosé Morales
10/03/2023, 8:27 PMIsaac
10/04/2023, 2:17 PMJosé Morales
10/04/2023, 6:53 PMJosé Morales
10/04/2023, 7:15 PMIsaac
10/04/2023, 8:15 PMIsaac
10/04/2023, 8:16 PMJosé Morales
10/04/2023, 10:11 PMIsaac
10/05/2023, 1:33 PMaggregate
. Would it help if I added a GitHub issue proposing that?
Isaac
10/05/2023, 1:59 PMIsaac
10/05/2023, 3:28 PMfrom sklearn.preprocessing import OneHotEncoder
from scipy import sparse
from typing import Callable, Dict, List, Optional, Iterable
def aggregate2(
    df: pd.DataFrame,
    spec: List[List[str]],
    is_balanced: bool = False,
    sparse_s: bool = False,
):
    """Utils Aggregation Function.

    Aggregates bottom level series contained in the pandas DataFrame `df` according
    to levels defined in the `spec` list. In addition to the aggregated series, the
    returned `Y_df` carries, for every column mentioned in `spec`:

    * the column itself (NaN on rows of levels that do not include it),
    * one 0/1 dummy column per distinct value, named `<column>_<value>`,
    * an `agg_<column>` 0/1 flag that is 1 where the column is NaN, i.e. where
      the row is aggregated over that column.

    Parameters
    ----------
    df : pandas DataFrame
        Dataframe with columns `['ds', 'y']` and columns to aggregate.
    spec : list of list of str
        List of levels. Each element of the list should contain a list of columns of `df` to aggregate.
        Levels are processed from shortest to longest; the longest one is
        treated as the bottom level.
    is_balanced : bool (default=False)
        Deprecated.
    sparse_s : bool (default=False)
        Return `S_df` as a sparse dataframe.

    Returns
    -------
    Y_df : pandas DataFrame
        Hierarchically structured series, indexed by `unique_id`, with columns
        `['ds', 'y']` followed by the level columns, their dummies and the
        `agg_*` flags.
    S_df : pandas DataFrame
        Summing dataframe.
    tags : dict
        Aggregation indices.

    Raises
    ------
    ValueError
        If `df` contains null values.
    """
    # Imported locally so the function does not rely on a module-level
    # `import warnings` being present.
    import warnings

    # Checks
    if df.isnull().values.any():
        raise ValueError('`df` contains null values')
    if is_balanced:
        warnings.warn(
            "`is_balanced` is deprecated and will be removed in a future version. "
            "Don't set this argument to suppress this warning.",
            category=DeprecationWarning,
        )

    spec = sorted(spec, key=len)
    # Every column used by any level, de-duplicated in order of first appearance.
    all_spec = list(dict.fromkeys(col for levels in spec for col in levels))
    all_dummies = [f'{col}_{value}' for col in all_spec for value in df[col].unique()]
    all_agg = [f'agg_{col}' for col in all_spec]
    bottom = spec[-1]

    aggs = []
    tags = {}
    for levels in spec:
        agg = df.groupby(levels + ['ds'])['y'].sum().reset_index('ds')
        group = agg.index.get_level_values(0)
        agg[levels[0]] = agg.index.get_level_values(levels[0]).values
        for j, level in enumerate(levels):
            if j > 0:
                # '/' is the id separator, so it is replaced inside the values
                # of every level after the first.
                group = group + '/' + agg.index.get_level_values(level).str.replace('/', '_')
                agg[level] = agg.index.get_level_values(level).values
            agg = pd.concat([agg, pd.get_dummies(agg[level], prefix=level, dtype=int)], axis=1)
        agg.index = group
        agg.index.name = 'unique_id'
        tags['/'.join(levels)] = group.unique().values

        # Columns that are not part of this level are missing: mark them as NaN
        # and flag the row as aggregated over them.
        for col in all_spec:
            if col not in agg:
                agg[col] = np.nan  # `np.NaN` was removed in NumPy 2.0
            agg[f'agg_{col}'] = agg[col].isna().astype(int)
        # Dummies that this level did not produce are filled with 0 so every
        # level ends up with the same set of columns.
        for dummy in all_dummies:
            if dummy not in agg:
                agg[dummy] = 0
        aggs.append(agg)

    Y_df = pd.concat(aggs)[['ds', 'y'] + all_spec + all_dummies + all_agg]

    # construct S
    bottom_key = '/'.join(bottom)
    bottom_levels = tags[bottom_key]
    # One column per level; each row holds the ids a bottom series belongs to.
    S = np.empty((len(bottom_levels), len(spec)), dtype=object)
    for j, levels in enumerate(spec[:-1]):
        S[:, j] = _to_upper_hierarchy(bottom, bottom_levels, '/'.join(levels))
    S[:, -1] = tags[bottom_key]

    categories = list(tags.values())
    try:
        encoder = OneHotEncoder(categories=categories, sparse_output=sparse_s, dtype=np.float32)
    except TypeError:  # sklearn < 1.2
        encoder = OneHotEncoder(categories=categories, sparse=sparse_s, dtype=np.float32)
    S = encoder.fit_transform(S).T

    if sparse_s:
        df_constructor = pd.DataFrame.sparse.from_spmatrix
    else:
        df_constructor = pd.DataFrame
    S_df = df_constructor(S, index=np.hstack(categories), columns=bottom_levels)
    return Y_df, S_df, tags
José Morales
10/05/2023, 3:30 PMIsaac
10/05/2023, 3:30 PMJosé Morales
10/05/2023, 3:38 PMY_df, S_df, tags = aggregate(df, spec)
# Turn the index into a regular column (presumably `unique_id`, which is read next).
Y_df = Y_df.reset_index()
# Split the '/'-separated id into one column per hierarchy level.
id_parts = Y_df['unique_id'].str.split('/', n=3, expand=True)
Y_df[['country', 'cat1', 'cat2']] = id_parts
# Ids with fewer than three parts leave cat1/cat2 missing after the split;
# fill those gaps with the column's own name.
Y_df = Y_df.fillna({'cat1': 'cat1', 'cat2': 'cat2'})
You could then use these directly in LightGBM, or use `pd.get_dummies` or a one-hot encoder.
José Morales
10/05/2023, 3:39 PMIsaac
10/05/2023, 3:43 PMIsaac
10/05/2023, 3:52 PM