Evan Miller
09/06/2023, 10:45 PM
from utilsforecast import preprocessing as pr
import pandas as pd
import numpy as np
# Two weekly series sampled on Sundays: series 0 is observed on
# 2018-11-11, 11-18 and 12-02; series 1 on 2018-11-11 and 12-02 —
# so both have missing weeks to fill.
df = pd.DataFrame(
{
'unique_id': [0, 0, 0, 1, 1],
'ds': pd.to_datetime(['2018-11-11', '2018-11-18', '2018-12-02', '2018-11-11', '2018-12-02']),
'y': np.arange(5),
}
)
# Library call under discussion: fill each series' missing weekly steps.
# NOTE(review): the output below shows every timestamp shifted to
# Thursdays and all y values NaN — this is the bug being reported.
df1 = pr.fill_gaps(
df,
freq='W',
)
print(df)
print(df1)
##### OUTPUT ######
unique_id ds y
0 0 2018-11-11 0
1 0 2018-11-18 1
2 0 2018-12-02 2
3 1 2018-11-11 3
4 1 2018-12-02 4
unique_id ds y
0 0 2018-11-08 NaN
1 0 2018-11-15 NaN
2 0 2018-11-22 NaN
3 0 2018-11-29 NaN
4 1 2018-11-08 NaN
5 1 2018-11-15 NaN
6 1 2018-11-22 NaN
7 1 2018-11-29 NaN
This is happening because numpy's week unit ('W') buckets dates into weeks anchored on Thursday (the epoch, 1970-01-01, was a Thursday), so converting a date to `datetime64[...,'W']` snaps it back to the preceding Thursday:
np.datetime64(pd.to_datetime("2018-11-11"),'W')
##### OUTPUT ######
numpy.datetime64('2018-11-08')
If I comment out the last two lines of _determine_bound (the helper that fill_gaps uses to compute each series' start/end bounds), the problem is resolved:
def fill_gaps(
    df: pd.DataFrame,
    freq: str = 'W',
    start: str = "per_serie",
    end: str = "global",
    id_col: str = "unique_id",
    time_col: str = "ds",
) -> pd.DataFrame:
    """Reindex every series onto a complete time grid of step ``freq``.

    Rows that were missing in ``df`` appear in the result with NaN in the
    value columns.

    Parameters
    ----------
    df : frame with one row per (id, timestamp) observation.
    freq : numpy frequency unit string (e.g. ``'W'``) or a timedelta-like
        step when the axis is numeric.
    start : ``'per_serie'`` (each series' own first timestamp),
        ``'global'`` (earliest timestamp over all series), or an explicit
        timestamp applied to every series.
    end : same options as ``start``, for the last timestamp.
    id_col, time_col : names of the series-id and timestamp columns.

    Returns
    -------
    pd.DataFrame on the full (id, time) grid, index reset.

    Raises
    ------
    ValueError
        If a series' span is not a whole number of ``freq`` steps (e.g. a
        stray '2018-12-01' among Sunday-spaced weekly data) — previously
        this silently produced a misaligned grid or a cryptic index error.
    """
    delta = np.timedelta64(1, freq) if isinstance(freq, str) else freq
    times_by_id = df.groupby(id_col)[time_col].agg(["min", "max"])
    starts = _determine_bound(start, freq, times_by_id, "min")
    # end bound is inclusive, so extend by one step for the half-open arange
    ends = _determine_bound(end, freq, times_by_id, "max") + delta
    # Guard: every span must be an exact multiple of the step; otherwise
    # the per-series np.arange grids below would disagree with `sizes`
    # and reindex would silently drop the misaligned rows.
    exact_sizes = (ends - starts) / delta
    sizes = exact_sizes.astype(np.int64)
    if not np.array_equal(exact_sizes, sizes):
        raise ValueError(
            f"Series timestamps are not evenly spaced by freq={freq!r}; "
            "cannot build a regular time grid."
        )
    times = np.concatenate(
        [np.arange(first, last, delta) for first, last in zip(starts, ends)]
    )
    if isinstance(freq, str):
        # normalize whatever datetime64 unit arange produced to pandas' ns
        times = times.astype("datetime64[ns]", copy=False)
    uids = np.repeat(times_by_id.index, sizes)
    idx = pd.MultiIndex.from_arrays([uids, times], names=[id_col, time_col])
    return df.set_index([id_col, time_col]).reindex(idx).reset_index()
def _determine_bound(bound, freq, times_by_id, agg) -> np.ndarray:
if bound == "per_serie":
out = times_by_id[agg].values
else:
# the following return a scalar
if bound == "global":
val = getattr(times_by_id[agg].values, agg)()
if isinstance(freq, str):
val = np.datetime64(val)
else:
if isinstance(freq, str):
# this raises a nice error message if it isn't a valid datetime
val = np.datetime64(bound)
else:
val = bound
out = np.full(times_by_id.shape[0], val)
#if isinstance(freq, str):
#out = out.astype(f"datetime64[{freq}]")
return out
# Same toy frame as before, rerun against the locally patched fill_gaps.
df = pd.DataFrame(
{
'unique_id': [0, 0, 0, 1, 1],
'ds': pd.to_datetime(['2018-11-11', '2018-11-18', '2018-12-02', '2018-11-11', '2018-12-02']),
'y': np.arange(5),
}
)
# With the freq-bucket snapping disabled, the grid keeps the observed
# Sunday timestamps and only the genuinely missing weeks come out NaN
# (see output below).
df1 = fill_gaps(
df,
freq='W',
)
print(df)
print(df1)
##### OUTPUT #####
unique_id ds y
0 0 2018-11-11 0
1 0 2018-11-18 1
2 0 2018-12-02 2
3 1 2018-11-11 3
4 1 2018-12-02 4
unique_id ds y
0 0 2018-11-11 0.0
1 0 2018-11-18 1.0
2 0 2018-11-25 NaN
3 0 2018-12-02 2.0
4 1 2018-11-11 3.0
5 1 2018-11-18 NaN
6 1 2018-11-25 NaN
7 1 2018-12-02 4.0
However, with this change problems occur when the weekly bucketing of the data is not precise (i.e. if I accidentally put '2018-12-01' in, it would error).

José Morales
09/06/2023, 11:15 PM
pip install git+<https://github.com/nixtla/utilsforecast@irregular-freq>
I think that should fix the error. I'll add some more validations before merging.

Evan Miller
09/08/2023, 12:12 AM

José Morales
09/08/2023, 12:53 AM