import pandas as pd
from nixtlats import TimeGPT
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import classification_report,confusion_matrix
# TimeGPT API client. The original author's note says the token defaults to
# os.environ.get("TIMEGPT_TOKEN") when it is not passed explicitly.
timegpt = TimeGPT(token='my-token')

# Read the raw dataset from CSV into the working frame.
data_path = "-----"
df = pd.read_csv(data_path)
df.shape
def convert_to_datetime(date_string):
    """Parse an ISO-8601 timestamp string into a ``datetime``.

    A ``Z`` UTC designator is rewritten to ``+00:00`` before parsing, because
    ``datetime.fromisoformat`` does not accept ``Z`` on older Python versions.

    Args:
        date_string: Timestamp text such as ``"2024-01-01T12:00:00Z"``. A value
            that is already a ``datetime`` is returned unchanged.

    Returns:
        The parsed ``datetime``, or ``None`` when the value is missing, is not
        a string, or is not valid ISO-8601 text.
    """
    # Imported here because the module's import block does not provide it.
    from datetime import datetime

    if isinstance(date_string, datetime):
        return date_string
    # Missing cells (None / NaN) are not strings and have no str.replace().
    if not isinstance(date_string, str):
        return None
    try:
        return datetime.fromisoformat(date_string.replace("Z", "+00:00"))
    except ValueError:
        return None
# Parse the raw timestamp strings; values that cannot be parsed come back as
# None from convert_to_datetime.
df['_time'] = df['_time'].apply(convert_to_datetime)
# Strip the timezone so the rest of the script works with naive timestamps.
# (This statement must stay on one line: a trailing "." followed by a newline
# is a syntax error.)
df['_time'] = df['_time'].dt.tz_localize(None)
df.head()

# Keep a handle on the full frame; `df` is narrowed to the modelling columns.
pm_df = df
column_names = ['Q_VFD1_Temperature', 'Description']
# .copy() so the columns added below are written to an independent frame
# rather than to a slice of pm_df (avoids SettingWithCopyWarning).
df = df[column_names].copy()

nltk.download('punkt')
# Tokenise each description; missing descriptions become an empty token list.
df['Tokenized_Description'] = df['Description'].apply(
    lambda x: word_tokenize(str(x)) if pd.notnull(x) else []
)
result = df[[column_names[0], 'Tokenized_Description']].values.tolist()

# Collect the vocabulary. Rows without tokens contribute a None marker.
all_tokens = set()
for row_tokens in df['Tokenized_Description']:
    if row_tokens:
        all_tokens.update(row_tokens)
    else:
        all_tokens.add(None)
# List of all different tokens
different_tokens = list(all_tokens)
different_tokens

# Hand-assigned numeric code for each known token.
tokens = {'NoBody1': 2, 'E_STOPPED': 3, ',': 0, 'NoNose': 1, 'None': 0, 'NoBody2': 2}
def replace_with_numeric(tokens_dict, tokens_list):
    """Map each token to its numeric code.

    Args:
        tokens_dict: Mapping from token to numeric code.
        tokens_list: Iterable of tokens to translate.

    Returns:
        A list the same length as ``tokens_list`` holding the code for each
        token, or ``None`` for tokens that are not in ``tokens_dict``.
    """
    # dict.get returns None for unknown tokens, replacing the test-then-index.
    return [tokens_dict.get(token) for token in tokens_list]
def _encode_tokens(token_list):
    """Translate one row's token list using the module-level ``tokens`` map."""
    return replace_with_numeric(tokens, token_list)


# Numeric encoding of every row's token list.
df['Tokenized_Description1'] = df['Tokenized_Description'].apply(_encode_tokens)
def calculate_average(row):
    """Return the mean of the non-``None`` entries in ``row``.

    Args:
        row: Iterable of numeric values, possibly interleaved with ``None``.

    Returns:
        The arithmetic mean of the numeric entries, or ``0`` when there are
        none (empty input or all ``None``).
    """
    numeric_values = [value for value in row if value is not None]
    # Guard against division by zero when nothing numeric is left.
    if not numeric_values:
        return 0
    return sum(numeric_values) / len(numeric_values)
# Collapse each row's numeric token codes into a single average value.
df['Average'] = df['Tokenized_Description1'].apply(calculate_average)

# Feature frame: the temperature column plus the averaged code, with the
# average exposed under the name "Description".
features_df = df[[column_names[0], 'Average']].rename(columns={"Average": "Description"})
features_df

# Overwrite the text descriptions on the full frame with their numeric form.
pm_df['Description'] = features_df['Description']
"""#splitting data"""
# Positional 80/20 split: the first 80% of rows train, the rest validate.
split_at = int(0.8*(len(pm_df)))
train = pm_df[:split_at]
valid = pm_df[split_at:]
"""##creating training data"""
# Only the modelling columns, rounded to whole numbers.
train_2 = train[column_names].round(0)
train_2
"""##Unique value count"""
unique, counts = np.unique(train_2['Description'].round(0), return_counts=True)
dict(zip(unique, counts))
train_2.tail(20)
"""###FINAL TRAINING DATA
train_new.
This is the training data that we have used to train our model
"""
# Keep the leading 95710 rows of the rounded training frame.
train_new = train_2.iloc[:95710]
unique, counts = np.unique(train_new['Description'].round(0), return_counts=True)
dict(zip(unique, counts))
train_new.tail(20)
"""#SCALING DATA"""
scalery = StandardScaler()
# Standardise both columns, then rebuild a frame with the original labels.
df_scaled = scalery.fit_transform(train_new.to_numpy())
train_scaled = pd.DataFrame(df_scaled, index=train_new.index, columns=train_new.columns)
train_scaled
# Timestamps for exactly the rows that were scaled. Using the scaled frame's
# length keeps this in step with train_new without repeating its row count.
time = train['_time'].iloc[:len(train_scaled)]
train_new_scaled = pd.concat([time, train_scaled], axis=1)
train_new_scaled
"""#Creating new df which contains our final training data."""
# Long format: one row per (timestamp, series), where each original column
# becomes its own series identified by `unique_id`.
new = train_new_scaled.melt(
    id_vars=["_time"],
    var_name="unique_id",
    value_name="Value",
)
new.rename(columns={'_time': 'ds'}, inplace=True, errors='raise')

# Clean the long frame before sending it to the API:
#  - timestamps that convert_to_datetime could not parse are missing (NaT)
#    and cannot be placed on a time axis, so those rows are dropped;
#  - NOTE(review): missing target values and repeated timestamps within one
#    series are removed defensively; this is the likely cause of
#    "ValueError: all arrays must be same length", but confirm against the
#    data (e.g. new['ds'].isna().sum(), new.duplicated(['unique_id', 'ds']).sum());
#  - rows are ordered by series and time so each series is contiguous.
new = (
    new.dropna(subset=['ds', 'Value'])
    .drop_duplicates(subset=['unique_id', 'ds'], keep='last')
    .sort_values(['unique_id', 'ds'])
    .reset_index(drop=True)
)
train_new_scaled.tail(20)
"""#Predicting"""
# NOTE(review): freq='100ms' assumes the cleaned timestamps sit on a regular
# 100 ms grid; verify with new.groupby('unique_id')['ds'].diff().value_counts().
# NOTE(review): for the stated anomaly-detection goal, check whether the
# installed nixtlats version offers TimeGPT.detect_anomalies as an alternative.
timegpt_fcst_pred_both_df = timegpt.forecast(
    model='timegpt-1-long-horizon',
    finetune_steps=90,
    df=new,
    h=200,
    freq='100ms',
    level=[90],
    time_col='ds',
    target_col='Value',
)
----> 2 timegpt_fcst_pred_both_df = timegpt.forecast(model='timegpt-1-long-horizon',finetune_steps=90,
3 df=new, h=200,freq='100ms', level=[90],
4 time_col='ds', target_col='Value',
5 )
10 frames
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/multi.py in from_arrays(cls, arrays, sortorder, names)
527 for i in range(1, len(arrays)):
528 if len(arrays[i]) != len(arrays[i - 1]):
--> 529 raise ValueError("all arrays must be same length")
530
531 codes, levels = factorize_from_iterables(arrays)
This code snippet performs time series anomaly detection on a dataset using the TimeGPT model from the Nixtla library. Below is a breakdown of what the code does:
Description of Data:
- The `Description` column in the DataFrame contains the anomalies to be predicted.
- The `Q_VFD1_Temperature` column is used as the feature for making predictions across the time series.
- The `_time` column contains the time sequence dates.
- The dataset Factory-Dataset is loaded into a DataFrame `df`.
- The `_time` column is converted to datetime format and localized to remove timezone information.
- The `Description` column is tokenized using NLTK's `word_tokenize` function.
- Tokens in the `Description` column are mapped to numeric values using a predefined dictionary.
- The average of these numeric values is computed for each row and added as a new column `'Average'`.
- A new DataFrame `features_df` is created with the temperature column and the calculated average.
- The data is split into training and validation sets.
- The training data is further refined and scaled using `StandardScaler`.
- The scaled data is reshaped (melted into long format) to create a DataFrame `new` suitable for forecasting using TimeGPT.
- The `TimeGPT` model is used to forecast future values based on this prepared data.
In the last line:
timegpt_fcst_pred_both_df = timegpt.forecast(model='timegpt-1-long-horizon', finetune_steps=90, df=new, h=200, freq='100ms', level=[90], time_col='ds', target_col='Value')
I am getting the following error:
ValueError: all arrays must be same length
The error occurs when executing the forecast with TimeGPT, specifically at this line.
The traceback indicates a mismatch in the lengths of the arrays used during the forecast process.
Can you please help me resolve the above error in the code? The use case for which I would like to use TimeGPT is anomaly detection in the smart-manufacturing domain.