import requests
import pandas as pd
import math
import datetime
import urllib.request
import json
import time
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error
pd.options.mode.chained_assignment = None # default='warn'
import warnings
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight') #only one style is needed; a second call would override the first
#hide the API key in a text file so it's not public
app_token = open('api_key.txt', 'r').read()
#app_token
#use requests to pull 311 data from Jan 10, 2020 until today / can slice the data in pandas later
limit = 500000
app_token = open('api_key.txt', 'r').read()
uri = f"https://data.buffalony.gov/resource/whkc-e5vr.json?$limit={limit}&$$app_token={app_token}&$where=open_date>'2020-01-10T12:00:00'"
r = requests.get(uri)
print('Status code ',r.status_code)
print('Number of rows returned ',len(r.json()))
print('Encoded URI with params ',r.url)
new_json = r.json()
#new_json
Status code 200
Number of rows returned 277742
Encoded URI with params https://data.buffalony.gov/resource/whkc-e5vr.json?$limit=500000&$$app_token=[REDACTED]&$where=open_date%3E'2020-01-10T12:00:00'
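Note that hand-building the query string bakes the token into the printed URL. A slightly safer equivalent, as a sketch against the same Socrata endpoint, is to let requests encode the parameters (Socrata also accepts the token in an X-App-Token header):
#sketch: same request, but requests handles the URL encoding
params = {
    '$limit': limit,
    '$$app_token': app_token,
    '$where': "open_date > '2020-01-10T12:00:00'",
}
r = requests.get('https://data.buffalony.gov/resource/whkc-e5vr.json', params=params)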
#make pandas df from json / check head
df=pd.DataFrame(new_json)
print(df.shape)
df.head(2)
(277742, 33)
case_reference | open_date | closed_date | status | subject | reason | type | object_type | address_number | address_line_1 | ... | census_tract_2010 | census_block_group_2010 | census_block_2010 | tractce20 | geoid20_tract | geoid20_blockgroup | geoid20_block | address_line_2 | x_coordinate | y_coordinate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001109603 | 2020-01-10T12:02:00.000 | 2020-01-22T13:35:00.000 | Closed | Dept of Public Works | Sanitation | Recycling Tote Deliver (Req_Serv) | Property | 420 | PARKDALE | ... | 171 | 2 | 2000 | 017100 | 36029017100 | 360290171002 | 360290171002000 | NaN | NaN | NaN |
1 | 1001109605 | 2020-01-10T12:07:00.000 | 2020-01-14T13:28:00.000 | Closed | Dept of Public Works | Rodent Control | Rodents (Req_Serv) | Property | 92 | PROCTOR | ... | 41 | 2 | 2002 | 004100 | 36029004100 | 360290041002 | 360290041002002 | NaN | NaN | NaN |
2 rows × 33 columns
#save original Buffalo 311 Data to csv
df.to_csv('Buffalo311Data.csv', index = None, header=True)
#add count column with a value of one to rows to count calls by frequency
df['count'] = 1
#df.head()
#format open date to pd datetime to get ready to parse column
df['time'] = pd.to_datetime(df['open_date'])
#df.info() #check to make sure Dtype is datetime64
#take a look at the columns I need
#parsing by hour will provide 24 times more training data than doing it by day
df1 = df[['time', 'count']]
df1.head(3)
time | count | |
---|---|---|
0 | 2020-01-10 12:02:00 | 1 |
1 | 2020-01-10 12:07:00 | 1 |
2 | 2020-01-10 12:08:00 | 1 |
#use datetime formatting to truncate timestamps to the hour
df1['time'] = pd.to_datetime(df1['time']).dt.strftime('%Y-%m-%d %H')
df1.head()
time | count | |
---|---|---|
0 | 2020-01-10 12 | 1 |
1 | 2020-01-10 12 | 1 |
2 | 2020-01-10 12 | 1 |
3 | 2020-01-10 12 | 1 |
4 | 2020-01-10 12 | 1 |
#group by hour, count and reset index
df2 = df1.groupby(['time']).count().reset_index()
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18336 entries, 0 to 18335
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 time 18336 non-null object
1 count 18336 non-null int64
dtypes: int64(1), object(1)
memory usage: 286.6+ KB
#make sure time is pd datetime after the groupby; the groupby leaves the column as object dtype
df2['time'] = pd.to_datetime(df2['time'])
#set time as index
df2 = df2.set_index('time')
#df2.head()
#make df by day instead of hour for some charts
#ultimately decided to use hourly because I feel it would help with staffing and letting people go home early
df_day = df1.copy()
df_day['time'] = pd.to_datetime(df_day['time']).dt.strftime('%Y-%m-%d')
df_day = df_day.groupby(['time']).count().reset_index()
df_day['time'] = pd.to_datetime(df_day['time'])
df_day = df_day.set_index('time')
#df_day.head()
#make graph of calls by day
color_pal = sns.color_palette()
df_day.plot(style='.',
figsize=(10,5),
ms=1,
color = color_pal[0],
title='311 Calls By Day')
plt.show()
#make graph of call by hour
color_pal = sns.color_palette()
df2.plot(style='.',
figsize=(10,5),
ms=1,
color = color_pal[0],
title='311 Calls By Hour')
plt.show()
#create seasons for boxplot graph showing dayofweek and season
from pandas.api.types import CategoricalDtype
cat_type = CategoricalDtype(categories=['Monday', 'Tuesday',
'Wednesday', 'Thursday',
'Friday', 'Saturday', 'Sunday'],
ordered=True)
def create_features(df, label=None):
"""
Creates time series features from datetime index.
"""
df = df.copy()
df['date'] = df.index
df['dayofweek'] = df['date'].dt.dayofweek
df['weekday'] = df['date'].dt.day_name()
df['weekday'] = df['weekday'].astype(cat_type)
df['quarter'] = df['date'].dt.quarter
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df['dayofyear'] = df['date'].dt.dayofyear
df['dayofmonth'] = df['date'].dt.day
df['weekofyear'] = df['date'].dt.isocalendar().week
#df['weekofyear'] = df['date'].dt.weekofyear
df['date_offset'] = (df.date.dt.month*100 + df.date.dt.day - 320)%1300
df['season'] = pd.cut(df['date_offset'], [0, 300, 600, 900, 1300],
labels=['Spring', 'Summer', 'Fall', 'Winter'])
X = df[['dayofweek','quarter','month','year',
'dayofyear','dayofmonth','weekofyear', 'weekday', 'season']]
if label:
y = df[label]
return X, y
return X
X, y = create_features(df_day, label='count')
features_and_target = pd.concat([X, y], axis=1)
features_and_target.head()
dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | weekday | season | count | |
---|---|---|---|---|---|---|---|---|---|---|
time | ||||||||||
2020-01-10 | 4 | 1 | 1 | 2020 | 10 | 10 | 2 | Friday | Winter | 98 |
2020-01-11 | 5 | 1 | 1 | 2020 | 11 | 11 | 2 | Saturday | Winter | 12 |
2020-01-12 | 6 | 1 | 1 | 2020 | 12 | 12 | 2 | Sunday | Winter | 13 |
2020-01-13 | 0 | 1 | 1 | 2020 | 13 | 13 | 3 | Monday | Winter | 368 |
2020-01-14 | 1 | 1 | 1 | 2020 | 14 | 14 | 3 | Tuesday | Winter | 316 |
fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data=features_and_target.dropna(),
x='weekday',
y='count',
hue='season',
ax = ax,
linewidth=1,
palette='YlGnBu')
ax.set_title('Number of 311 Calls by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('311 Call Count')
ax.legend(bbox_to_anchor=(1,1))
plt.show()
fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data=features_and_target.dropna(),
x='weekofyear',
y='count',
hue='season',
ax = ax,
linewidth=1,
palette='YlGnBu')
ax.set_title('Number Of 311 Calls By Week Of Year')
ax.set_xlabel('Week')
ax.set_ylabel('311 Call Count')
plt.xticks(rotation = 90) # rotates x-axis labels by 90 degrees
ax.legend(bbox_to_anchor=(1,1))
plt.show()
split_date = '1-jan-2023'
df_train = df2.loc[df2.index <= split_date].copy()
df_test = df2.loc[df2.index > split_date].copy()
# plot train and test so you can see where the split is
df_test \
.rename(columns={'count': 'TEST SET'}) \
.join(df_train.rename(columns={'count': 'TRAINING SET'}),
how='outer') \
.plot(figsize=(15,5), title='Buffalo 311 Calls By Hour', style='.')
plt.show()
Train the Prophet model. Prophet expects the datetime column to be named ds and the target column to be named y.
#format data for prophet model using ds and y
df_train_prophet = df_train.reset_index()\
.rename(columns={'time': 'ds',
'count': 'y'})
df_train_prophet.head()
ds | y | |
---|---|---|
0 | 2020-01-10 12:00:00 | 21 |
1 | 2020-01-10 13:00:00 | 20 |
2 | 2020-01-10 14:00:00 | 22 |
3 | 2020-01-10 15:00:00 | 30 |
4 | 2020-01-10 16:00:00 | 3 |
%%time
model = Prophet()
model.fit(df_train_prophet)
15:18:59 - cmdstanpy - INFO - Chain [1] start processing
15:19:07 - cmdstanpy - INFO - Chain [1] done processing
CPU times: total: 1.42 s
Wall time: 9.67 s
<prophet.forecaster.Prophet at 0x2cc235e5f10>
%%time
# create test data with the same column names
df_test_prophet = df_test.reset_index()\
.rename(columns={'time': 'ds',
'count': 'y'})
df_test_fcst = model.predict(df_test_prophet)
CPU times: total: 2.02 s
Wall time: 541 ms
df_test_fcst.head()
ds | trend | yhat_lower | yhat_upper | trend_lower | trend_upper | additive_terms | additive_terms_lower | additive_terms_upper | daily | ... | weekly | weekly_lower | weekly_upper | yearly | yearly_lower | yearly_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | yhat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01-01 01:00:00 | 19.038017 | -18.082230 | 16.134619 | 19.038017 | 19.038017 | -20.199082 | -20.199082 | -20.199082 | -6.409008 | ... | -14.649280 | -14.649280 | -14.649280 | 0.859206 | 0.859206 | 0.859206 | 0.0 | 0.0 | 0.0 | -1.161066 |
1 | 2023-01-01 03:00:00 | 19.041680 | -16.196708 | 16.441544 | 19.041680 | 19.041680 | -18.544646 | -18.544646 | -18.544646 | -5.126092 | ... | -14.317852 | -14.317852 | -14.317852 | 0.899298 | 0.899298 | 0.899298 | 0.0 | 0.0 | 0.0 | 0.497034 |
2 | 2023-01-01 07:00:00 | 19.049007 | -16.467086 | 17.694639 | 19.049007 | 19.049007 | -18.550251 | -18.550251 | -18.550251 | -6.533164 | ... | -12.996319 | -12.996319 | -12.996319 | 0.979233 | 0.979233 | 0.979233 | 0.0 | 0.0 | 0.0 | 0.498756 |
3 | 2023-01-01 08:00:00 | 19.050838 | -1.699195 | 31.412708 | 19.050838 | 19.050838 | -4.490101 | -4.490101 | -4.490101 | 7.049725 | ... | -12.538989 | -12.538989 | -12.538989 | 0.999163 | 0.999163 | 0.999163 | 0.0 | 0.0 | 0.0 | 14.560738 |
4 | 2023-01-01 09:00:00 | 19.052670 | 7.991571 | 42.268996 | 19.052670 | 19.052670 | 6.905000 | 6.905000 | 6.905000 | 17.921719 | ... | -12.035792 | -12.035792 | -12.035792 | 1.019073 | 1.019073 | 1.019073 | 0.0 | 0.0 | 0.0 | 25.957670 |
5 rows × 22 columns
fig, ax = plt.subplots(figsize=(10,5))
fig = model.plot(df_test_fcst, ax= ax)
plt.xticks(rotation = 45) # rotates x-axis by 45 degrees
ax.set_title('Prophet Forecast')
plt.show()
fig = model.plot_components(df_test_fcst)
plt.show()
#plot the forecast with the actual
f, ax = plt.subplots(figsize=(15,5))
ax.scatter(df_test.index, df_test['count'], color ='r')
fig = model.plot(df_test_fcst, ax= ax)
Red dots are actuals; the blue line is the prediction. There seems to be a mismatch between actual and predicted, probably due to the 2023 blizzard.
#zoomed in plot of forecast vs actual
import datetime
f, ax = plt.subplots(figsize=(10,5))
ax.scatter(df_test.index, df_test['count'], color ='r')
fig = model.plot(df_test_fcst, ax= ax)
# ax.set_xbound(lower='01-01-2015',
# upper='02-01-2015')
ax.set_xlim([datetime.date(2023, 1, 1), datetime.date(2023, 2, 1)])
ax.set_ylim(0,110)
plot = plt.suptitle('January 2023 Forecast vs Actuals')
It looks like the 2023 blizzard caused more calls than predicted, especially when compared to the first week of April.
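One quick way to quantify that gap, as a sketch assuming df_test and df_test_fcst from above (treating the first half of January as the storm-recovery window is my assumption):
#align predictions with actuals and compare error inside vs outside the window
cmp = df_test_fcst.set_index('ds')[['yhat']].join(df_test)
cmp['abs_err'] = (cmp['count'] - cmp['yhat']).abs()
storm_window = cmp.loc['2023-01-01':'2023-01-15', 'abs_err'].mean()
rest = cmp.loc['2023-01-16':, 'abs_err'].mean()
print(f'MAE in storm window: {storm_window:.1f} vs rest of test set: {rest:.1f}')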
#zoomed in plot of forecast vs actual
import datetime
f, ax = plt.subplots(figsize=(10,5))
ax.scatter(df_test.index, df_test['count'], color ='r')
fig = model.plot(df_test_fcst, ax= ax)
# ax.set_xbound(lower='01-01-2015',
# upper='02-01-2015')
ax.set_xlim([datetime.date(2023, 1, 1), datetime.date(2023, 1, 7)])
ax.set_ylim(0,100)
plot = plt.suptitle('First Week Of January 2023 Forecast vs Actuals')
#zoomed in plot of forecast vs actual
import datetime
f, ax = plt.subplots(figsize=(10,5))
ax.scatter(df_test.index, df_test['count'], color ='r')
fig = model.plot(df_test_fcst, ax= ax)
# ax.set_xbound(lower='01-01-2015',
# upper='02-01-2015')
ax.set_xlim([datetime.date(2023, 4, 1), datetime.date(2023, 4, 7)])
ax.set_ylim(0,70)
plot = plt.suptitle('First Week Of April 2023 Forecast vs Actuals')
# true value vs predicted value
np.sqrt(mean_squared_error(y_true=df_test['count'],
y_pred=df_test_fcst['yhat']))
18.53331967782901
mean_absolute_error(y_true=df_test['count'],
y_pred=df_test_fcst['yhat'])
16.444075387619147
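For context on whether these numbers are any good, a naive baseline helps; a minimal sketch assuming df_train and df_test as above, predicting the training-set hourly mean for every test hour:
naive = np.full(len(df_test), df_train['count'].mean())
print('Baseline RMSE:', np.sqrt(mean_squared_error(df_test['count'], naive)))
print('Baseline MAE :', mean_absolute_error(df_test['count'], naive))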
# set up holiday dataframe
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
cal = calendar()
holidays = cal.holidays(start=df2.index.min(),
end=df2.index.max(),
return_name=True)
holiday_df = pd.DataFrame(data=holidays,
columns=['holiday'])
holiday_df = holiday_df.reset_index().rename(columns={'index':'ds'})
holiday_df.head()
ds | holiday | |
---|---|---|
0 | 2020-01-20 | Birthday of Martin Luther King, Jr. |
1 | 2020-02-17 | Washington’s Birthday |
2 | 2020-05-25 | Memorial Day |
3 | 2020-07-03 | Independence Day |
4 | 2020-09-07 | Labor Day |
holiday_df['holiday'].value_counts()
Birthday of Martin Luther King, Jr. 4
Washington’s Birthday 4
Memorial Day 3
Independence Day 3
Labor Day 3
Columbus Day 3
Veterans Day 3
Thanksgiving Day 3
Christmas Day 3
New Year's Day 3
Juneteenth National Independence Day 2
Name: holiday, dtype: int64
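Prophet's holidays dataframe can also carry optional lower_window and upper_window columns to let a holiday's effect bleed into surrounding days; a sketch (not used below) in case holiday spillover matters here:
#hypothetical: extend each holiday's effect one day on either side
holiday_df_windowed = holiday_df.copy()
holiday_df_windowed['lower_window'] = -1
holiday_df_windowed['upper_window'] = 1
#model_with_holidays = Prophet(holidays=holiday_df_windowed)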
import time
#run Prophet model with holiday df as a parameter for holiday
#%%time
model_with_holidays = Prophet(holidays=holiday_df)
model_with_holidays.fit(df_train_prophet)
15:35:23 - cmdstanpy - INFO - Chain [1] start processing
15:35:31 - cmdstanpy - INFO - Chain [1] done processing
<prophet.forecaster.Prophet at 0x2cc2cfe3c40>
# predict on the test set with the holiday model
df_test_fcst_with_hols = model_with_holidays.predict(df=df_test_prophet)
df_test_fcst_with_hols.head()
ds | trend | yhat_lower | yhat_upper | trend_lower | trend_upper | Birthday of Martin Luther King, Jr. | Birthday of Martin Luther King, Jr._lower | Birthday of Martin Luther King, Jr._upper | Christmas Day | ... | weekly | weekly_lower | weekly_upper | yearly | yearly_lower | yearly_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | yhat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01-01 01:00:00 | 19.962327 | -17.298810 | 16.125482 | 19.962327 | 19.962327 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -15.474464 | -15.474464 | -15.474464 | 1.114713 | 1.114713 | 1.114713 | 0.0 | 0.0 | 0.0 | -0.726449 |
1 | 2023-01-01 03:00:00 | 19.966306 | -15.429744 | 17.809987 | 19.966306 | 19.966306 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -15.147649 | -15.147649 | -15.147649 | 1.160615 | 1.160615 | 1.160615 | 0.0 | 0.0 | 0.0 | 0.815920 |
2 | 2023-01-01 07:00:00 | 19.974263 | -15.194841 | 17.696200 | 19.974263 | 19.974263 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -13.741419 | -13.741419 | -13.741419 | 1.251959 | 1.251959 | 1.251959 | 0.0 | 0.0 | 0.0 | 0.869277 |
3 | 2023-01-01 08:00:00 | 19.976252 | -0.710243 | 32.056320 | 19.976252 | 19.976252 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -13.242802 | -13.242802 | -13.242802 | 1.274699 | 1.274699 | 1.274699 | 0.0 | 0.0 | 0.0 | 15.025595 |
4 | 2023-01-01 09:00:00 | 19.978242 | 10.426556 | 42.368802 | 19.978242 | 19.978242 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -12.690424 | -12.690424 | -12.690424 | 1.297399 | 1.297399 | 1.297399 | 0.0 | 0.0 | 0.0 | 26.507473 |
5 rows × 58 columns
model_with_holidays.plot_components(
df_test_fcst_with_hols)
#zoomed in plot of forecast vs actual
# red dots are actuals; the blue line is the forecast
import datetime
f, ax = plt.subplots(figsize=(10,5))
ax.scatter(df_test.index, df_test['count'], color ='r')
fig = model_with_holidays.plot(df_test_fcst_with_hols, ax= ax)
# ax.set_xbound(lower='07-01-2015',
# upper='07-07-2015')
ax.set_xlim([datetime.date(2023, 1, 1), datetime.date(2023, 5, 1)])
ax.set_ylim(0,110)
plt.xticks(rotation = 45) # rotates x-axis by 45 degrees
plot = plt.suptitle('January to April Forecast vs Actuals')
#zoomed in plot of forecast vs actual
import datetime
f, ax = plt.subplots(figsize=(10,5))
ax.scatter(df_test.index, df_test['count'], color ='r')
fig = model_with_holidays.plot(df_test_fcst_with_hols, ax= ax)
# ax.set_xbound(lower='07-01-2015',
# upper='07-07-2015')
ax.set_xlim([datetime.date(2023, 4, 1), datetime.date(2023, 4, 7)])
ax.set_ylim(0,80)
plot = plt.suptitle('First Week Of April Forecast vs Actuals With Holidays')
# true value vs predicted value
np.sqrt(mean_squared_error(y_true=df_test['count'],
y_pred=df_test_fcst_with_hols['yhat']))
19.14157031066351
mean_absolute_error(y_true=df_test['count'],
y_pred=df_test_fcst_with_hols['yhat'])
17.139861907579142
#build a year of hourly future timestamps (history excluded) and predict with the holiday model
future = model_with_holidays.make_future_dataframe(periods=365*24, freq='h', include_history=False)
forecast = model_with_holidays.predict(future)
forecast.head()
ds | trend | yhat_lower | yhat_upper | trend_lower | trend_upper | Birthday of Martin Luther King, Jr. | Birthday of Martin Luther King, Jr._lower | Birthday of Martin Luther King, Jr._upper | Christmas Day | ... | weekly | weekly_lower | weekly_upper | yearly | yearly_lower | yearly_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | yhat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01-01 01:00:00 | 19.962327 | -16.882693 | 15.841012 | 19.962327 | 19.962327 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -15.474464 | -15.474464 | -15.474464 | 1.114713 | 1.114713 | 1.114713 | 0.0 | 0.0 | 0.0 | -0.726449 |
1 | 2023-01-01 02:00:00 | 19.964316 | -12.946475 | 17.548705 | 19.964316 | 19.964316 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -15.343241 | -15.343241 | -15.343241 | 1.137683 | 1.137683 | 1.137683 | 0.0 | 0.0 | 0.0 | 2.207288 |
2 | 2023-01-01 03:00:00 | 19.966306 | -14.783082 | 17.679284 | 19.966306 | 19.966306 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -15.147649 | -15.147649 | -15.147649 | 1.160615 | 1.160615 | 1.160615 | 0.0 | 0.0 | 0.0 | 0.815920 |
3 | 2023-01-01 04:00:00 | 19.968295 | -21.090066 | 11.068481 | 19.968295 | 19.968295 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -14.888335 | -14.888335 | -14.888335 | 1.183508 | 1.183508 | 1.183508 | 0.0 | 0.0 | 0.0 | -4.982981 |
4 | 2023-01-01 05:00:00 | 19.970284 | -28.096313 | 3.966954 | 19.970284 | 19.970284 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -14.566408 | -14.566408 | -14.566408 | 1.206364 | 1.206364 | 1.206364 | 0.0 | 0.0 | 0.0 | -10.279008 |
5 rows × 58 columns
fig, ax = plt.subplots(figsize=(10,5))
fig = model_with_holidays.plot(forecast, ax= ax)
ax.set_title('Prophet Forecast')
plt.show()
#zoomed in plot of forecast
import datetime
fig, ax = plt.subplots(figsize=(10,5))
fig = model_with_holidays.plot(forecast, ax= ax)
ax.set_xlim([datetime.date(2023, 1, 1), datetime.date(2023, 12, 31)])
ax.set_ylim(0,100)
plt.xticks(rotation = 45) # rotates x-axis by 45 degrees
ax.set_title('Prophet Forecast')
plt.show()
#zoomed in plot of forecast for an upcoming week; first week of June
import datetime
fig, ax = plt.subplots(figsize=(10,5))
fig = model_with_holidays.plot(forecast, ax= ax)
ax.set_xlim([datetime.date(2023, 6, 1), datetime.date(2023, 6, 7)])
ax.set_ylim(0,80)
ax.set_title('Prophet Forecast First Week Of June')
plt.show()
#zoomed in plot of forecast for the upcoming Friday; first Friday of June
import datetime
fig, ax = plt.subplots(figsize=(10,5))
fig = model_with_holidays.plot(forecast, ax= ax)
ax.set_xlim([datetime.date(2023, 6, 2), datetime.date(2023, 6, 3)])
ax.set_ylim(0,70)
ax.set_title('Prophet Forecast First Friday Of June 2023 By Hour')
plt.show()
The graph above shows peak call volume between 9am and 3pm on Friday, June 2nd.
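The same peak can be read straight off the forecast frame rather than the chart; a quick sketch assuming the forecast dataframe above:
june2 = forecast.set_index('ds').loc['2023-06-02', 'yhat']
print(june2.idxmax(), round(june2.max(), 1)) #hour with the highest predicted call volume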
split_date = '1-jan-2023'
df_train2 = df2.loc[df2.index <= split_date].copy()
df_test2 = df2.loc[df2.index > split_date].copy()
fig, ax = plt.subplots(figsize=(15,5))
df_train2.plot(ax=ax, label='Training Data', title='Train/Test Data Split')
df_test2.plot(ax=ax, label='Testing Data')
ax.axvline(split_date, color='black', ls='--')
ax.legend(['Training Data', 'Test Data'])
plt.show()
#check out two weeks in Jan 2022 to get a better picture of the data
df_train2.loc[(df_train2.index > '01-01-2022') & (df_train2.index < '01-15-2022')].plot(figsize=(15,5), title='Two Weeks in Jan 2022')
plt.show()
def create_features_2(df):
'''
create time series features based on index date
'''
df = df.copy()
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek
df['quarter'] = df.index.quarter
df['month'] = df.index.month
df['year'] = df.index.year
df['dayofyear'] = df.index.dayofyear
return df
df3 = create_features_2(df2)
df3.head(2)
df3.shape
(18336, 7)
fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data = df3, x='hour', y='count')
ax.set_title('Buffalo 311 Call Count By Hour')
plt.show()
fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data = df3, x='month', y='count', palette='Blues')
ax.set_title('Buffalo 311 Call Count By Month')
plt.show()
fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data = df3, x='quarter', y='count', palette='Blues')
ax.set_title('Buffalo 311 Call Count By Quarter')
plt.show()
#note that 2023 data only runs through 05/09/2023
fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data = df3, x='year', y='count', palette = 'Reds')
ax.set_title('Buffalo 311 Call Count By Year')
plt.show()
fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data = df3, x='dayofweek', y='count', palette='Greens')
ax.set_title('Buffalo 311 Call Count By Day Of Week')
plt.show()
fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data = df3, x='dayofyear', y='count', palette='Purples')
ax.set_title('Buffalo 311 Call Count By Day Of Year')
plt.show()
import xgboost as xgb
from sklearn.metrics import mean_squared_error
df2.head()
count | |
---|---|
time | |
2020-01-10 12:00:00 | 21 |
2020-01-10 13:00:00 | 20 |
2020-01-10 14:00:00 | 22 |
2020-01-10 15:00:00 | 30 |
2020-01-10 16:00:00 | 3 |
df_train = create_features_2(df_train)
df_test = create_features_2(df_test)
print(df_train.columns)
print(df_test.columns)
Index(['count', 'hour', 'dayofweek', 'quarter', 'month', 'year', 'dayofyear'], dtype='object')
Index(['count', 'hour', 'dayofweek', 'quarter', 'month', 'year', 'dayofyear'], dtype='object')
features = ['hour', 'dayofweek', 'quarter', 'month', 'year', 'dayofyear']
target = 'count'
X_train = df_train[features]
y_train = df_train[target]
X_test = df_test[features]
y_test = df_test[target]
#had to lower the learning rate to get better RMSE results and keep the model from overfitting
reg = xgb.XGBRegressor(n_estimators = 1000, early_stopping_rounds = 50,
learning_rate = 0.001)
reg.fit(X_train, y_train,
eval_set = [(X_train, y_train), (X_test, y_test)],
verbose = 75)
[0] validation_0-rmse:24.42930 validation_1-rmse:21.25184
[75] validation_0-rmse:23.02614 validation_1-rmse:19.67356
[150] validation_0-rmse:21.74553 validation_1-rmse:18.24009
[225] validation_0-rmse:20.57594 validation_1-rmse:17.05880
[300] validation_0-rmse:19.51455 validation_1-rmse:15.93974
[375] validation_0-rmse:18.54768 validation_1-rmse:14.92595
[450] validation_0-rmse:17.66992 validation_1-rmse:14.00971
[525] validation_0-rmse:16.87422 validation_1-rmse:13.17353
[600] validation_0-rmse:16.15464 validation_1-rmse:12.41610
[675] validation_0-rmse:15.50567 validation_1-rmse:11.74820
[750] validation_0-rmse:14.91105 validation_1-rmse:11.12767
[825] validation_0-rmse:14.37133 validation_1-rmse:10.58090
[900] validation_0-rmse:13.87591 validation_1-rmse:10.11855
[975] validation_0-rmse:13.41815 validation_1-rmse:9.75119
[999] validation_0-rmse:13.27996 validation_1-rmse:9.65540
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=50, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.001, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=1000, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=None, ...)
from xgboost import plot_importance
plot_importance(reg)
plt.show()
df_test['prediction'] = reg.predict(X_test)
df3 = df3.merge(df_test[['prediction']], how='left', left_index=True, right_index=True)
ax = df3[['count']].plot(figsize = (15,5))
df3['prediction'].plot(ax=ax, style='.')
plt.legend(['True Data', 'Predictions'])
ax.set_title('Raw Data & Predictions')
plt.show()
ax = df_test.loc[(df_test.index > '02-01-2023') & (df_test.index < '03-01-2023')]['count'] \
.plot(figsize=(15,5), title = ' Feb 2023')
df_test.loc[(df_test.index > '02-01-2023') & (df_test.index <'03-01-2023')]['prediction'] \
.plot(style='.')
plt.legend(['Real Data', 'Prediction'])
plt.show()
score = np.sqrt(mean_squared_error(df_test['count'], df_test['prediction']))
print(f'RMSE Score on Test set: {score:0.2f}')
RMSE Score on Test set: 9.66
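For an apples-to-apples comparison with the Prophet MAE reported earlier, the matching check (assuming the same df_test columns as above):
print('MAE on test set:', mean_absolute_error(df_test['count'], df_test['prediction']))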
df_test['error'] = np.abs(df_test[target] - df_test['prediction'])
df_test['date'] = df_test.index.date
#worst projected days happened close to the blizzard
df_test.groupby('date')['error'].mean().sort_values(ascending=False).head(5)
date
2023-01-05 19.826350
2023-01-03 17.533686
2023-02-24 17.288375
2023-01-04 16.783298
2023-01-09 14.823705
Name: error, dtype: float64
#best projected days of forecast
df_test.groupby('date')['error'].mean().sort_values(ascending=True).head(5)
date
2023-01-15 0.749523
2023-03-25 0.754664
2023-01-29 0.799345
2023-04-30 0.824599
2023-03-18 0.903888
Name: error, dtype: float64
df2['count'].plot(kind='hist', bins=1000)
<AxesSubplot: ylabel='Frequency'>
df2.query('count < 10').plot(figsize=(15,5), style='.')
<AxesSubplot: xlabel='time'>
df2.query('count > 50').plot(figsize=(15,5), style='.')
<AxesSubplot: xlabel='time'>
split_date = '1-jan-2023'
df_train3 = df2.loc[df2.index <= split_date].copy()
df_test3 = df2.loc[df2.index > split_date].copy()
fig, ax = plt.subplots(figsize=(15,5))
df_train3.plot(ax=ax, label='Training Data', title='Train/Test Data Split')
df_test3.plot(ax=ax, label='Testing Data')
ax.axvline(split_date, color='black', ls='--')
ax.legend(['Training Data', 'Test Data'])
plt.show()
from sklearn.model_selection import TimeSeriesSplit
df2.shape
(18336, 1)
print(df2.head())
print(df2.tail())
count
time
2020-01-10 12:00:00 21
2020-01-10 13:00:00 20
2020-01-10 14:00:00 22
2020-01-10 15:00:00 30
2020-01-10 16:00:00 3
count
time
2023-05-09 19:00:00 5
2023-05-09 20:00:00 1
2023-05-09 21:00:00 2
2023-05-09 22:00:00 17
2023-05-09 23:00:00 2
#tss = TimeSeriesSplit(n_splits=2, test_size=24*365, gap = 24)
#gap=24 keeps a one-day buffer between each train/test fold to limit leakage
tss = TimeSeriesSplit(n_splits=8, test_size=24*60*1, gap = 24)
df4 = df2.sort_index()
df4.head()
count | |
---|---|
time | |
2020-01-10 12:00:00 | 21 |
2020-01-10 13:00:00 | 20 |
2020-01-10 14:00:00 | 22 |
2020-01-10 15:00:00 | 30 |
2020-01-10 16:00:00 | 3 |
fig, axs = plt.subplots(8, 1, figsize=(15,15),
sharex = True)
fold = 0
for train_idx, val_idx in tss.split(df4):
train = df4.iloc[train_idx]
test = df4.iloc[val_idx]
train['count'].plot(ax=axs[fold],
label='Training Set',
title=f'Data Train/Test Split Fold {fold}')
test['count'].plot(ax=axs[fold],
label = 'Test Set')
axs[fold].axvline(test.index.min(), color='black', ls='--')
fold +=1
# make function for new time features
def create_features4(df):
"""
Create time series features based on time series index.
"""
df = df.copy()
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek
df['quarter'] = df.index.quarter
df['month'] = df.index.month
df['year'] = df.index.year
df['dayofyear'] = df.index.dayofyear
df['dayofmonth'] = df.index.day
df['weekofyear'] = df.index.isocalendar().week
return df
df4 = create_features4(df4)
df4.head()
count | hour | dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | |
---|---|---|---|---|---|---|---|---|---|
time | |||||||||
2020-01-10 12:00:00 | 21 | 12 | 4 | 1 | 1 | 2020 | 10 | 10 | 2 |
2020-01-10 13:00:00 | 20 | 13 | 4 | 1 | 1 | 2020 | 10 | 10 | 2 |
2020-01-10 14:00:00 | 22 | 14 | 4 | 1 | 1 | 2020 | 10 | 10 | 2 |
2020-01-10 15:00:00 | 30 | 15 | 4 | 1 | 1 | 2020 | 10 | 10 | 2 |
2020-01-10 16:00:00 | 3 | 16 | 4 | 1 | 1 | 2020 | 10 | 10 | 2 |
# make lag feature function
def add_lags(df):
target_map = df['count'].to_dict()
df['lag1'] = (df.index - pd.Timedelta('182 days')).map(target_map)
df['lag2'] = (df.index - pd.Timedelta('364 days')).map(target_map)
df['lag3'] = (df.index - pd.Timedelta('728 days')).map(target_map)
return df
#test lags function
#df4 = add_lags(df4)
#check df after lags function test
#df4.tail()
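A quick way to sanity-check the lag mapping, as a sketch on a copy of df2: lag1 should equal the count exactly 182 days earlier, and timestamps whose lookback falls before the data starts come back NaN.
tmp = add_lags(df2.sort_index().copy())
print(tmp[['count', 'lag1']].dropna().head()) #rows where the 182-day lookback exists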
tss = TimeSeriesSplit(n_splits=8, test_size=24*60*1, gap = 24)
df4 = df2.sort_index()
fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(df4):
train = df4.iloc[train_idx]
test = df4.iloc[val_idx]
train = create_features4(train)
test = create_features4(test)
train = add_lags(train)
test = add_lags(test)
FEATURES4 = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year',
'lag1','lag2', 'lag3']
TARGET4 = 'count'
X_train = train[FEATURES4]
y_train = train[TARGET4]
X_test = test[FEATURES4]
y_test = test[TARGET4]
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
n_estimators=100000,
early_stopping_rounds=50,
                       objective='reg:linear', #deprecated alias; XGBoost maps it to reg:squarederror (see warning below)
max_depth=3,
learning_rate=0.1)
reg.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
verbose=10000)
y_pred = reg.predict(X_test)
preds.append(y_pred)
score = np.sqrt(mean_squared_error(y_test, y_pred))
scores.append(score)
[15:49:23] WARNING: c:\users\dev-admin\croot2\xgboost-split_1675461376218\work\src\objective\regression_obj.cu:213: reg:linear is now deprecated in favor of reg:squarederror.
[0] validation_0-rmse:20.28806 validation_1-rmse:25.23168
[365] validation_0-rmse:6.34963 validation_1-rmse:9.52946
[0] validation_0-rmse:21.08501 validation_1-rmse:20.03218
[495] validation_0-rmse:6.41239 validation_1-rmse:7.29066
[0] validation_0-rmse:20.86114 validation_1-rmse:29.95618
[163] validation_0-rmse:7.21473 validation_1-rmse:21.61451
[0] validation_0-rmse:22.05514 validation_1-rmse:22.50675
[252] validation_0-rmse:7.36629 validation_1-rmse:11.83633
[0] validation_0-rmse:22.03981 validation_1-rmse:26.97997
[972] validation_0-rmse:6.62589 validation_1-rmse:11.07590
[0] validation_0-rmse:22.46248 validation_1-rmse:21.76932
[3115] validation_0-rmse:5.52317 validation_1-rmse:7.01527
[0] validation_0-rmse:22.24770 validation_1-rmse:28.82747
[837] validation_0-rmse:6.73216 validation_1-rmse:19.81065
[0] validation_0-rmse:22.71290 validation_1-rmse:20.37634
[239] validation_0-rmse:8.34257 validation_1-rmse:12.02755
print(f'Mean RMSE across all folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
Mean RMSE across all folds 12.4270
Fold scores:[9.432683974684368, 7.263089487308816, 21.508818615589814, 11.799599285719262, 11.072779306904335, 7.013240970100097, 19.336143148373846, 11.989809586405414]
#retrain on all of the data to prepare for forecasting
#the forecast can still leverage every observation this way
#raised n_estimators because the model still seemed to be improving
#also experimented with the learning rate
df4 = create_features4(df4)
df4 = add_lags(df4)
FEATURES4 = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year',
'lag1','lag2', 'lag3']
TARGET4 = 'count'
X_all = df4[FEATURES4]
y_all = df4[TARGET4]
reg = xgb.XGBRegressor(base_score=0.5,
booster='gbtree',
#n_estimators=25000,
n_estimators=100000,
early_stopping_rounds=50,
objective='reg:linear',
max_depth=3,
                       #learning_rate=0.0001) #a faster learning rate boosted model performance
learning_rate=0.1)
reg.fit(X_all, y_all,
eval_set=[(X_all, y_all)],
verbose=10000)
[15:50:49] WARNING: c:\users\dev-admin\croot2\xgboost-split_1675461376218\work\src\objective\regression_obj.cu:213: reg:linear is now deprecated in favor of reg:squarederror.
[0] validation_0-rmse:22.43720
[10000] validation_0-rmse:4.73683
[20000] validation_0-rmse:4.05477
[30000] validation_0-rmse:3.67679
[40000] validation_0-rmse:3.42238
[50000] validation_0-rmse:3.23311
[60000] validation_0-rmse:3.07159
[70000] validation_0-rmse:2.93679
[80000] validation_0-rmse:2.82711
[90000] validation_0-rmse:2.72784
[99999] validation_0-rmse:2.63702
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=50, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.1, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=3, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100000, n_jobs=None, num_parallel_tree=None, objective='reg:linear', predictor=None, ...)
#find the last observed date so the forecast range starts from it
FutureStartDate = df4.index.max()
FutureStartDate
Timestamp('2023-05-09 23:00:00')
#can't project further out than the smallest lag (182 days) without lag1 going NaN
#note the range starts on 2023-05-04, overlapping the last few days of history
future = pd.date_range('2023-05-04', '2023-11-02', freq='1h')
future_df = pd.DataFrame(index=future)
future_df['is_future'] = True
df4['is_future'] = False
df_and_future = pd.concat([df4, future_df])
df_and_future = create_features4(df_and_future)
df_and_future = add_lags(df_and_future)
df_and_future
count | hour | dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | lag1 | lag2 | lag3 | is_future | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2020-01-10 12:00:00 | 21.0 | 12 | 4 | 1 | 1 | 2020 | 10 | 10 | 2 | NaN | NaN | NaN | False |
2020-01-10 13:00:00 | 20.0 | 13 | 4 | 1 | 1 | 2020 | 10 | 10 | 2 | NaN | NaN | NaN | False |
2020-01-10 14:00:00 | 22.0 | 14 | 4 | 1 | 1 | 2020 | 10 | 10 | 2 | NaN | NaN | NaN | False |
2020-01-10 15:00:00 | 30.0 | 15 | 4 | 1 | 1 | 2020 | 10 | 10 | 2 | NaN | NaN | NaN | False |
2020-01-10 16:00:00 | 3.0 | 16 | 4 | 1 | 1 | 2020 | 10 | 10 | 2 | NaN | NaN | NaN | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2023-11-01 20:00:00 | NaN | 20 | 2 | 4 | 11 | 2023 | 305 | 1 | 44 | 3.0 | 4.0 | NaN | True |
2023-11-01 21:00:00 | NaN | 21 | 2 | 4 | 11 | 2023 | 305 | 1 | 44 | 1.0 | 2.0 | 1.0 | True |
2023-11-01 22:00:00 | NaN | 22 | 2 | 4 | 11 | 2023 | 305 | 1 | 44 | 1.0 | NaN | 1.0 | True |
2023-11-01 23:00:00 | NaN | 23 | 2 | 4 | 11 | 2023 | 305 | 1 | 44 | 2.0 | NaN | 1.0 | True |
2023-11-02 00:00:00 | NaN | 0 | 3 | 4 | 11 | 2023 | 306 | 2 | 44 | NaN | NaN | NaN | True |
22705 rows × 13 columns
#may need to fill the lag NaNs with ffill; a sketch follows the table below
future_with_features = df_and_future.query('is_future').copy()
future_with_features
count | hour | dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | lag1 | lag2 | lag3 | is_future | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2023-05-04 00:00:00 | NaN | 0 | 3 | 2 | 5 | 2023 | 124 | 4 | 18 | NaN | NaN | NaN | True |
2023-05-04 01:00:00 | NaN | 1 | 3 | 2 | 5 | 2023 | 124 | 4 | 18 | NaN | NaN | NaN | True |
2023-05-04 02:00:00 | NaN | 2 | 3 | 2 | 5 | 2023 | 124 | 4 | 18 | NaN | NaN | NaN | True |
2023-05-04 03:00:00 | NaN | 3 | 3 | 2 | 5 | 2023 | 124 | 4 | 18 | NaN | NaN | NaN | True |
2023-05-04 04:00:00 | NaN | 4 | 3 | 2 | 5 | 2023 | 124 | 4 | 18 | NaN | 1.0 | NaN | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2023-11-01 20:00:00 | NaN | 20 | 2 | 4 | 11 | 2023 | 305 | 1 | 44 | 3.0 | 4.0 | NaN | True |
2023-11-01 21:00:00 | NaN | 21 | 2 | 4 | 11 | 2023 | 305 | 1 | 44 | 1.0 | 2.0 | 1.0 | True |
2023-11-01 22:00:00 | NaN | 22 | 2 | 4 | 11 | 2023 | 305 | 1 | 44 | 1.0 | NaN | 1.0 | True |
2023-11-01 23:00:00 | NaN | 23 | 2 | 4 | 11 | 2023 | 305 | 1 | 44 | 2.0 | NaN | 1.0 | True |
2023-11-02 00:00:00 | NaN | 0 | 3 | 4 | 11 | 2023 | 306 | 2 | 44 | NaN | NaN | NaN | True |
4369 rows × 13 columns
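A minimal version of that fill, as a sketch assuming future_with_features as above (whether carrying half-year-old lag values forward is sensible is a judgment call):
#hypothetical: forward-fill the lag columns so the model never sees NaN lags
for col in ['lag1', 'lag2', 'lag3']:
    future_with_features[col] = future_with_features[col].ffill()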
FEATURES4 = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year',
'lag1','lag2', 'lag3']
TARGET4 = 'count'
#run trained model on features in future df
#save in new column called pred
future_with_features['pred'] = reg.predict(future_with_features[FEATURES4])
import matplotlib.pyplot as plt
future_with_features['pred'].plot(figsize = (10,5),
#color= 'Blue',
color= color_pal[0],
ms=1,
lw=1,
title='XGBOOST Future Predictions')
plt.show()
reg.save_model('311XGB_model.json')
reg_new = xgb.XGBRegressor()
reg_new.load_model('311XGB_model.json')
#run saved model under reg_new and plot results
future_with_features['pred'] = reg_new.predict(future_with_features[FEATURES4])
future_with_features['pred'].plot(figsize = (10,5),
#color= 'Blue',
color= color_pal[0],
ms=1,
lw=1,
title='XGBOOST Future Predictions')
plt.show()
#zoomed in plot of forecast for an upcoming week; first week of July
import datetime
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(future_with_features.index, future_with_features['pred'])
#fig = future_with_features['pred'].plot(future_with_features['pred'], ax= ax)
ax.set_xlim([datetime.date(2023, 7, 1), datetime.date(2023, 7, 7)])
ax.set_ylim(0,80)
ax.set_title('XGBOOST Forecast First Week Of July')
plt.show()
#zoomed in plot of the forecast for November 1st by hour
import datetime
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(future_with_features.index, future_with_features['pred'])
#fig = future_with_features['pred'].plot(future_with_features['pred'], ax= ax)
ax.set_xlim([datetime.date(2023, 11, 1), datetime.date(2023, 11, 2)])
ax.set_ylim(0,80)
ax.set_title('XGBOOST Forecast November 1st By Hour')
plt.show()
from xgboost import plot_importance
plot_importance(reg_new)
plt.show()
ARIMA: Auto Regression (AR), Integrated (I, differencing), Moving Average (MA)
!pip3 install pmdarima
Requirement already satisfied: pmdarima in c:\users\brett\anaconda3\envs\school\lib\site-packages (2.0.3)
Requirement already satisfied: numpy>=1.21.2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (1.23.5)
Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (65.6.3)
Requirement already satisfied: pandas>=0.19 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (1.5.3)
Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (0.29.34)
Requirement already satisfied: urllib3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (1.26.14)
Requirement already satisfied: joblib>=0.11 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (1.2.0)
Requirement already satisfied: statsmodels>=0.13.2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (0.13.5)
Requirement already satisfied: scikit-learn>=0.22 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (1.2.1)
Requirement already satisfied: scipy>=1.3.2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (1.10.0)
Requirement already satisfied: pytz>=2020.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=0.19->pmdarima) (2022.7)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=0.19->pmdarima) (2.8.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-learn>=0.22->pmdarima) (3.1.0)
Requirement already satisfied: packaging>=21.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels>=0.13.2->pmdarima) (22.0)
Requirement already satisfied: patsy>=0.5.2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels>=0.13.2->pmdarima) (0.5.3)
Requirement already satisfied: six in c:\users\brett\anaconda3\envs\school\lib\site-packages (from patsy>=0.5.2->statsmodels>=0.13.2->pmdarima) (1.16.0)
df_day['count'].plot(figsize=(12,5))
<AxesSubplot: xlabel='time'>
!pip install statsmodels
Requirement already satisfied: statsmodels in c:\users\brett\anaconda3\envs\school\lib\site-packages (0.13.5)
Requirement already satisfied: numpy>=1.17 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels) (1.23.5)
Requirement already satisfied: scipy>=1.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels) (1.10.0)
Requirement already satisfied: pandas>=0.25 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels) (1.5.3)
Requirement already satisfied: packaging>=21.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels) (22.0)
Requirement already satisfied: patsy>=0.5.2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels) (0.5.3)
Requirement already satisfied: pytz>=2020.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=0.25->statsmodels) (2022.7)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: six in c:\users\brett\anaconda3\envs\school\lib\site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import adfuller
def ad_test(dataset):
dftest = adfuller(dataset, autolag = 'AIC')
print('1. ADF : ', dftest[0])
print('2. P-Value : ', dftest[1])
print('3. Num Of Lags : ', dftest[2])
print('4. Num Of Observations Used For ADF Regression and Critical Values Calculation : ', dftest[3])
print('5. Critical Values : ')
for key, val in dftest[4].items():
print('\t', key, ':',val)
# the p-value is a probability and should be as low as possible
# the smaller the p-value, the stronger the evidence that the series is stationary
ad_test(df_day['count'])
1. ADF : -4.7528316341012635
2. P-Value : 6.688647512497074e-05
3. Num Of Lags : 22
4. Num Of Observations Used For ADF Regression and Critical Values Calculation : 1189
5. Critical Values :
1% : -3.435861752677197
5% : -2.8639738850277796
10% : -2.568065847341873
from pmdarima import auto_arima
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
df_day.head()
count | |
---|---|
time | |
2020-01-10 | 98 |
2020-01-11 | 12 |
2020-01-12 | 13 |
2020-01-13 | 368 |
2020-01-14 | 316 |
split_date = '1-jan-2023'
df_train_day = df_day.loc[df_day.index <= split_date].copy()
df_test_day = df_day.loc[df_day.index > split_date].copy()
fig, ax = plt.subplots(figsize=(15,5))
df_train_day.plot(ax=ax, label='Training Data', title='Train/Test Data Split BY Day')
df_test_day.plot(ax=ax, label='Testing Data')
ax.axvline(split_date, color='black', ls='--')
ax.legend(['Training Data', 'Test Data'])
plt.show()
import statsmodels.api as sm
#from statsmodels.tsa.arima_model import ARIMA #deprecated; sm.tsa.ARIMA is used below
import statsmodels.formula.api as smf
stepwise_fit = auto_arima(df_day['count'],
                          trace=True,
                          suppress_warnings=True)
stepwise_fit.summary()
Performing stepwise search to minimize aic
ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=15485.928, Time=1.29 sec
ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=16082.799, Time=0.03 sec
ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=16076.953, Time=0.05 sec
ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=15868.409, Time=0.27 sec
ARIMA(0,1,0)(0,0,0)[0] : AIC=16080.802, Time=0.02 sec
ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=15577.259, Time=0.55 sec
ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=15544.333, Time=0.56 sec
ARIMA(3,1,2)(0,0,0)[0] intercept : AIC=inf, Time=1.19 sec
ARIMA(2,1,3)(0,0,0)[0] intercept : AIC=15334.875, Time=1.49 sec
ARIMA(1,1,3)(0,0,0)[0] intercept : AIC=15546.470, Time=1.10 sec
ARIMA(3,1,3)(0,0,0)[0] intercept : AIC=15311.143, Time=1.53 sec
ARIMA(4,1,3)(0,0,0)[0] intercept : AIC=15164.604, Time=1.69 sec
ARIMA(4,1,2)(0,0,0)[0] intercept : AIC=15301.116, Time=1.93 sec
ARIMA(5,1,3)(0,0,0)[0] intercept : AIC=15108.060, Time=1.90 sec
ARIMA(5,1,2)(0,0,0)[0] intercept : AIC=15151.929, Time=1.64 sec
ARIMA(5,1,4)(0,0,0)[0] intercept : AIC=15056.058, Time=2.09 sec
ARIMA(4,1,4)(0,0,0)[0] intercept : AIC=15151.651, Time=1.83 sec
ARIMA(5,1,5)(0,0,0)[0] intercept : AIC=14925.176, Time=2.65 sec
ARIMA(4,1,5)(0,0,0)[0] intercept : AIC=14951.563, Time=2.32 sec
ARIMA(5,1,5)(0,0,0)[0] : AIC=14923.674, Time=1.89 sec
ARIMA(4,1,5)(0,0,0)[0] : AIC=14949.732, Time=1.61 sec
ARIMA(5,1,4)(0,0,0)[0] : AIC=15054.277, Time=1.25 sec
ARIMA(4,1,4)(0,0,0)[0] : AIC=15149.548, Time=0.89 sec
Best model: ARIMA(5,1,5)(0,0,0)[0]
Total fit time: 29.778 seconds
Dep. Variable: | y | No. Observations: | 1212 |
---|---|---|---|
Model: | SARIMAX(5, 1, 5) | Log Likelihood | -7450.837 |
Date: | Wed, 10 May 2023 | AIC | 14923.674 |
Time: | 16:08:20 | BIC | 14979.765 |
Sample: | 0 | HQIC | 14944.793 |
- 1212 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ar.L1 | 0.9527 | 0.061 | 15.629 | 0.000 | 0.833 | 1.072 |
ar.L2 | -1.5558 | 0.046 | -33.884 | 0.000 | -1.646 | -1.466 |
ar.L3 | 1.0378 | 0.081 | 12.746 | 0.000 | 0.878 | 1.197 |
ar.L4 | -1.1024 | 0.049 | -22.362 | 0.000 | -1.199 | -1.006 |
ar.L5 | 0.1677 | 0.053 | 3.145 | 0.002 | 0.063 | 0.272 |
ma.L1 | -1.5405 | 0.053 | -28.943 | 0.000 | -1.645 | -1.436 |
ma.L2 | 1.8972 | 0.057 | 33.325 | 0.000 | 1.786 | 2.009 |
ma.L3 | -1.8915 | 0.057 | -33.031 | 0.000 | -2.004 | -1.779 |
ma.L4 | 1.4121 | 0.062 | 22.934 | 0.000 | 1.291 | 1.533 |
ma.L5 | -0.6891 | 0.036 | -19.358 | 0.000 | -0.759 | -0.619 |
sigma2 | 1.568e+04 | 237.142 | 66.111 | 0.000 | 1.52e+04 | 1.61e+04 |
Ljung-Box (L1) (Q): | 0.01 | Jarque-Bera (JB): | 210261.66 |
---|---|---|---|
Prob(Q): | 0.91 | Prob(JB): | 0.00 |
Heteroskedasticity (H): | 1.25 | Skew: | 4.43 |
Prob(H) (two-sided): | 0.03 | Kurtosis: | 66.94 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
#use the best order found by the stepwise search above
model_day=sm.tsa.ARIMA(df_train_day['count'], order=(5,1,5))
model_day=model_day.fit()
model_day.summary()
Dep. Variable: | count | No. Observations: | 1084 |
---|---|---|---|
Model: | ARIMA(5, 1, 5) | Log Likelihood | -6684.911 |
Date: | Wed, 10 May 2023 | AIC | 13391.822 |
Time: | 16:08:47 | BIC | 13446.684 |
Sample: | 0 | HQIC | 13412.592 |
- 1084 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ar.L1 | 0.9872 | 0.059 | 16.838 | 0.000 | 0.872 | 1.102 |
ar.L2 | -1.5865 | 0.045 | -35.588 | 0.000 | -1.674 | -1.499 |
ar.L3 | 1.0828 | 0.080 | 13.602 | 0.000 | 0.927 | 1.239 |
ar.L4 | -1.1353 | 0.047 | -23.966 | 0.000 | -1.228 | -1.042 |
ar.L5 | 0.1974 | 0.053 | 3.742 | 0.000 | 0.094 | 0.301 |
ma.L1 | -1.5795 | 0.052 | -30.481 | 0.000 | -1.681 | -1.478 |
ma.L2 | 1.9451 | 0.057 | 33.879 | 0.000 | 1.833 | 2.058 |
ma.L3 | -1.9274 | 0.059 | -32.557 | 0.000 | -2.043 | -1.811 |
ma.L4 | 1.4612 | 0.060 | 24.198 | 0.000 | 1.343 | 1.580 |
ma.L5 | -0.7163 | 0.038 | -18.685 | 0.000 | -0.791 | -0.641 |
sigma2 | 1.612e+04 | 250.719 | 64.298 | 0.000 | 1.56e+04 | 1.66e+04 |
Ljung-Box (L1) (Q): | 0.00 | Jarque-Bera (JB): | 195507.26 |
---|---|---|---|
Prob(Q): | 0.96 | Prob(JB): | 0.00 |
Heteroskedasticity (H): | 2.45 | Skew: | 4.61 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 68.17 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Make Predictions On Test Set
start=len(df_train_day)
end=len(df_train_day)+len(df_test_day)-1
pred_day=model_day.predict(start=start, end=end, typ='levels')
#print(pred_day) # dates did not appear so use the next line of code
pred_day.index=df_day.index[start:end+1]
print(pred_day)
time
2023-01-02 479.428567
2023-01-03 674.749061
2023-01-04 604.407490
2023-01-05 557.103754
2023-01-06 398.781992
...
2023-05-05 469.493291
2023-05-06 312.357834
2023-05-07 315.081241
2023-05-08 506.594428
2023-05-09 507.856525
Name: predicted_mean, Length: 128, dtype: float64
pred_day.plot(legend=True)
df_test_day['count'].plot(legend=True)
<AxesSubplot: xlabel='time'>
df_day['count'].mean()
229.16006600660066
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse_day=sqrt(mean_squared_error(pred_day, df_test_day['count']))
rmse_day
224.44111330608987
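Since the series mean printed above is about 229, an RMSE of 224 means this fit barely beats predicting a constant; a quick check, assuming df_train_day and df_test_day as above:
baseline = np.full(len(df_test_day), df_train_day['count'].mean())
print('Constant-mean baseline RMSE:', sqrt(mean_squared_error(df_test_day['count'], baseline)))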
df5 = df2.copy()
#df5 = df5.drop(['is_future', 'lag1', 'lag2', 'lag3'], axis=1)
df5.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 18336 entries, 2020-01-10 12:00:00 to 2023-05-09 23:00:00
Data columns (total 1 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 count 18336 non-null int64
dtypes: int64(1)
memory usage: 286.5 KB
df5['count'].plot(figsize=(12,5))
<AxesSubplot: xlabel='time'>
# the p-value is a probability and should be as low as possible
# the smaller the p-value, the stronger the evidence that the series is stationary
ad_test(df5['count'])
1. ADF : -22.210656144053754
2. P-Value : 0.0
3. Num Of Lags : 31
4. Num Of Observations Used For ADF Regression and Critical Values Calculation : 18304
5. Critical Values :
1% : -3.430707310823011
5% : -2.8616979180198356
10% : -2.5668540555869606
stepwise_fit5 = auto_arima(df5['count'],
                           trace=True,
                           suppress_warnings=True)
stepwise_fit5.summary()
Performing stepwise search to minimize aic
ARIMA(2,0,2)(0,0,0)[0] intercept : AIC=139245.469, Time=12.43 sec
ARIMA(0,0,0)(0,0,0)[0] intercept : AIC=160319.042, Time=0.23 sec
ARIMA(1,0,0)(0,0,0)[0] intercept : AIC=140155.291, Time=0.29 sec
ARIMA(0,0,1)(0,0,0)[0] intercept : AIC=148101.958, Time=2.04 sec
ARIMA(0,0,0)(0,0,0)[0] : AIC=169221.769, Time=0.13 sec
ARIMA(1,0,2)(0,0,0)[0] intercept : AIC=139761.479, Time=5.38 sec
ARIMA(2,0,1)(0,0,0)[0] intercept : AIC=139280.154, Time=8.03 sec
ARIMA(3,0,2)(0,0,0)[0] intercept : AIC=139282.702, Time=11.24 sec
ARIMA(2,0,3)(0,0,0)[0] intercept : AIC=138849.798, Time=19.91 sec
ARIMA(1,0,3)(0,0,0)[0] intercept : AIC=139740.324, Time=6.93 sec
ARIMA(3,0,3)(0,0,0)[0] intercept : AIC=139249.468, Time=16.14 sec
ARIMA(2,0,4)(0,0,0)[0] intercept : AIC=138392.690, Time=22.20 sec
ARIMA(1,0,4)(0,0,0)[0] intercept : AIC=139725.762, Time=9.09 sec
ARIMA(3,0,4)(0,0,0)[0] intercept : AIC=138687.604, Time=24.18 sec
ARIMA(2,0,5)(0,0,0)[0] intercept : AIC=138328.398, Time=14.43 sec
ARIMA(1,0,5)(0,0,0)[0] intercept : AIC=139462.275, Time=10.38 sec
ARIMA(3,0,5)(0,0,0)[0] intercept : AIC=138326.439, Time=19.29 sec
ARIMA(4,0,5)(0,0,0)[0] intercept : AIC=137966.103, Time=26.64 sec
ARIMA(4,0,4)(0,0,0)[0] intercept : AIC=138459.123, Time=27.90 sec
ARIMA(5,0,5)(0,0,0)[0] intercept : AIC=137865.368, Time=29.40 sec
ARIMA(5,0,4)(0,0,0)[0] intercept : AIC=139131.798, Time=27.98 sec
ARIMA(5,0,5)(0,0,0)[0] : AIC=140472.475, Time=14.99 sec
Best model: ARIMA(5,0,5)(0,0,0)[0] intercept
Total fit time: 309.258 seconds
Dep. Variable: | y | No. Observations: | 18336 |
---|---|---|---|
Model: | SARIMAX(5, 0, 5) | Log Likelihood | -68920.684 |
Date: | Wed, 10 May 2023 | AIC | 137865.368 |
Time: | 16:34:35 | BIC | 137959.167 |
Sample: | 0 | HQIC | 137896.186 |
- 18336 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
intercept | 0.6856 | 0.063 | 10.810 | 0.000 | 0.561 | 0.810 |
ar.L1 | 3.3027 | 0.031 | 108.077 | 0.000 | 3.243 | 3.363 |
ar.L2 | -5.0795 | 0.077 | -65.888 | 0.000 | -5.231 | -4.928 |
ar.L3 | 4.5577 | 0.093 | 49.226 | 0.000 | 4.376 | 4.739 |
ar.L4 | -2.3567 | 0.063 | -37.415 | 0.000 | -2.480 | -2.233 |
ar.L5 | 0.5307 | 0.021 | 25.760 | 0.000 | 0.490 | 0.571 |
ma.L1 | -2.4724 | 0.030 | -81.359 | 0.000 | -2.532 | -2.413 |
ma.L2 | 2.9964 | 0.053 | 56.326 | 0.000 | 2.892 | 3.101 |
ma.L3 | -2.0045 | 0.054 | -37.183 | 0.000 | -2.110 | -1.899 |
ma.L4 | 0.6717 | 0.031 | 21.894 | 0.000 | 0.612 | 0.732 |
ma.L5 | -0.0364 | 0.011 | -3.297 | 0.001 | -0.058 | -0.015 |
sigma2 | 106.6402 | 0.304 | 350.752 | 0.000 | 106.044 | 107.236 |
Ljung-Box (L1) (Q): | 0.13 | Jarque-Bera (JB): | 1126273.03 |
---|---|---|---|
Prob(Q): | 0.72 | Prob(JB): | 0.00 |
Heteroskedasticity (H): | 1.19 | Skew: | 1.96 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 41.19 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
split_date = '1-jan-2023'
df_train_day = df_day.loc[df_day.index <= split_date].copy()
df_test_day = df_day.loc[df_day.index > split_date].copy()
fig, ax = plt.subplots(figsize=(15,5))
df_train_day.plot(ax=ax, label='Training Data', title='Train/Test Data Split BY Day')
df_test_day.plot(ax=ax, label='Testing Data')
ax.axvline(split_date, color='black', ls='--')
ax.legend(['Training Data', 'Test Data'])
plt.show()
#from statsmodels.tsa.arima_model import ARIMA
model_day=sm.tsa.ARIMA(df_train_day['count'], order=(5,0,5))
model_day=model_day.fit()
model_day.summary()
Dep. Variable: | count | No. Observations: | 1084 |
---|---|---|---|
Model: | ARIMA(5, 0, 5) | Log Likelihood | -6698.985 |
Date: | Wed, 10 May 2023 | AIC | 13421.970 |
Time: | 16:44:37 | BIC | 13481.831 |
Sample: | 0 | HQIC | 13444.632 |
- 1084 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 230.3111 | 39.164 | 5.881 | 0.000 | 153.551 | 307.072 |
ar.L1 | 1.7576 | 0.022 | 81.197 | 0.000 | 1.715 | 1.800 |
ar.L2 | -2.1975 | 0.024 | -92.527 | 0.000 | -2.244 | -2.151 |
ar.L3 | 2.1699 | 0.034 | 64.358 | 0.000 | 2.104 | 2.236 |
ar.L4 | -1.7470 | 0.028 | -62.794 | 0.000 | -1.802 | -1.693 |
ar.L5 | 0.9487 | 0.023 | 41.549 | 0.000 | 0.904 | 0.993 |
ma.L1 | -1.4176 | 0.028 | -49.868 | 0.000 | -1.473 | -1.362 |
ma.L2 | 1.7757 | 0.042 | 42.510 | 0.000 | 1.694 | 1.858 |
ma.L3 | -1.6703 | 0.059 | -28.253 | 0.000 | -1.786 | -1.554 |
ma.L4 | 1.2575 | 0.047 | 26.648 | 0.000 | 1.165 | 1.350 |
ma.L5 | -0.5664 | 0.038 | -15.068 | 0.000 | -0.640 | -0.493 |
sigma2 | 1.678e+04 | 251.650 | 66.696 | 0.000 | 1.63e+04 | 1.73e+04 |
Ljung-Box (L1) (Q): | 4.49 | Jarque-Bera (JB): | 193826.67 |
---|---|---|---|
Prob(Q): | 0.03 | Prob(JB): | 0.00 |
Heteroskedasticity (H): | 2.63 | Skew: | 4.64 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 67.85 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
start=len(df_train_day)
end=len(df_train_day)+len(df_test_day)-1
pred_day=model_day.predict(start=start, end=end, typ='levels')
#print(pred_day) # dates did not appear so use the next line of code
pred_day.index=df_day.index[start:end+1]
print(pred_day)
time
2023-01-02 526.434333
2023-01-03 772.138844
2023-01-04 714.534101
2023-01-05 611.437116
2023-01-06 378.830014
...
2023-05-05 190.894294
2023-05-06 -15.404300
2023-05-07 26.984342
2023-05-08 295.538955
2023-05-09 421.600123
Name: predicted_mean, Length: 128, dtype: float64
pred_day.plot(legend=True)
df_test_day['count'].plot(legend=True)
<AxesSubplot: xlabel='time'>
df_day['count'].mean()
229.16006600660066
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse_day=sqrt(mean_squared_error(pred_day, df_test_day['count']))
rmse_day
157.9142119549536
#Fit the final model on all of df_day
model_day = sm.tsa.ARIMA(df_day['count'], order=(5,0,5))
model_day=model_day.fit()
df_day.tail() # predict into the future after training
count | |
---|---|
time | |
2023-05-05 | 286 |
2023-05-06 | 51 |
2023-05-07 | 51 |
2023-05-08 | 482 |
2023-05-09 | 387 |
index_future_dates=pd.date_range(start='2023-05-09', end='2023-12-31', freq='D') #note: the first out-of-sample step is really 2023-05-10, so these labels run a day early
pred_day=model_day.predict(start=len(df_day), end=len(df_day)+(236), typ='levels').rename('ARIMA Predictions BY Day')
pred_day.index=index_future_dates
print(pred_day)
2023-05-09 297.317413
2023-05-10 336.257542
2023-05-11 253.195036
2023-05-12 48.160946
2023-05-13 112.191123
...
2023-12-27 262.186749
2023-12-28 227.061904
2023-12-29 157.228323
2023-12-30 188.266458
2023-12-31 265.360301
Freq: D, Name: ARIMA Predictions BY Day, Length: 237, dtype: float64
pred_day.info()
<class 'pandas.core.series.Series'>
DatetimeIndex: 237 entries, 2023-05-09 to 2023-12-31
Freq: D
Series name: ARIMA Predictions BY Day
Non-Null Count Dtype
-------------- -----
237 non-null float64
dtypes: float64(1)
memory usage: 3.7 KB
pred_day.plot(figsize=(12,5),legend=True)
<AxesSubplot: >