Buffalo 311 Call Forecasting

Forecasting Buffalo 311 Call Volume Using ARIMA, FBProphet & XGBoost

September 01, 2023 · 109 mins read

Imports

import requests
import pandas as pd
import math
import datetime
import urllib.request
import json
import time
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet
import seaborn as sns

from sklearn.metrics import mean_squared_error, mean_absolute_error

pd.options.mode.chained_assignment = None  # default='warn'

import warnings
warnings.filterwarnings('ignore')

plt.style.use('ggplot')
plt.style.use('fivethirtyeight')

Retrieve Open Buffalo Data

#hide api key in a text file so it's not public
app_token = open('api_key.txt', 'r').read()
#app_token
#hide api token
#use requests to pull 311 data from Jan 10, 2020 until today / can slice data in pandas later
limit = 500000
app_token = open('api_key.txt', 'r').read()
uri = f"https://data.buffalony.gov/resource/whkc-e5vr.json?$limit={limit}&$$app_token={app_token}&$where=open_date>'2020-01-10T12:00:00'"
r = requests.get(uri)
print('Status code ',r.status_code)
print('Number of rows returned ',len(r.json()))
print('Encoded URI with params ',r.url)
new_json = r.json()
#new_json
Status code  200
Number of rows returned  277742
Encoded URI with params  https://data.buffalony.gov/resource/whkc-e5vr.json?$limit=500000&$$app_token=<app token redacted>&$where=open_date%3E'2020-01-10T12:00:00'
#make pandas df from  json / check head
df=pd.DataFrame(new_json)
print(df.shape)
df.head(2)
(277742, 33)
case_reference open_date closed_date status subject reason type object_type address_number address_line_1 ... census_tract_2010 census_block_group_2010 census_block_2010 tractce20 geoid20_tract geoid20_blockgroup geoid20_block address_line_2 x_coordinate y_coordinate
0 1001109603 2020-01-10T12:02:00.000 2020-01-22T13:35:00.000 Closed Dept of Public Works Sanitation Recycling Tote Deliver (Req_Serv) Property 420 PARKDALE ... 171 2 2000 017100 36029017100 360290171002 360290171002000 NaN NaN NaN
1 1001109605 2020-01-10T12:07:00.000 2020-01-14T13:28:00.000 Closed Dept of Public Works Rodent Control Rodents (Req_Serv) Property 92 PROCTOR ... 41 2 2002 004100 36029004100 360290041002 360290041002002 NaN NaN NaN

2 rows × 33 columns

#save original Buffalo 311 Data to csv
df.to_csv('Buffalo311Data.csv', index = None, header=True)

Format Open Buffalo Data Dates

#add count column with a value of one to rows to count calls by frequency
df['count'] = 1
#df.head()
#format open date to pd datetime to get ready to parse column
df['time'] = pd.to_datetime(df['open_date'])
#df.info() #check to make sure Dtype is datetime64
#take a look at the columns I need
#parsing by hour will provide 24 times more training data than doing it by day
df1 = df[['time', 'count']]
df1.head(3)
time count
0 2020-01-10 12:02:00 1
1 2020-01-10 12:07:00 1
2 2020-01-10 12:08:00 1

Parsing by hour provides roughly 24 times more training samples than parsing by day.

I believe that would equate to needing an extra six years of training data if I did it by day.

Since I am only testing on data since the start of 2023 and projecting roughly seven months ahead,

I think it is better to train on granular hourly data than on daily data going back an extra six years.

I also expect that more recent training data, especially post-COVID, will yield better results.
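As a quick sanity check on those sample counts, here is a minimal sketch using the df1 built above; the exact numbers depend on when the data was pulled.

#compare how many distinct hourly buckets vs daily buckets the pulled data actually contains
hourly_buckets = df1['time'].dt.floor('h').nunique()
daily_buckets = df1['time'].dt.floor('D').nunique()
print('hourly samples:', hourly_buckets)
print('daily samples:', daily_buckets)
print('ratio:', round(hourly_buckets / daily_buckets, 1))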

#use datetime formatting to truncate timestamps to the hour
df1['time'] = pd.to_datetime(df1['time']).dt.strftime('%Y-%m-%d %H')
df1.head()
time count
0 2020-01-10 12 1
1 2020-01-10 12 1
2 2020-01-10 12 1
3 2020-01-10 12 1
4 2020-01-10 12 1
#group by hour, count and reset index
df2 = df1.groupby(['time']).count().reset_index()
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18336 entries, 0 to 18335
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   time    18336 non-null  object
 1   count   18336 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 286.6+ KB
#make sure time is pd datetime after groupby; groupby returns the time column as an object dtype
df2['time'] = pd.to_datetime(df2['time'])
#set time as index
df2 = df2.set_index('time')
#df2.head()
#make a df by day instead of hour for some charts
#ultimately decided to use hourly data because I feel it would help with staffing, like letting people go home early
df_day = df1.copy()
df_day['time'] = pd.to_datetime(df_day['time']).dt.strftime('%Y-%m-%d')
df_day = df_day.groupby(['time']).count().reset_index()
df_day['time'] = pd.to_datetime(df_day['time'])
df_day = df_day.set_index('time')

#df_day.head()
#make graph of calls by day
color_pal = sns.color_palette()
df_day.plot(style='.',
         figsize=(10,5),
         ms=1,
         color = color_pal[0],
         title='311 Calls By Day')
plt.show()

png

#make graph of calls by hour
color_pal = sns.color_palette()
df2.plot(style='.',
         figsize=(10,5),
         ms=1,
         color = color_pal[0],
         title='311 Calls By Hour')
plt.show()

png

#create seasons for boxplot graph showing dayofweek and season
from pandas.api.types import CategoricalDtype

cat_type = CategoricalDtype(categories=['Monday', 'Tuesday',
                                       'Wednesday', 'Thursday',
                                       'Friday', 'Saturday', 'Sunday'],
                           ordered=True)




def create_features(df, label=None):
    """
    Creates time series features from datetime index.
    """
    df = df.copy()
    df['date'] = df.index
    df['dayofweek'] = df['date'].dt.dayofweek
    df['weekday'] = df['date'].dt.day_name()
    df['weekday'] = df['weekday'].astype(cat_type)
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week
    #df['weekofyear'] = df['date'].dt.weekofyear
    #encode month/day as month*100 + day and shift so ~March 20 maps to 0, lining up with the season bins below
    df['date_offset'] = (df.date.dt.month*100 + df.date.dt.day - 320)%1300
    
    df['season'] = pd.cut(df['date_offset'], [0, 300, 600, 900, 1300],
                         labels=['Spring', 'Summer', 'Fall', 'Winter'])


    X = df[['dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear', 'weekday', 'season']]
    if label:
        y = df[label]
        return X, y
    return X

X, y = create_features(df_day, label='count')

features_and_target = pd.concat([X, y], axis=1)
features_and_target.head()
dayofweek quarter month year dayofyear dayofmonth weekofyear weekday season count
time
2020-01-10 4 1 1 2020 10 10 2 Friday Winter 98
2020-01-11 5 1 1 2020 11 11 2 Saturday Winter 12
2020-01-12 6 1 1 2020 12 12 2 Sunday Winter 13
2020-01-13 0 1 1 2020 13 13 3 Monday Winter 368
2020-01-14 1 1 1 2020 14 14 3 Tuesday Winter 316
fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data=features_and_target.dropna(),
           x='weekday',
           y='count',
           hue='season',
           ax = ax,
           linewidth=1,
           palette='YlGnBu')

ax.set_title('Number of 311 Calls by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('311 Call Count')
ax.legend(bbox_to_anchor=(1,1))
plt.show()

png

fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data=features_and_target.dropna(),
           x='weekofyear',
           y='count',
           hue='season',
           ax = ax,
           linewidth=1,
           palette='YlGnBu')

ax.set_title('Number Of 311 Calls By Week Of Year')
ax.set_xlabel('Week')
ax.set_ylabel('311 Call Count')
plt.xticks(rotation = 90) # rotates x-axis labels by 90 degrees
ax.legend(bbox_to_anchor=(1,1))
plt.show()

png

Split the hourly data into a training set and a test set

split_date = '1-jan-2023'
df_train = df2.loc[df2.index <= split_date].copy()
df_test = df2.loc[df2.index > split_date].copy()

# plot train and test so you can see where the split is

df_test \
    .rename(columns={'count': 'TEST SET'}) \
    .join(df_train.rename(columns={'count': 'TRAINING SET'}),
         how='outer') \
    .plot(figsize=(15,5), title='Buffalo 311 Calls By Hour', style='.')
plt.show()

png

Train Prophet Model (datetime column named ds, target named y)

#format data for prophet model using ds and y
df_train_prophet = df_train.reset_index()\
    .rename(columns={'time': 'ds',
                    'count': 'y'})
df_train_prophet.head()
ds y
0 2020-01-10 12:00:00 21
1 2020-01-10 13:00:00 20
2 2020-01-10 14:00:00 22
3 2020-01-10 15:00:00 30
4 2020-01-10 16:00:00 3
%%time
model = Prophet()
model.fit(df_train_prophet)
15:18:59 - cmdstanpy - INFO - Chain [1] start processing
15:19:07 - cmdstanpy - INFO - Chain [1] done processing


CPU times: total: 1.42 s
Wall time: 9.67 s





<prophet.forecaster.Prophet at 0x2cc235e5f10>
%%time
# create test data with same column names
df_test_prophet = df_test.reset_index()\
    .rename(columns={'time': 'ds',
                    'count': 'y'})

df_test_fcst = model.predict(df_test_prophet)
CPU times: total: 2.02 s
Wall time: 541 ms
df_test_fcst.head()
ds trend yhat_lower yhat_upper trend_lower trend_upper additive_terms additive_terms_lower additive_terms_upper daily ... weekly weekly_lower weekly_upper yearly yearly_lower yearly_upper multiplicative_terms multiplicative_terms_lower multiplicative_terms_upper yhat
0 2023-01-01 01:00:00 19.038017 -18.082230 16.134619 19.038017 19.038017 -20.199082 -20.199082 -20.199082 -6.409008 ... -14.649280 -14.649280 -14.649280 0.859206 0.859206 0.859206 0.0 0.0 0.0 -1.161066
1 2023-01-01 03:00:00 19.041680 -16.196708 16.441544 19.041680 19.041680 -18.544646 -18.544646 -18.544646 -5.126092 ... -14.317852 -14.317852 -14.317852 0.899298 0.899298 0.899298 0.0 0.0 0.0 0.497034
2 2023-01-01 07:00:00 19.049007 -16.467086 17.694639 19.049007 19.049007 -18.550251 -18.550251 -18.550251 -6.533164 ... -12.996319 -12.996319 -12.996319 0.979233 0.979233 0.979233 0.0 0.0 0.0 0.498756
3 2023-01-01 08:00:00 19.050838 -1.699195 31.412708 19.050838 19.050838 -4.490101 -4.490101 -4.490101 7.049725 ... -12.538989 -12.538989 -12.538989 0.999163 0.999163 0.999163 0.0 0.0 0.0 14.560738
4 2023-01-01 09:00:00 19.052670 7.991571 42.268996 19.052670 19.052670 6.905000 6.905000 6.905000 17.921719 ... -12.035792 -12.035792 -12.035792 1.019073 1.019073 1.019073 0.0 0.0 0.0 25.957670

5 rows × 22 columns

fig, ax = plt.subplots(figsize=(10,5))
fig = model.plot(df_test_fcst, ax= ax)
plt.xticks(rotation = 45) # rotates x-axis by 45 degrees
ax.set_title('Prophet Forecast')
plt.show()

png

fig = model.plot_components(df_test_fcst)
plt.show()

png

Compare Forecast to Actuals

#plot the forecast with the actual
f, ax = plt.subplots(figsize=(15,5))
ax.scatter(df_test.index, df_test['count'], color ='r')
fig = model.plot(df_test_fcst, ax= ax)

png

Zoom into actual vs predicted

Red dots are actual; blue lines are predictions. There seems to be a mismatch between actual and predicted, probably due to the 2023 blizzard.

#zoomed in plot of forecast vs actual
import datetime

f, ax = plt.subplots(figsize=(10,5))
ax.scatter(df_test.index, df_test['count'], color ='r')
fig = model.plot(df_test_fcst, ax= ax)
# ax.set_xbound(lower='01-01-2015',
#              upper='02-01-2015')
ax.set_xlim([datetime.date(2023, 1, 1), datetime.date(2023, 2, 1)])
ax.set_ylim(0,110)
plot = plt.suptitle('January 2023 Forecast vs Actuals')

png

Zoom into first week of Jan / Blizzard affected call projections

Seems like the 2023 blizzard caused more calls than predicted, especially when you compare to the first week of April

#zoomed in plot of forecast vs actual
import datetime

f, ax = plt.subplots(figsize=(10,5))
ax.scatter(df_test.index, df_test['count'], color ='r')
fig = model.plot(df_test_fcst, ax= ax)
# ax.set_xbound(lower='01-01-2015',
#              upper='02-01-2015')
ax.set_xlim([datetime.date(2023, 1, 1), datetime.date(2023, 1, 7)])
ax.set_ylim(0,100)
plot = plt.suptitle('First Week Of January 2023 Forecast vs Actuals')

png

Zoom Into First Week Of April 2023

April's prediction is looking better than the first week of Jan
Outliers in the first week of Jan are most likely from the blizzard
#zoomed in plot of forecast vs actual
import datetime

f, ax = plt.subplots(figsize=(10,5))
ax.scatter(df_test.index, df_test['count'], color ='r')
fig = model.plot(df_test_fcst, ax= ax)
# ax.set_xbound(lower='01-01-2015',
#              upper='02-01-2015')
ax.set_xlim([datetime.date(2023, 4, 1), datetime.date(2023, 4, 7)])
ax.set_ylim(0,70)
plot = plt.suptitle('First Week Of April 2023 Forecast vs Actuals')

png

Evaluate The Model With Error Metrics

# true value vs predicted value
np.sqrt(mean_squared_error(y_true=df_test['count'],
                          y_pred=df_test_fcst['yhat']))
18.53331967782901
mean_absolute_error(y_true=df_test['count'],
                   y_pred=df_test_fcst['yhat'])
16.444075387619147
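Since the same two metrics get recomputed for the holiday model below, a small helper keeps the comparison consistent. This is just a convenience sketch around the sklearn metrics already imported; the function name is only for illustration.

def evaluate_forecast(y_true, y_pred, label=''):
    #convenience wrapper: print RMSE and MAE side by side for a forecast
    rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
    mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
    print(f'{label} RMSE: {rmse:0.2f}  MAE: {mae:0.2f}')
    return rmse, mae

#example: the baseline Prophet forecast evaluated above
evaluate_forecast(df_test['count'], df_test_fcst['yhat'], label='Prophet baseline')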

Adding Holidays To Prophet Model

# set up holiday dataframe
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

cal = calendar()


holidays = cal.holidays(start=df2.index.min(),
                        end=df2.index.max(),
                        return_name=True)
holiday_df = pd.DataFrame(data=holidays,
                          columns=['holiday'])
holiday_df = holiday_df.reset_index().rename(columns={'index':'ds'})
holiday_df.head()
ds holiday
0 2020-01-20 Birthday of Martin Luther King, Jr.
1 2020-02-17 Washington’s Birthday
2 2020-05-25 Memorial Day
3 2020-07-03 Independence Day
4 2020-09-07 Labor Day
holiday_df['holiday'].value_counts()
Birthday of Martin Luther King, Jr.     4
Washington’s Birthday                   4
Memorial Day                            3
Independence Day                        3
Labor Day                               3
Columbus Day                            3
Veterans Day                            3
Thanksgiving Day                        3
Christmas Day                           3
New Year's Day                          3
Juneteenth National Independence Day    2
Name: holiday, dtype: int64
import time
#run Prophet model with holiday df as a parameter for holiday
#%%time

model_with_holidays = Prophet(holidays=holiday_df)
model_with_holidays.fit(df_train_prophet)
15:35:23 - cmdstanpy - INFO - Chain [1] start processing
15:35:31 - cmdstanpy - INFO - Chain [1] done processing





<prophet.forecaster.Prophet at 0x2cc2cfe3c40>
# predict on the test set with the holiday model
df_test_fcst_with_hols = model_with_holidays.predict(df=df_test_prophet)
df_test_fcst_with_hols.head()
ds trend yhat_lower yhat_upper trend_lower trend_upper Birthday of Martin Luther King, Jr. Birthday of Martin Luther King, Jr._lower Birthday of Martin Luther King, Jr._upper Christmas Day ... weekly weekly_lower weekly_upper yearly yearly_lower yearly_upper multiplicative_terms multiplicative_terms_lower multiplicative_terms_upper yhat
0 2023-01-01 01:00:00 19.962327 -17.298810 16.125482 19.962327 19.962327 0.0 0.0 0.0 0.0 ... -15.474464 -15.474464 -15.474464 1.114713 1.114713 1.114713 0.0 0.0 0.0 -0.726449
1 2023-01-01 03:00:00 19.966306 -15.429744 17.809987 19.966306 19.966306 0.0 0.0 0.0 0.0 ... -15.147649 -15.147649 -15.147649 1.160615 1.160615 1.160615 0.0 0.0 0.0 0.815920
2 2023-01-01 07:00:00 19.974263 -15.194841 17.696200 19.974263 19.974263 0.0 0.0 0.0 0.0 ... -13.741419 -13.741419 -13.741419 1.251959 1.251959 1.251959 0.0 0.0 0.0 0.869277
3 2023-01-01 08:00:00 19.976252 -0.710243 32.056320 19.976252 19.976252 0.0 0.0 0.0 0.0 ... -13.242802 -13.242802 -13.242802 1.274699 1.274699 1.274699 0.0 0.0 0.0 15.025595
4 2023-01-01 09:00:00 19.978242 10.426556 42.368802 19.978242 19.978242 0.0 0.0 0.0 0.0 ... -12.690424 -12.690424 -12.690424 1.297399 1.297399 1.297399 0.0 0.0 0.0 26.507473

5 rows × 58 columns

model_with_holidays.plot_components(
    df_test_fcst_with_hols)

png

png

#zoomed in plot of forecast vs actual

# red dots show actual vs projected line in blue
import datetime

f, ax = plt.subplots(figsize=(10,5))
ax.scatter(df_test.index, df_test['count'], color ='r')
fig = model.plot(df_test_fcst_with_hols, ax= ax)
# ax.set_xbound(lower='07-01-2015',
#              upper='07-07-2015')
ax.set_xlim([datetime.date(2023, 1, 1), datetime.date(2023, 5, 1)])
ax.set_ylim(0,110)
plt.xticks(rotation = 45) # rotates x-axis by 45 degrees
plot = plt.suptitle('January to April Forecast vs Actuals')

png

#zoomed in plot of forecast vs actual
import datetime

f, ax = plt.subplots(figsize=(10,5))
ax.scatter(df_test.index, df_test['count'], color ='r')
fig = model.plot(df_test_fcst_with_hols, ax= ax)
# ax.set_xbound(lower='07-01-2015',
#              upper='07-07-2015')
ax.set_xlim([datetime.date(2023, 4, 1), datetime.date(2023, 4, 7)])
ax.set_ylim(0,80)
plot = plt.suptitle('First Week Of April Forecast vs Actuals With Holidays')

png

# true value vs predicted value
np.sqrt(mean_squared_error(y_true=df_test['count'],
                          y_pred=df_test_fcst_with_hols['yhat']))
19.14157031066351
mean_absolute_error(y_true=df_test['count'],
                   y_pred=df_test_fcst_with_hols['yhat'])
17.139861907579142

Predict into the future

#both models were fit on the same training history, so either can build the future dataframe
future = model.make_future_dataframe(periods=365*24, freq='h', include_history=False)
forecast = model_with_holidays.predict(future)
forecast.head()
ds trend yhat_lower yhat_upper trend_lower trend_upper Birthday of Martin Luther King, Jr. Birthday of Martin Luther King, Jr._lower Birthday of Martin Luther King, Jr._upper Christmas Day ... weekly weekly_lower weekly_upper yearly yearly_lower yearly_upper multiplicative_terms multiplicative_terms_lower multiplicative_terms_upper yhat
0 2023-01-01 01:00:00 19.962327 -16.882693 15.841012 19.962327 19.962327 0.0 0.0 0.0 0.0 ... -15.474464 -15.474464 -15.474464 1.114713 1.114713 1.114713 0.0 0.0 0.0 -0.726449
1 2023-01-01 02:00:00 19.964316 -12.946475 17.548705 19.964316 19.964316 0.0 0.0 0.0 0.0 ... -15.343241 -15.343241 -15.343241 1.137683 1.137683 1.137683 0.0 0.0 0.0 2.207288
2 2023-01-01 03:00:00 19.966306 -14.783082 17.679284 19.966306 19.966306 0.0 0.0 0.0 0.0 ... -15.147649 -15.147649 -15.147649 1.160615 1.160615 1.160615 0.0 0.0 0.0 0.815920
3 2023-01-01 04:00:00 19.968295 -21.090066 11.068481 19.968295 19.968295 0.0 0.0 0.0 0.0 ... -14.888335 -14.888335 -14.888335 1.183508 1.183508 1.183508 0.0 0.0 0.0 -4.982981
4 2023-01-01 05:00:00 19.970284 -28.096313 3.966954 19.970284 19.970284 0.0 0.0 0.0 0.0 ... -14.566408 -14.566408 -14.566408 1.206364 1.206364 1.206364 0.0 0.0 0.0 -10.279008

5 rows × 58 columns

fig, ax = plt.subplots(figsize=(10,5))
fig = model.plot(forecast, ax= ax)
ax.set_title('Prophet Forecast')
plt.show()

png

#zoomed in plot of forecast
import datetime

fig, ax = plt.subplots(figsize=(10,5))
fig = model.plot(forecast, ax= ax)
ax.set_xlim([datetime.date(2023, 1, 1), datetime.date(2023, 12, 31)])
ax.set_ylim(0,100)
plt.xticks(rotation = 45) # rotates x-axis by 45 degrees
ax.set_title('Prophet Forecast')
plt.show()


png

#zoomed in plot of forecast for the first week of June
import datetime

fig, ax = plt.subplots(figsize=(10,5))
fig = model.plot(forecast, ax= ax)
ax.set_xlim([datetime.date(2023, 6, 1), datetime.date(2023, 6, 7)])
ax.set_ylim(0,80)
ax.set_title('Prophet Forecast First Week Of June')
plt.show()

png

#zoomed in plot of forecast for the first Friday of June
import datetime

fig, ax = plt.subplots(figsize=(10,5))
fig = model.plot(forecast, ax= ax)
ax.set_xlim([datetime.date(2023, 6, 2), datetime.date(2023, 6, 3)])
ax.set_ylim(0,70)
ax.set_title('Prophet Forecast First Friday Of June 2023 By Hour')
plt.show()

png

The graph above shows a peak call time between 9am and 3pm on Friday, June 2nd.
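The same peak can be read straight off the forecast dataframe instead of the chart. A small sketch; forecast and its ds / yhat columns come from the Prophet prediction above.

#show the hours with the highest predicted call volume on Friday June 2nd
june_2 = forecast[forecast['ds'].dt.date == datetime.date(2023, 6, 2)]
print(june_2.set_index('ds')['yhat'].sort_values(ascending=False).head(6))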

XGBOOST Forecasting

split_date = '1-jan-2023'
df_train2 = df2.loc[df2.index <= split_date].copy()
df_test2 = df2.loc[df2.index > split_date].copy()

fig, ax = plt.subplots(figsize=(15,5))
df_train2.plot(ax=ax, label='Training Data', title='Train/Test Data Split')
df_test2.plot(ax=ax, label='Testing Data')
ax.axvline(split_date, color='black', ls='--')
ax.legend(['Training Data', 'Test Data'])
plt.show()

png

#check out two weeks in Jan 2022 to get a better picture of the data
df_train2.loc[(df_train2.index > '01-01-2022') & (df_train2.index < '01-15-2022')].plot(figsize=(15,5), title='Two Weeks in Jan 2022')
plt.show()

png

Create more features to add to the model

def create_features_2(df):
    '''
    create time series features based on index date
    '''
    df = df.copy()
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    df['quarter'] = df.index.quarter
    df['month'] = df.index.month
    df['year'] = df.index.year
    df['dayofyear'] = df.index.dayofyear
    return df

df3 = create_features_2(df2)

Visualize Data By New Features

df3.head(2)
df3.shape
(18336, 7)
fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data = df3, x='hour', y='count')
ax.set_title('Buffalo 311 Call Count By Hour')
plt.show()

png

fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data = df3, x='month', y='count', palette='Blues')
ax.set_title('Buffalo 311 Call Count By Month')
plt.show()

png

fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data = df3, x='quarter', y='count', palette='Blues')
ax.set_title('Buffalo 311 Call Count By Quarter')
plt.show()

png

#note that 2023 data only runs through 05/09/2023
fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data = df3, x='year', y='count', palette = 'Reds')
ax.set_title('Buffalo 311 Call Count By Year')
plt.show()

png

Seems like 2023 is going to be a blowout year for 311 calls, since it is already equal to the 2021 call count (see the yearly totals below)
We are about to enter the 3rd quarter, which is historically the largest quarter according to the quarter graph above
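That comparison can be checked directly from the hourly features; a quick sketch using df3, keeping in mind that 2023 only runs through early May here.

#total 311 calls recorded per calendar year in the pulled data
print(df3.groupby('year')['count'].sum())
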
fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data = df3, x='dayofweek', y='count', palette='Greens')
ax.set_title('Buffalo 311 Call Count By Day Of Week')
plt.show()

png

fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data = df3, x='dayofyear', y='count', palette='Purples')
ax.set_title('Buffalo 311 Call Count By Day Of Year')
plt.show()

png

Create XGBOOST Model

import xgboost as xgb
from sklearn.metrics import mean_squared_error
df2.head()
count
time
2020-01-10 12:00:00 21
2020-01-10 13:00:00 20
2020-01-10 14:00:00 22
2020-01-10 15:00:00 30
2020-01-10 16:00:00 3
df_train = create_features_2(df_train)
df_test = create_features_2(df_test)
print(df_train.columns)
print(df_test.columns)
Index(['count', 'hour', 'dayofweek', 'quarter', 'month', 'year', 'dayofyear'], dtype='object')
Index(['count', 'hour', 'dayofweek', 'quarter', 'month', 'year', 'dayofyear'], dtype='object')
features = ['hour', 'dayofweek', 'quarter', 'month', 'year', 'dayofyear']
target = 'count'
X_train = df_train[features]
y_train = df_train[target]

X_test = df_test[features]
y_test = df_test[target]
#had to lower the learning rate to yield better error metrics and keep the model from overfitting
reg = xgb.XGBRegressor(n_estimators = 1000, early_stopping_rounds = 50,
                      learning_rate = 0.001)
reg.fit(X_train, y_train,
       eval_set = [(X_train, y_train), (X_test, y_test)],
       verbose = 75)
[0]	validation_0-rmse:24.42930	validation_1-rmse:21.25184
[75]	validation_0-rmse:23.02614	validation_1-rmse:19.67356
[150]	validation_0-rmse:21.74553	validation_1-rmse:18.24009
[225]	validation_0-rmse:20.57594	validation_1-rmse:17.05880
[300]	validation_0-rmse:19.51455	validation_1-rmse:15.93974
[375]	validation_0-rmse:18.54768	validation_1-rmse:14.92595
[450]	validation_0-rmse:17.66992	validation_1-rmse:14.00971
[525]	validation_0-rmse:16.87422	validation_1-rmse:13.17353
[600]	validation_0-rmse:16.15464	validation_1-rmse:12.41610
[675]	validation_0-rmse:15.50567	validation_1-rmse:11.74820
[750]	validation_0-rmse:14.91105	validation_1-rmse:11.12767
[825]	validation_0-rmse:14.37133	validation_1-rmse:10.58090
[900]	validation_0-rmse:13.87591	validation_1-rmse:10.11855
[975]	validation_0-rmse:13.41815	validation_1-rmse:9.75119
[999]	validation_0-rmse:13.27996	validation_1-rmse:9.65540
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=50,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.001, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

Feature Importance

from xgboost import plot_importance
plot_importance(reg)
plt.show()

png

Forecast on the Test Data

df_test['prediction'] = reg.predict(X_test)
df3 = df3.merge(df_test[['prediction']], how='left', left_index=True, right_index=True)
ax = df3[['count']].plot(figsize = (15,5))
df3['prediction'].plot(ax=ax, style='.')
plt.legend(['True Data', 'Predictions'])
ax.set_title('Raw Data & Predictions')
plt.show()

png

Zoom into the test vs projections

Feb projection (blue) vs actual (red)

ax = df_test.loc[(df_test.index > '02-01-2023') & (df_test.index < '03-01-2023')]['count'] \
    .plot(figsize=(15,5), title = ' Feb 2023')
df_test.loc[(df_test.index > '02-01-2023') & (df_test.index <'03-01-2023')]['prediction'] \
    .plot(style='.')
plt.legend(['Real Data', 'Prediction'])
plt.show()

png

score = np.sqrt(mean_squared_error(df_test['count'], df_test['prediction']))
print(f'RMSE Score on Test set: {score:0.2f}')
RMSE Score on Test set: 9.66

Calculate Error For Best And Worst Days

Worst Projected Days Happened Close to Blizzard

df_test['error'] = np.abs(df_test[target] - df_test['prediction'])
df_test['date'] = df_test.index.date
#worst projected days happened close to the blizzard
df_test.groupby('date')['error'].mean().sort_values(ascending=False).head(5)
date
2023-01-05    19.826350
2023-01-03    17.533686
2023-02-24    17.288375
2023-01-04    16.783298
2023-01-09    14.823705
Name: error, dtype: float64
#best projected days of forecast
df_test.groupby('date')['error'].mean().sort_values(ascending=True).head(5)
date
2023-01-15    0.749523
2023-03-25    0.754664
2023-01-29    0.799345
2023-04-30    0.824599
2023-03-18    0.903888
Name: error, dtype: float64

Outlier Analysis

Decided to keep the outliers in for the k-fold XGBoost model
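For reference, if the outliers were going to be removed instead, one simple option would be to cap the extreme hours. This is a sketch of the alternative only, not applied here; the threshold of 50 and the variable name just mirror the query used in the plots below.

#alternative (not used): cap hourly counts at a chosen threshold instead of keeping the raw spikes
df2_capped = df2.copy()
df2_capped['count'] = df2_capped['count'].clip(upper=50)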

df2['count'].plot(kind='hist', bins=1000)
<AxesSubplot: ylabel='Frequency'>

png

df2.query('count < 10').plot(figsize=(15,5), style='.')
<AxesSubplot: xlabel='time'>

png

df2.query('count > 50').plot(figsize=(15,5), style='.')
<AxesSubplot: xlabel='time'>

png

split_date = '1-jan-2023'
df_train3 = df2.loc[df2.index <= split_date].copy()
df_test3 = df2.loc[df2.index > split_date].copy()

fig, ax = plt.subplots(figsize=(15,5))
df_train3.plot(ax=ax, label='Training Data', title='Train/Test Data Split')
df_test3.plot(ax=ax, label='Testing Data')
ax.axvline(split_date, color='black', ls='--')
ax.legend(['Training Data', 'Test Data'])
plt.show()

png

Time Series Cross Validation

from sklearn.model_selection import TimeSeriesSplit
df2.shape
(18336, 1)
print(df2.head())
print(df2.tail())
                     count
time                      
2020-01-10 12:00:00     21
2020-01-10 13:00:00     20
2020-01-10 14:00:00     22
2020-01-10 15:00:00     30
2020-01-10 16:00:00      3
                     count
time                      
2023-05-09 19:00:00      5
2023-05-09 20:00:00      1
2023-05-09 21:00:00      2
2023-05-09 22:00:00     17
2023-05-09 23:00:00      2
#tss = TimeSeriesSplit(n_splits=2, test_size=24*365, gap = 24)
#8 folds, each with a test window of 24*60 = 1,440 hourly rows and a 24-row gap after the training data
tss = TimeSeriesSplit(n_splits=8, test_size=24*60*1, gap = 24)
df4 = df2.sort_index()
df4.head()
count
time
2020-01-10 12:00:00 21
2020-01-10 13:00:00 20
2020-01-10 14:00:00 22
2020-01-10 15:00:00 30
2020-01-10 16:00:00 3
fig, axs = plt.subplots(8, 1, figsize=(15,15),
                            sharex = True)

fold = 0

for train_idx, val_idx in tss.split(df4):
    train = df4.iloc[train_idx]
    test = df4.iloc[val_idx]
    train['count'].plot(ax=axs[fold],
                        label='Training Set', 
                        title=f'Data Train/Test Split Fold {fold}')
    test['count'].plot(ax=axs[fold],
                       label = 'Test Set')
    axs[fold].axvline(test.index.min(), color='black', ls='--')
    fold +=1

png
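To see what those split parameters mean in calendar terms, here is a quick printout. Each test fold is 24*60 = 1,440 hourly rows, which spans a bit more than 60 calendar days here because only hours with at least one call appear in the index.

#print the date range covered by each cross-validation fold
for i, (train_idx, val_idx) in enumerate(tss.split(df4)):
    print(f'Fold {i}: train ends {df4.index[train_idx[-1]]}, '
          f'test {df4.index[val_idx[0]]} -> {df4.index[val_idx[-1]]}')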

# make function for new time features 
def create_features4(df):
    """
    Create time series features based on time series index.
    """
    df = df.copy()
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    df['quarter'] = df.index.quarter
    df['month'] = df.index.month
    df['year'] = df.index.year
    df['dayofyear'] = df.index.dayofyear
    df['dayofmonth'] = df.index.day
    df['weekofyear'] = df.index.isocalendar().week
    return df

df4 = create_features4(df4)
df4.head()
count hour dayofweek quarter month year dayofyear dayofmonth weekofyear
time
2020-01-10 12:00:00 21 12 4 1 1 2020 10 10 2
2020-01-10 13:00:00 20 13 4 1 1 2020 10 10 2
2020-01-10 14:00:00 22 14 4 1 1 2020 10 10 2
2020-01-10 15:00:00 30 15 4 1 1 2020 10 10 2
2020-01-10 16:00:00 3 16 4 1 1 2020 10 10 2

Lag Features

# make lag feature function
def add_lags(df):
    target_map = df['count'].to_dict()
    df['lag1'] = (df.index - pd.Timedelta('182 days')).map(target_map)
    df['lag2'] = (df.index - pd.Timedelta('364 days')).map(target_map)
    df['lag3'] = (df.index - pd.Timedelta('728 days')).map(target_map)
    return df
#test lags function
#df4 = add_lags(df)
#check df after lags function test
#df4.tail()
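
To make the lag mapping concrete, here is a small check; the timestamp is just an arbitrary example, and .get() returns None for hours with no recorded calls.

#lag1 at time t is simply the count observed 182 days before t, looked up from a dict keyed by the index
target_map = df4['count'].to_dict()
t = pd.Timestamp('2023-05-09 12:00:00')
print('count 182 days before t:', target_map.get(t - pd.Timedelta('182 days')))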

Train Using Cross Validation

tss = TimeSeriesSplit(n_splits=8, test_size=24*60*1, gap = 24)
df4 = df2.sort_index()

fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(df4):
    train = df4.iloc[train_idx]
    test = df4.iloc[val_idx]
    
    train = create_features4(train)
    test = create_features4(test)
    
    train = add_lags(train)
    test = add_lags(test)
    
    FEATURES4 = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year',
                'lag1','lag2', 'lag3']
    TARGET4 = 'count'
    
    X_train = train[FEATURES4]
    y_train = train[TARGET4]

    X_test = test[FEATURES4]
    y_test = test[TARGET4]
    
    
    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                           n_estimators=100000,
                           early_stopping_rounds=50,
                           objective='reg:linear',
                           max_depth=3,
                           learning_rate=0.1)
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=10000)

    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)
    
    
[15:49:23] WARNING: c:\users\dev-admin\croot2\xgboost-split_1675461376218\work\src\objective\regression_obj.cu:213: reg:linear is now deprecated in favor of reg:squarederror.
[0]	validation_0-rmse:20.28806	validation_1-rmse:25.23168
[365]	validation_0-rmse:6.34963	validation_1-rmse:9.52946
[15:49:23] WARNING: c:\users\dev-admin\croot2\xgboost-split_1675461376218\work\src\objective\regression_obj.cu:213: reg:linear is now deprecated in favor of reg:squarederror.
[0]	validation_0-rmse:21.08501	validation_1-rmse:20.03218
[495]	validation_0-rmse:6.41239	validation_1-rmse:7.29066
[15:49:24] WARNING: c:\users\dev-admin\croot2\xgboost-split_1675461376218\work\src\objective\regression_obj.cu:213: reg:linear is now deprecated in favor of reg:squarederror.
[0]	validation_0-rmse:20.86114	validation_1-rmse:29.95618
[163]	validation_0-rmse:7.21473	validation_1-rmse:21.61451
[15:49:25] WARNING: c:\users\dev-admin\croot2\xgboost-split_1675461376218\work\src\objective\regression_obj.cu:213: reg:linear is now deprecated in favor of reg:squarederror.
[0]	validation_0-rmse:22.05514	validation_1-rmse:22.50675
[252]	validation_0-rmse:7.36629	validation_1-rmse:11.83633
[15:49:25] WARNING: c:\users\dev-admin\croot2\xgboost-split_1675461376218\work\src\objective\regression_obj.cu:213: reg:linear is now deprecated in favor of reg:squarederror.
[0]	validation_0-rmse:22.03981	validation_1-rmse:26.97997
[972]	validation_0-rmse:6.62589	validation_1-rmse:11.07590
[15:49:26] WARNING: c:\users\dev-admin\croot2\xgboost-split_1675461376218\work\src\objective\regression_obj.cu:213: reg:linear is now deprecated in favor of reg:squarederror.
[0]	validation_0-rmse:22.46248	validation_1-rmse:21.76932
[3115]	validation_0-rmse:5.52317	validation_1-rmse:7.01527
[15:49:31] WARNING: c:\users\dev-admin\croot2\xgboost-split_1675461376218\work\src\objective\regression_obj.cu:213: reg:linear is now deprecated in favor of reg:squarederror.
[0]	validation_0-rmse:22.24770	validation_1-rmse:28.82747
[837]	validation_0-rmse:6.73216	validation_1-rmse:19.81065
[15:49:33] WARNING: c:\users\dev-admin\croot2\xgboost-split_1675461376218\work\src\objective\regression_obj.cu:213: reg:linear is now deprecated in favor of reg:squarederror.
[0]	validation_0-rmse:22.71290	validation_1-rmse:20.37634
[239]	validation_0-rmse:8.34257	validation_1-rmse:12.02755
print(f'Scores across all folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')
Scores across all folds 12.4270
Fold scores:[9.432683974684368, 7.263089487308816, 21.508818615589814, 11.799599285719262, 11.072779306904335, 7.013240970100097, 19.336143148373846, 11.989809586405414]

Retrain model on all training data to prepare for forecasting the future

#retrain on all of the data to prepare for forecasting
#still leverage all data for the forecast prediction
#switched n_estimators to 100,000 because the model still seemed to be improving
#also experimented with the learning rate (see the commented-out value below)

df4 = create_features4(df4)
df4 = add_lags(df4)

FEATURES4 = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year',
                'lag1','lag2', 'lag3']
TARGET4 = 'count'

X_all = df4[FEATURES4]
y_all = df4[TARGET4]

reg = xgb.XGBRegressor(base_score=0.5,
                       booster='gbtree',
                       #n_estimators=25000,
                       n_estimators=100000,
                       early_stopping_rounds=50,
                       objective='reg:linear',
                       max_depth=3,
                       #learning_rate=0.0001) # LEARNING FASTER SEEMS TO BOOST PERFORMANCE OF THE MODEL
                       learning_rate=0.1)

reg.fit(X_all, y_all,
       eval_set=[(X_all, y_all)],
       verbose=10000)
[15:50:49] WARNING: c:\users\dev-admin\croot2\xgboost-split_1675461376218\work\src\objective\regression_obj.cu:213: reg:linear is now deprecated in favor of reg:squarederror.
[0]	validation_0-rmse:22.43720
[10000]	validation_0-rmse:4.73683
[20000]	validation_0-rmse:4.05477
[30000]	validation_0-rmse:3.67679
[40000]	validation_0-rmse:3.42238
[50000]	validation_0-rmse:3.23311
[60000]	validation_0-rmse:3.07159
[70000]	validation_0-rmse:2.93679
[80000]	validation_0-rmse:2.82711
[90000]	validation_0-rmse:2.72784
[99999]	validation_0-rmse:2.63702
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=50,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100000, n_jobs=None, num_parallel_tree=None,
             objective='reg:linear', predictor=None, ...)

Lowered the RMSE after changing n_estimators to 100K and adjusting the learning rate

Forecasting The Future

#find max value and set range from the last date
FutureStartDate = df4.index.max()
'''
df_day['time'] = pd.to_datetime(df_day['time']).dt.strftime('%Y-%m-%d')
'''
#FutureStartDate = FutureStartDate.strftime('%Y-%m-%d')
FutureStartDate
Timestamp('2023-05-09 23:00:00')
#can't project further ahead than the smallest lag (182 days); beyond that the lag features have no history to map from
future = pd.date_range('2023-05-04', '11/02/2023' , freq='1h')
future_df = pd.DataFrame(index=future)
future_df['is_future'] = True
df4['is_future'] = False
df_and_future = pd.concat([df4, future_df])
df_and_future = create_features4(df_and_future)
df_and_future = add_lags(df_and_future)
df_and_future
count hour dayofweek quarter month year dayofyear dayofmonth weekofyear lag1 lag2 lag3 is_future
2020-01-10 12:00:00 21.0 12 4 1 1 2020 10 10 2 NaN NaN NaN False
2020-01-10 13:00:00 20.0 13 4 1 1 2020 10 10 2 NaN NaN NaN False
2020-01-10 14:00:00 22.0 14 4 1 1 2020 10 10 2 NaN NaN NaN False
2020-01-10 15:00:00 30.0 15 4 1 1 2020 10 10 2 NaN NaN NaN False
2020-01-10 16:00:00 3.0 16 4 1 1 2020 10 10 2 NaN NaN NaN False
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2023-11-01 20:00:00 NaN 20 2 4 11 2023 305 1 44 3.0 4.0 NaN True
2023-11-01 21:00:00 NaN 21 2 4 11 2023 305 1 44 1.0 2.0 1.0 True
2023-11-01 22:00:00 NaN 22 2 4 11 2023 305 1 44 1.0 NaN 1.0 True
2023-11-01 23:00:00 NaN 23 2 4 11 2023 305 1 44 2.0 NaN 1.0 True
2023-11-02 00:00:00 NaN 0 3 4 11 2023 306 2 44 NaN NaN NaN True

22705 rows × 13 columns
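The smallest-lag constraint noted above can be checked directly: with lag1 looking back 182 days, real lag values only exist up to 182 days past the last observation. A quick sketch:

#furthest future date for which lag1 can still be looked up from history
last_obs = df4.index.max()
print('last observation:', last_obs)
print('lag1 has history through:', last_obs + pd.Timedelta('182 days'))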

#may need to go in and fill the lag NaNs with ffill (a simple sketch follows after the table below)
future_with_features = df_and_future.query('is_future').copy()
future_with_features
count hour dayofweek quarter month year dayofyear dayofmonth weekofyear lag1 lag2 lag3 is_future
2023-05-04 00:00:00 NaN 0 3 2 5 2023 124 4 18 NaN NaN NaN True
2023-05-04 01:00:00 NaN 1 3 2 5 2023 124 4 18 NaN NaN NaN True
2023-05-04 02:00:00 NaN 2 3 2 5 2023 124 4 18 NaN NaN NaN True
2023-05-04 03:00:00 NaN 3 3 2 5 2023 124 4 18 NaN NaN NaN True
2023-05-04 04:00:00 NaN 4 3 2 5 2023 124 4 18 NaN 1.0 NaN True
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2023-11-01 20:00:00 NaN 20 2 4 11 2023 305 1 44 3.0 4.0 NaN True
2023-11-01 21:00:00 NaN 21 2 4 11 2023 305 1 44 1.0 2.0 1.0 True
2023-11-01 22:00:00 NaN 22 2 4 11 2023 305 1 44 1.0 NaN 1.0 True
2023-11-01 23:00:00 NaN 23 2 4 11 2023 305 1 44 2.0 NaN 1.0 True
2023-11-02 00:00:00 NaN 0 3 4 11 2023 306 2 44 NaN NaN NaN True

4369 rows × 13 columns
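If those lag gaps were going to be filled as the comment above suggests, a forward fill on just the lag columns would be one simple option. This is a sketch only; the predictions below use the lags as-is, and future_filled is just an illustrative name.

#optional: forward-fill missing lag values in the future frame (not applied to the results shown here)
lag_cols = ['lag1', 'lag2', 'lag3']
future_filled = future_with_features.copy()
future_filled[lag_cols] = future_filled[lag_cols].ffill()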

Run Predict

FEATURES4 = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year',
                'lag1','lag2', 'lag3']
TARGET4 = 'count'

#run trained model on features in future df
#save in new column called pred
future_with_features['pred'] = reg.predict(future_with_features[FEATURES4])
import matplotlib.pyplot as plt

future_with_features['pred'].plot(figsize = (10,5),
                                  #color= 'Blue',
                                  color= color_pal[0],
                                  ms=1, 
                                  lw=1,
                                  title='XGBOOST Future Predictions')
plt.show()

png

Save & Load XGBOOST model

reg.save_model('311XGB_model.json')
reg_new = xgb.XGBRegressor()
reg_new.load_model('311XGB_model.json')

Run saved model on future features

#run saved model under reg_new and plot results
future_with_features['pred'] = reg_new.predict(future_with_features[FEATURES4])

future_with_features['pred'].plot(figsize = (10,5),
                                  #color= 'Blue',
                                  color= color_pal[0],
                                  ms=1, 
                                  lw=1,
                                  title='XGBOOST Future Predictions')
plt.show()

png

#zoomed in plot of forecast for the first week of July
import datetime
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10,5))
ax.plot(future_with_features.index, future_with_features['pred'])
#fig = future_with_features['pred'].plot(future_with_features['pred'], ax= ax)
ax.set_xlim([datetime.date(2023, 7, 1), datetime.date(2023, 7, 7)])
ax.set_ylim(0,80)
ax.set_title('XGBOOST Forecast First Week Of July')
plt.show()

png

#zoomed in plot of forecast for November 1st by hour
import datetime
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10,5))
ax.plot(future_with_features.index, future_with_features['pred'])
#fig = future_with_features['pred'].plot(future_with_features['pred'], ax= ax)
ax.set_xlim([datetime.date(2023, 11, 1), datetime.date(2023, 11, 2)])
ax.set_ylim(0,80)
ax.set_title('XGBOOST Forecast November 1st By Hour')
plt.show()

png

from xgboost import plot_importance
plot_importance(reg_new)
plt.show()

png

ARIMA

Auto Regression (AR), Integrated (differencing, I), Moving Average (MA)

Could not figure out how to do hourly, so I switched to daily data
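For the Integrated part, differencing just means modelling the change from one period to the next instead of the level; pandas can show what that looks like on the daily series. This is a quick illustration only, since auto_arima below picks the differencing order d on its own.

#first difference of the daily call counts (the 'I' in ARIMA when d=1)
daily_diff = df_day['count'].diff().dropna()
daily_diff.plot(figsize=(12,5), title='Daily 311 Calls, First Difference')
plt.show()
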
!pip3 install pmdarima
Requirement already satisfied: pmdarima in c:\users\brett\anaconda3\envs\school\lib\site-packages (2.0.3)
Requirement already satisfied: numpy>=1.21.2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (1.23.5)
Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (65.6.3)
Requirement already satisfied: pandas>=0.19 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (1.5.3)
Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (0.29.34)
Requirement already satisfied: urllib3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (1.26.14)
Requirement already satisfied: joblib>=0.11 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (1.2.0)
Requirement already satisfied: statsmodels>=0.13.2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (0.13.5)
Requirement already satisfied: scikit-learn>=0.22 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (1.2.1)
Requirement already satisfied: scipy>=1.3.2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pmdarima) (1.10.0)
Requirement already satisfied: pytz>=2020.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=0.19->pmdarima) (2022.7)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=0.19->pmdarima) (2.8.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-learn>=0.22->pmdarima) (3.1.0)
Requirement already satisfied: packaging>=21.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels>=0.13.2->pmdarima) (22.0)
Requirement already satisfied: patsy>=0.5.2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels>=0.13.2->pmdarima) (0.5.3)
Requirement already satisfied: six in c:\users\brett\anaconda3\envs\school\lib\site-packages (from patsy>=0.5.2->statsmodels>=0.13.2->pmdarima) (1.16.0)
df_day['count'].plot(figsize=(12,5))
<AxesSubplot: xlabel='time'>

png

Check For Stationarity

!pip install statsmodels
Requirement already satisfied: statsmodels in c:\users\brett\anaconda3\envs\school\lib\site-packages (0.13.5)
Requirement already satisfied: numpy>=1.17 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels) (1.23.5)
Requirement already satisfied: scipy>=1.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels) (1.10.0)
Requirement already satisfied: pandas>=0.25 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels) (1.5.3)
Requirement already satisfied: packaging>=21.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels) (22.0)
Requirement already satisfied: patsy>=0.5.2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels) (0.5.3)
Requirement already satisfied: pytz>=2020.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=0.25->statsmodels) (2022.7)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: six in c:\users\brett\anaconda3\envs\school\lib\site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
#import statsmodels.api as sm
import statsmodels.api
import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import adfuller
def ad_test(dataset):
    dftest = adfuller(dataset, autolag = 'AIC')
    print('1. ADF : ', dftest[0])
    print('2. P-Value : ', dftest[1])
    print('3. Num Of Lags : ', dftest[2])
    print('4. Num Of Observations Used For ADF Regression and Critical Values Calculation : ', dftest[3])
    print('5. Critical Values : ')
    for key, val in dftest[4].items():
        print('\t', key, ':',val)
# the ADF test's null hypothesis is that the series is non-stationary
# a small p-value means we can reject that, i.e. the series is stationary
# so the smaller the p-value the better here
ad_test(df_day['count'])
1. ADF :  -4.7528316341012635
2. P-Value :  6.688647512497074e-05
3. Num Of Lags :  22
4. Num Of Observations Used For ADF Regression and Critical Values Calculation :  1189
5. Critical Values : 
	 1% : -3.435861752677197
	 5% : -2.8639738850277796
	 10% : -2.568065847341873
from pmdarima import auto_arima
# Ignore  warnings
import warnings
warnings.filterwarnings('ignore')

Predict The Future

df_day.head()
count
time
2020-01-10 98
2020-01-11 12
2020-01-12 13
2020-01-13 368
2020-01-14 316
split_date = '1-jan-2023'
df_train_day = df_day.loc[df_day.index <= split_date].copy()
df_test_day = df_day.loc[df_day.index > split_date].copy()

fig, ax = plt.subplots(figsize=(15,5))
df_train_day.plot(ax=ax, label='Training Data', title='Train/Test Data Split BY Day')
df_test_day.plot(ax=ax, label='Testing Data')
ax.axvline(split_date, color='black', ls='--')
ax.legend(['Training Data', 'Test Data'])
plt.show()

png

import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api
import statsmodels.formula.api as smf
stepwise_fit = auto_arima(df_day['count'],
                          trace=True,
                          supress_warnings=True)

stepwise_fit.summary()  
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=15485.928, Time=1.29 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=16082.799, Time=0.03 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=16076.953, Time=0.05 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=15868.409, Time=0.27 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=16080.802, Time=0.02 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=15577.259, Time=0.55 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=15544.333, Time=0.56 sec
 ARIMA(3,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=1.19 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept   : AIC=15334.875, Time=1.49 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept   : AIC=15546.470, Time=1.10 sec
 ARIMA(3,1,3)(0,0,0)[0] intercept   : AIC=15311.143, Time=1.53 sec
 ARIMA(4,1,3)(0,0,0)[0] intercept   : AIC=15164.604, Time=1.69 sec
 ARIMA(4,1,2)(0,0,0)[0] intercept   : AIC=15301.116, Time=1.93 sec
 ARIMA(5,1,3)(0,0,0)[0] intercept   : AIC=15108.060, Time=1.90 sec
 ARIMA(5,1,2)(0,0,0)[0] intercept   : AIC=15151.929, Time=1.64 sec
 ARIMA(5,1,4)(0,0,0)[0] intercept   : AIC=15056.058, Time=2.09 sec
 ARIMA(4,1,4)(0,0,0)[0] intercept   : AIC=15151.651, Time=1.83 sec
 ARIMA(5,1,5)(0,0,0)[0] intercept   : AIC=14925.176, Time=2.65 sec
 ARIMA(4,1,5)(0,0,0)[0] intercept   : AIC=14951.563, Time=2.32 sec
 ARIMA(5,1,5)(0,0,0)[0]             : AIC=14923.674, Time=1.89 sec
 ARIMA(4,1,5)(0,0,0)[0]             : AIC=14949.732, Time=1.61 sec
 ARIMA(5,1,4)(0,0,0)[0]             : AIC=15054.277, Time=1.25 sec
 ARIMA(4,1,4)(0,0,0)[0]             : AIC=15149.548, Time=0.89 sec

Best model:  ARIMA(5,1,5)(0,0,0)[0]          
Total fit time: 29.778 seconds
SARIMAX Results
Dep. Variable: y No. Observations: 1212
Model: SARIMAX(5, 1, 5) Log Likelihood -7450.837
Date: Wed, 10 May 2023 AIC 14923.674
Time: 16:08:20 BIC 14979.765
Sample: 0 HQIC 14944.793
- 1212
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 0.9527 0.061 15.629 0.000 0.833 1.072
ar.L2 -1.5558 0.046 -33.884 0.000 -1.646 -1.466
ar.L3 1.0378 0.081 12.746 0.000 0.878 1.197
ar.L4 -1.1024 0.049 -22.362 0.000 -1.199 -1.006
ar.L5 0.1677 0.053 3.145 0.002 0.063 0.272
ma.L1 -1.5405 0.053 -28.943 0.000 -1.645 -1.436
ma.L2 1.8972 0.057 33.325 0.000 1.786 2.009
ma.L3 -1.8915 0.057 -33.031 0.000 -2.004 -1.779
ma.L4 1.4121 0.062 22.934 0.000 1.291 1.533
ma.L5 -0.6891 0.036 -19.358 0.000 -0.759 -0.619
sigma2 1.568e+04 237.142 66.111 0.000 1.52e+04 1.61e+04
Ljung-Box (L1) (Q): 0.01 Jarque-Bera (JB): 210261.66
Prob(Q): 0.91 Prob(JB): 0.00
Heteroskedasticity (H): 1.25 Skew: 4.43
Prob(H) (two-sided): 0.03 Kurtosis: 66.94



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

#use order from best order in the step above

model_day=sm.tsa.ARIMA(df_train_day['count'], order=(5,1,5))
model_day=model_day.fit()
model_day.summary()
SARIMAX Results
Dep. Variable: count No. Observations: 1084
Model: ARIMA(5, 1, 5) Log Likelihood -6684.911
Date: Wed, 10 May 2023 AIC 13391.822
Time: 16:08:47 BIC 13446.684
Sample: 0 HQIC 13412.592
- 1084
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 0.9872 0.059 16.838 0.000 0.872 1.102
ar.L2 -1.5865 0.045 -35.588 0.000 -1.674 -1.499
ar.L3 1.0828 0.080 13.602 0.000 0.927 1.239
ar.L4 -1.1353 0.047 -23.966 0.000 -1.228 -1.042
ar.L5 0.1974 0.053 3.742 0.000 0.094 0.301
ma.L1 -1.5795 0.052 -30.481 0.000 -1.681 -1.478
ma.L2 1.9451 0.057 33.879 0.000 1.833 2.058
ma.L3 -1.9274 0.059 -32.557 0.000 -2.043 -1.811
ma.L4 1.4612 0.060 24.198 0.000 1.343 1.580
ma.L5 -0.7163 0.038 -18.685 0.000 -0.791 -0.641
sigma2 1.612e+04 250.719 64.298 0.000 1.56e+04 1.66e+04
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 195507.26
Prob(Q): 0.96 Prob(JB): 0.00
Heteroskedasticity (H): 2.45 Skew: 4.61
Prob(H) (two-sided): 0.00 Kurtosis: 68.17



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

Make Predictions On Test Set

start=len(df_train_day)
end=len(df_train_day)+len(df_test_day)-1
pred_day=model_day.predict(start=start, end=end, typ='levels')
#print(pred) # dates did not appear so use next line of code
pred_day.index=df_day.index[start:end+1]
print(pred_day)
time
2023-01-02    479.428567
2023-01-03    674.749061
2023-01-04    604.407490
2023-01-05    557.103754
2023-01-06    398.781992
                 ...    
2023-05-05    469.493291
2023-05-06    312.357834
2023-05-07    315.081241
2023-05-08    506.594428
2023-05-09    507.856525
Name: predicted_mean, Length: 128, dtype: float64
pred_day.plot(legend=True)
df_test_day['count'].plot(legend=True)
<AxesSubplot: xlabel='time'>

png

df_day['count'].mean()
229.16006600660066
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse_day=sqrt(mean_squared_error(pred_day, df_test_day['count']))
rmse_day
224.44111330608987

Make predictions

#Fit model on all df_day
model_day = sm.tsa.ARIMA(df_day['count'], order=(5,0,5))
model_day=model_day.fit()
df_day.tail() # predict into the future after training
count
time
2023-05-05 286
2023-05-06 51
2023-05-07 51
2023-05-08 482
2023-05-09 387
index_future_dates=pd.date_range(start='2023-05-09', end='2023-12-31', freq='D')
pred_day=model_day.predict(start=len(df_day), end=len(df_day)+(236), type='levels').rename('ARIMA Predictions BY Day')
pred_day.index=index_future_dates
print(pred_day)
2023-05-09    297.317413
2023-05-10    336.257542
2023-05-11    253.195036
2023-05-12     48.160946
2023-05-13    112.191123
                 ...    
2023-12-27    262.186749
2023-12-28    227.061904
2023-12-29    157.228323
2023-12-30    188.266458
2023-12-31    265.360301
Freq: D, Name: ARIMA Predictions BY Day, Length: 237, dtype: float64
pred_day.info()
<class 'pandas.core.series.Series'>
DatetimeIndex: 237 entries, 2023-05-09 to 2023-12-31
Freq: D
Series name: ARIMA Predictions BY Day
Non-Null Count  Dtype  
--------------  -----  
237 non-null    float64
dtypes: float64(1)
memory usage: 3.7 KB
pred_day.plot(figsize=(12,5),legend=True)
<AxesSubplot: >

png

Running ARIMA on hourly data then projecting to daily

The test results look better than with the order chosen from the daily data

df5 = df2.copy()
#df5 = df5.drop(['is_future', 'lag1', 'lag2', 'lag3'], axis=1)
df5.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 18336 entries, 2020-01-10 12:00:00 to 2023-05-09 23:00:00
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   count   18336 non-null  int64
dtypes: int64(1)
memory usage: 286.5 KB
df5['count'].plot(figsize=(12,5))
<AxesSubplot: xlabel='time'>

png

# the ADF test's null hypothesis is that the series is non-stationary
# a small p-value means we can reject that, i.e. the series is stationary
# so the smaller the p-value the better here
ad_test(df5['count'])
1. ADF :  -22.210656144053754
2. P-Value :  0.0
3. Num Of Lags :  31
4. Num Of Observations Used For ADF Regression and Critical Values Calculation :  18304
5. Critical Values : 
	 1% : -3.430707310823011
	 5% : -2.8616979180198356
	 10% : -2.5668540555869606
stepwise_fit5 = auto_arima(df5['count'],
                          trace=True,
                          supress_warnings=True)

stepwise_fit5.summary()    
Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=139245.469, Time=12.43 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=160319.042, Time=0.23 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=140155.291, Time=0.29 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=148101.958, Time=2.04 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=169221.769, Time=0.13 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=139761.479, Time=5.38 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=139280.154, Time=8.03 sec
 ARIMA(3,0,2)(0,0,0)[0] intercept   : AIC=139282.702, Time=11.24 sec
 ARIMA(2,0,3)(0,0,0)[0] intercept   : AIC=138849.798, Time=19.91 sec
 ARIMA(1,0,3)(0,0,0)[0] intercept   : AIC=139740.324, Time=6.93 sec
 ARIMA(3,0,3)(0,0,0)[0] intercept   : AIC=139249.468, Time=16.14 sec
 ARIMA(2,0,4)(0,0,0)[0] intercept   : AIC=138392.690, Time=22.20 sec
 ARIMA(1,0,4)(0,0,0)[0] intercept   : AIC=139725.762, Time=9.09 sec
 ARIMA(3,0,4)(0,0,0)[0] intercept   : AIC=138687.604, Time=24.18 sec
 ARIMA(2,0,5)(0,0,0)[0] intercept   : AIC=138328.398, Time=14.43 sec
 ARIMA(1,0,5)(0,0,0)[0] intercept   : AIC=139462.275, Time=10.38 sec
 ARIMA(3,0,5)(0,0,0)[0] intercept   : AIC=138326.439, Time=19.29 sec
 ARIMA(4,0,5)(0,0,0)[0] intercept   : AIC=137966.103, Time=26.64 sec
 ARIMA(4,0,4)(0,0,0)[0] intercept   : AIC=138459.123, Time=27.90 sec
 ARIMA(5,0,5)(0,0,0)[0] intercept   : AIC=137865.368, Time=29.40 sec
 ARIMA(5,0,4)(0,0,0)[0] intercept   : AIC=139131.798, Time=27.98 sec
 ARIMA(5,0,5)(0,0,0)[0]             : AIC=140472.475, Time=14.99 sec

Best model:  ARIMA(5,0,5)(0,0,0)[0] intercept
Total fit time: 309.258 seconds
SARIMAX Results
Dep. Variable: y No. Observations: 18336
Model: SARIMAX(5, 0, 5) Log Likelihood -68920.684
Date: Wed, 10 May 2023 AIC 137865.368
Time: 16:34:35 BIC 137959.167
Sample: 0 HQIC 137896.186
- 18336
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
intercept 0.6856 0.063 10.810 0.000 0.561 0.810
ar.L1 3.3027 0.031 108.077 0.000 3.243 3.363
ar.L2 -5.0795 0.077 -65.888 0.000 -5.231 -4.928
ar.L3 4.5577 0.093 49.226 0.000 4.376 4.739
ar.L4 -2.3567 0.063 -37.415 0.000 -2.480 -2.233
ar.L5 0.5307 0.021 25.760 0.000 0.490 0.571
ma.L1 -2.4724 0.030 -81.359 0.000 -2.532 -2.413
ma.L2 2.9964 0.053 56.326 0.000 2.892 3.101
ma.L3 -2.0045 0.054 -37.183 0.000 -2.110 -1.899
ma.L4 0.6717 0.031 21.894 0.000 0.612 0.732
ma.L5 -0.0364 0.011 -3.297 0.001 -0.058 -0.015
sigma2 106.6402 0.304 350.752 0.000 106.044 107.236
Ljung-Box (L1) (Q): 0.13 Jarque-Bera (JB): 1126273.03
Prob(Q): 0.72 Prob(JB): 0.00
Heteroskedasticity (H): 1.19 Skew: 1.96
Prob(H) (two-sided): 0.00 Kurtosis: 41.19



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

split_date = '1-jan-2023'
df_train_day = df_day.loc[df_day.index <= split_date].copy()
df_test_day = df_day.loc[df_day.index > split_date].copy()

fig, ax = plt.subplots(figsize=(15,5))
df_train_day.plot(ax=ax, label='Training Data', title='Train/Test Data Split BY Day')
df_test_day.plot(ax=ax, label='Testing Data')
ax.axvline(split_date, color='black', ls='--')
ax.legend(['Training Data', 'Test Data'])
plt.show()

png

#from statsmodels.tsa.arima_model import ARIMA
model_day=sm.tsa.ARIMA(df_train_day['count'], order=(5,0,5))
model_day=model_day.fit()
model_day.summary()
SARIMAX Results
Dep. Variable: count No. Observations: 1084
Model: ARIMA(5, 0, 5) Log Likelihood -6698.985
Date: Wed, 10 May 2023 AIC 13421.970
Time: 16:44:37 BIC 13481.831
Sample: 0 HQIC 13444.632
- 1084
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
const 230.3111 39.164 5.881 0.000 153.551 307.072
ar.L1 1.7576 0.022 81.197 0.000 1.715 1.800
ar.L2 -2.1975 0.024 -92.527 0.000 -2.244 -2.151
ar.L3 2.1699 0.034 64.358 0.000 2.104 2.236
ar.L4 -1.7470 0.028 -62.794 0.000 -1.802 -1.693
ar.L5 0.9487 0.023 41.549 0.000 0.904 0.993
ma.L1 -1.4176 0.028 -49.868 0.000 -1.473 -1.362
ma.L2 1.7757 0.042 42.510 0.000 1.694 1.858
ma.L3 -1.6703 0.059 -28.253 0.000 -1.786 -1.554
ma.L4 1.2575 0.047 26.648 0.000 1.165 1.350
ma.L5 -0.5664 0.038 -15.068 0.000 -0.640 -0.493
sigma2 1.678e+04 251.650 66.696 0.000 1.63e+04 1.73e+04
Ljung-Box (L1) (Q): 4.49 Jarque-Bera (JB): 193826.67
Prob(Q): 0.03 Prob(JB): 0.00
Heteroskedasticity (H): 2.63 Skew: 4.64
Prob(H) (two-sided): 0.00 Kurtosis: 67.85



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

Make Predictions On Daily Test Set

start=len(df_train_day)
end=len(df_train_day)+len(df_test_day)-1
pred_day=model_day.predict(start=start, end=end, typ='levels')
#print(pred) # dates did not appear so use next line of code
pred_day.index=df_day.index[start:end+1]
print(pred_day)
time
2023-01-02    526.434333
2023-01-03    772.138844
2023-01-04    714.534101
2023-01-05    611.437116
2023-01-06    378.830014
                 ...    
2023-05-05    190.894294
2023-05-06    -15.404300
2023-05-07     26.984342
2023-05-08    295.538955
2023-05-09    421.600123
Name: predicted_mean, Length: 128, dtype: float64
pred_day.plot(legend=True)
df_test_day['count'].plot(legend=True)
<AxesSubplot: xlabel='time'>

png

df_day['count'].mean()
229.16006600660066
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse_day=sqrt(mean_squared_error(pred_day, df_test_day['count']))
rmse_day
157.9142119549536
#Fit a model with the hourly-selected order on all of df_day
model_day = sm.tsa.ARIMA(df_day['count'], order=(5,0,5))
model_day=model_day.fit()
df_day.tail() # predict into the future after training
count
time
2023-05-05 286
2023-05-06 51
2023-05-07 51
2023-05-08 482
2023-05-09 387
index_future_dates=pd.date_range(start='2023-05-09', end='2023-12-31', freq='D')
pred_day=model_day.predict(start=len(df_day), end=len(df_day)+(236), type='levels').rename('ARIMA Predictions BY Day')
pred_day.index=index_future_dates
print(pred_day)
2023-05-09    297.317413
2023-05-10    336.257542
2023-05-11    253.195036
2023-05-12     48.160946
2023-05-13    112.191123
                 ...    
2023-12-27    262.186749
2023-12-28    227.061904
2023-12-29    157.228323
2023-12-30    188.266458
2023-12-31    265.360301
Freq: D, Name: ARIMA Predictions BY Day, Length: 237, dtype: float64
pred_day.info()
<class 'pandas.core.series.Series'>
DatetimeIndex: 237 entries, 2023-05-09 to 2023-12-31
Freq: D
Series name: ARIMA Predictions BY Day
Non-Null Count  Dtype  
--------------  -----  
237 non-null    float64
dtypes: float64(1)
memory usage: 3.7 KB
pred_day.plot(figsize=(12,5),legend=True)
<AxesSubplot: >

png

The zigzag pattern in the projection above is caused by low call volume on weekends.