SARIMAX#
import os
os.chdir("../../")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
from statsmodels.tsa.seasonal import seasonal_decompose, STL
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
import pmdarima as pm
from pmdarima import model_selection
from pmdarima import auto_arima
from scripts.python.tsa.ts_utils import *
from scripts.python.tsa.utsmodel import *
from scripts.python.tsa.ts_eval import *
import warnings
warnings.filterwarnings('ignore')
Show code cell source
countries = ["solomon_islands", "palau", "samoa", "vanuatu", "tonga"]
# Set parameter range
p, d, q = range(0, 3), range(0, 2), range(0, 3)
P, D, Q, s = range(0, 3), range(0, 2), range(0, 3), [12]
# list of all parameter combos
pdq = list(itertools.product(p, d, q))
seasonal_pdq = list(itertools.product(P, D, Q, s))
all_param = list(itertools.product(pdq, seasonal_pdq))
for country in countries:
mod = SARIMAXPipeline(country=country, y_var="total",
data=None,
exog_var=["covid", "stringency_index",
str(country)+"_travel"],
transform_method="scaledlogit",
training_ratio=1,
verbose=False)
mod.read_and_merge()
display(mod.data.head(5))
mod.transform()
print(f"The Benchmark Evaluation for {country}".upper(), "\n")
mod.get_benchmark_evaluation()
print(f"Started to conduct stepwise searching for {country}".upper(), "\n")
mod.stepwise_search()
print(f"Started to conduct Grid searching for {country}".upper(), "\n")
mod_msres = mod.manual_search(params=all_param)
mod_msres.sort(key=lambda x: x[1])
mod_swm = mod.stepwise_model
if mod_msres[0][-1] == (mod_swm["order"], mod_swm["seasonal_order"]):
print(mod_msres[0][-1])
else:
cv_models = []
cv_models.append(pm.ARIMA(
mod_swm["order"], mod_swm["seasonal_order"], exog=mod.exog[:mod.training_size]))
# Append top five GridSearch results
for res in mod_msres[:5]:
info_criteria = res[1]
if info_criteria < 100:
pass
else:
order, seasonal_order = res[-1]
model = pm.ARIMA(order, seasonal_order,
exog=mod.exog[:mod.training_size])
cv_models.append(model)
print(
f"Started to conduct Cross-validation for {country}".upper(), "\n")
mod_cv_comp = mod.compare_models(
y=mod.transformed_y, exog=mod.exog, models=cv_models)
best_cv_idx = np.nanargmin(mod_cv_comp["avg_error"])
print(
f"Best Models from Cross-validation is {cv_models[best_cv_idx]}", "\n")
if best_cv_idx > 0:
best_mod = mod_msres[best_cv_idx-1][0]
best_mod_pred = mod.get_prediction_df(
best_mod, mod.test_size, mod.exog[-mod.test_size:])
lower = mod.data["total"].min() - 1
upper = mod.data["total"].max() + 1
for col_idx, col in enumerate(best_mod_pred.columns):
for row_idx, _ in enumerate(best_mod_pred[col]):
best_mod_pred.iloc[row_idx, col_idx] = mod.inverse_scaledlogit(
best_mod_pred.iloc[row_idx, col_idx], upper, lower)
# Merge the prediction with actual values
best_mod_pred.columns.name = None
best_mod_pred = pd.concat(
[mod.data[["date", "total"]], best_mod_pred], axis=1)
save_pred_path = mod.country_data_folder + "/model/sarimax_" + str(country) + ".csv"
best_mod_pred.to_csv(save_pred_path, encoding="utf-8")
if mod.test_size != 0:
pred_series = (best_mod_pred["train_pred"].fillna(0)
+ best_mod_pred["test_pred"].fillna(0))
else:
pred_series = best_mod_pred["train_pred"]
mod_eval = (pd.DataFrame(calculate_evaluation(
best_mod_pred["total"], pred_series), index=["SARIMAX"]))
mod_eval = pd.concat([mod.benchmark, mod_eval], axis=0)
display(mod_eval)
mod_eval.to_csv(mod.country_data_folder + "/model/sarimax_eval_" + str(country) + ".csv",
encoding="utf-8")
date | total | stringency_index | covid | solomon_islands_flights | solomon_islands_hotel | solomon_islands_travel | |
---|---|---|---|---|---|---|---|
0 | 2009-01-01 | 1602 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 |
1 | 2009-02-01 | 1422 | 0.0 | 0.0 | 0.484667 | 0.0 | 0.203672 |
2 | 2009-03-01 | 1249 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 |
3 | 2009-04-01 | 1499 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 |
4 | 2009-05-01 | 1393 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.478020 |
training size : 144, testing size : 0
THE BENCHMARK EVALUATION FOR SOLOMON_ISLANDS
STARTED TO CONDUCT STEPWISE SEARCHING FOR SOLOMON_ISLANDS
SARIMAX Results
============================================================================================
Dep. Variable: y No. Observations: 144
Model: SARIMAX(2, 1, 2)x(1, 0, [1], 12) Log Likelihood -170.543
Date: Wed, 14 Jun 2023 AIC 361.087
Time: 15:49:31 BIC 390.715
Sample: 0 HQIC 373.126
- 144
Covariance Type: opg
==========================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------
covid -3.0401 1.491 -2.039 0.041 -5.962 -0.118
stringency_index -0.0648 0.026 -2.497 0.013 -0.116 -0.014
solomon_islands_travel -0.3945 1.574 -0.251 0.802 -3.479 2.690
ar.L1 -0.7511 0.172 -4.375 0.000 -1.088 -0.415
ar.L2 0.1471 0.125 1.180 0.238 -0.097 0.391
ma.L1 0.0672 0.105 0.640 0.522 -0.138 0.273
ma.L2 -0.8797 0.082 -10.683 0.000 -1.041 -0.718
ar.S.L12 0.9434 0.127 7.457 0.000 0.695 1.191
ma.S.L12 -0.8220 0.243 -3.383 0.001 -1.298 -0.346
sigma2 0.6149 0.054 11.411 0.000 0.509 0.721
===================================================================================
Ljung-Box (L1) (Q): 0.07 Jarque-Bera (JB): 16100.35
Prob(Q): 0.79 Prob(JB): 0.00
Heteroskedasticity (H): 0.33 Skew: 5.80
Prob(H) (two-sided): 0.00 Kurtosis: 53.67
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
STARTED TO CONDUCT GRID SEARCHING FOR SOLOMON_ISLANDS
STARTED TO CONDUCT CROSS-VALIDATION FOR SOLOMON_ISLANDS
Best Models from Cross-validation is ARIMA(2,0,2)(0,1,1)[12] intercept
MSE | RMSE | MAE | SMAPE | |
---|---|---|---|---|
naive | 130146.048951 | 360.757604 | 273.083916 | 19.605133 |
mean | 373388.932822 | 611.055589 | 428.780575 | 28.510498 |
seasonal naive | 719201.125000 | 848.057265 | 537.375000 | 42.210038 |
SARIMAX | 81169.100331 | 284.901914 | 201.117080 | 14.616283 |
date | japan | south_korea | taiwan | china | usa/canada | europe | others | total | stringency_index | covid | palau_flights | palau_hotel | palau_travel | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2007-06-01 | 856.0 | 1291.0 | 3245.0 | 86.0 | 669.0 | 99.0 | 463.0 | 6709.0 | 0.0 | 0.0 | 1.068426 | 8.334537 | 0.320004 |
1 | 2007-07-01 | 2119.0 | 1366.0 | 3269.0 | 33.0 | 653.0 | 144.0 | 437.0 | 8021.0 | 0.0 | 0.0 | 0.000000 | 9.667343 | 0.952482 |
2 | 2007-08-01 | 3476.0 | 1354.0 | 3046.0 | 46.0 | 580.0 | 256.0 | 438.0 | 9196.0 | 0.0 | 0.0 | 0.704914 | 6.906282 | 0.453967 |
3 | 2007-09-01 | 3022.0 | 910.0 | 2497.0 | 61.0 | 559.0 | 145.0 | 401.0 | 7595.0 | 0.0 | 0.0 | 0.798284 | 3.591027 | 0.000000 |
4 | 2007-10-01 | 1807.0 | 1082.0 | 2298.0 | 49.0 | 774.0 | 390.0 | 395.0 | 6795.0 | 0.0 | 0.0 | 0.431326 | 4.111283 | 0.000000 |
training size : 184, testing size : 0
THE BENCHMARK EVALUATION FOR PALAU
STARTED TO CONDUCT STEPWISE SEARCHING FOR PALAU
SARIMAX Results
==========================================================================================
Dep. Variable: y No. Observations: 184
Model: SARIMAX(1, 0, 1)x(1, 0, 1, 12) Log Likelihood -231.068
Date: Wed, 14 Jun 2023 AIC 478.136
Time: 15:54:50 BIC 503.855
Sample: 0 HQIC 488.560
- 184
Covariance Type: opg
====================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------
covid -5.9728 0.833 -7.170 0.000 -7.605 -4.340
stringency_index -0.0172 0.018 -0.950 0.342 -0.053 0.018
palau_travel 0.3199 0.571 0.560 0.575 -0.800 1.439
ar.L1 0.9569 0.033 28.662 0.000 0.891 1.022
ma.L1 -0.5431 0.052 -10.518 0.000 -0.644 -0.442
ar.S.L12 0.9414 0.132 7.137 0.000 0.683 1.200
ma.S.L12 -0.8473 0.210 -4.026 0.000 -1.260 -0.435
sigma2 0.7038 0.028 24.753 0.000 0.648 0.760
===================================================================================
Ljung-Box (L1) (Q): 0.07 Jarque-Bera (JB): 19273.54
Prob(Q): 0.80 Prob(JB): 0.00
Heteroskedasticity (H): 6.06 Skew: 4.88
Prob(H) (two-sided): 0.00 Kurtosis: 52.18
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
STARTED TO CONDUCT GRID SEARCHING FOR PALAU
STARTED TO CONDUCT CROSS-VALIDATION FOR PALAU
Best Models from Cross-validation is ARIMA(0,1,1)(0,1,1)[12] intercept
MSE | RMSE | MAE | SMAPE | |
---|---|---|---|---|
naive | 2.096767e+06 | 1448.021907 | 1092.759563 | 23.272616 |
mean | 1.639656e+07 | 4049.266859 | 3178.648757 | 51.716972 |
seasonal naive | 1.123704e+07 | 3352.169852 | 2405.935201 | 56.509474 |
SARIMAX | 1.826282e+06 | 1351.399848 | 947.467373 | 21.701883 |
date | total | american_samoa | australia | europe | new_zealand | usa | other_countries | cruise_ships | stringency_index | covid | samoa_flights | samoa_hotel | samoa_travel | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2002-08-01 | 8260.0 | 2877.0 | 1115.0 | 429.0 | 2035.0 | 933.0 | 871.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 2002-09-01 | 6708.0 | 2119.0 | 972.0 | 372.0 | 1879.0 | 612.0 | 754.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 2002-10-01 | 5737.0 | 1892.0 | 737.0 | 468.0 | 1492.0 | 577.0 | 571.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 2002-11-01 | 6653.0 | 2530.0 | 864.0 | 393.0 | 1675.0 | 483.0 | 708.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 2002-12-01 | 13042.0 | 4678.0 | 1918.0 | 272.0 | 4401.0 | 1007.0 | 766.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
training size : 242, testing size : 0
THE BENCHMARK EVALUATION FOR SAMOA
STARTED TO CONDUCT STEPWISE SEARCHING FOR SAMOA
SARIMAX Results
============================================================================================
Dep. Variable: y No. Observations: 242
Model: SARIMAX(0, 1, 2)x(2, 0, [1], 12) Log Likelihood -335.391
Date: Wed, 14 Jun 2023 AIC 690.782
Time: 16:00:58 BIC 725.630
Sample: 0 HQIC 704.822
- 242
Covariance Type: opg
====================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------
intercept 0.0097 0.010 0.961 0.337 -0.010 0.029
covid -7.2406 0.737 -9.827 0.000 -8.685 -5.796
stringency_index -0.0750 0.010 -7.871 0.000 -0.094 -0.056
samoa_travel -0.0038 0.268 -0.014 0.989 -0.529 0.521
ma.L1 -0.6578 0.037 -17.626 0.000 -0.731 -0.585
ma.L2 -0.3261 0.067 -4.854 0.000 -0.458 -0.194
ar.S.L12 -0.3483 0.400 -0.871 0.384 -1.132 0.435
ar.S.L24 0.3509 0.118 2.975 0.003 0.120 0.582
ma.S.L12 0.6056 0.406 1.492 0.136 -0.190 1.401
sigma2 0.9498 0.045 21.045 0.000 0.861 1.038
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 23312.56
Prob(Q): 0.96 Prob(JB): 0.00
Heteroskedasticity (H): 25.64 Skew: 5.65
Prob(H) (two-sided): 0.00 Kurtosis: 49.84
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
STARTED TO CONDUCT GRID SEARCHING FOR SAMOA
STARTED TO CONDUCT CROSS-VALIDATION FOR SAMOA
Best Models from Cross-validation is ARIMA(2,1,1)(0,1,1)[12] intercept
MSE | RMSE | MAE | SMAPE | |
---|---|---|---|---|
naive | 1.180276e+07 | 3435.514859 | 2313.560166 | 23.936179 |
mean | 2.193805e+07 | 4683.807205 | 3367.534731 | 43.918693 |
seasonal naive | 1.643507e+07 | 4054.018920 | 2003.082645 | 31.466659 |
SARIMAX | 2.952049e+06 | 1718.152711 | 1020.766988 | 33.573105 |
date | australia | new_zealand | new_caledonia | other_pic | europe | north_america | japan | china | other_countries | total | fileyear | not_stated | stringency_index | covid | vanuatu_flights | vanuatu_hotel | vanuatu_travel | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2003-12-01 | 2938.0 | 580.0 | 646.0 | 136.0 | 201.0 | 88.0 | 42.0 | 0.0 | 103.0 | 4734.0 | 2007.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
1 | 2004-01-01 | 2864.0 | 439.0 | 717.0 | 122.0 | 186.0 | 67.0 | 36.0 | 0.0 | 61.0 | 4492.0 | 2005.0 | 0.0 | 0.0 | 0.0 | 2.232933 | 3.102047 | 2.453441 |
2 | 2004-02-01 | 2158.0 | 194.0 | 416.0 | 282.0 | 229.0 | 82.0 | 34.0 | 0.0 | 93.0 | 3488.0 | 2005.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
3 | 2004-03-01 | 2733.0 | 420.0 | 255.0 | 160.0 | 195.0 | 115.0 | 53.0 | 0.0 | 83.0 | 4014.0 | 2005.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
4 | 2004-04-01 | 2377.0 | 438.0 | 469.0 | 206.0 | 225.0 | 115.0 | 45.0 | 0.0 | 80.0 | 3955.0 | 2005.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 1.694580 |
training size : 224, testing size : 0
THE BENCHMARK EVALUATION FOR VANUATU
STARTED TO CONDUCT STEPWISE SEARCHING FOR VANUATU
SARIMAX Results
==========================================================================================
Dep. Variable: y No. Observations: 224
Model: SARIMAX(1, 1, 1)x(1, 0, 1, 12) Log Likelihood -278.286
Date: Wed, 14 Jun 2023 AIC 572.571
Time: 16:10:25 BIC 599.829
Sample: 0 HQIC 583.575
- 224
Covariance Type: opg
====================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------
covid -9.1747 0.708 -12.966 0.000 -10.562 -7.788
stringency_index -0.0419 0.011 -3.973 0.000 -0.063 -0.021
vanuatu_travel 0.0298 0.213 0.140 0.888 -0.387 0.447
ar.L1 0.0483 0.102 0.475 0.635 -0.151 0.248
ma.L1 -0.8991 0.072 -12.570 0.000 -1.039 -0.759
ar.S.L12 0.9723 0.042 22.969 0.000 0.889 1.055
ma.S.L12 -0.8100 0.102 -7.956 0.000 -1.009 -0.610
sigma2 0.6688 0.034 19.531 0.000 0.602 0.736
===================================================================================
Ljung-Box (L1) (Q): 0.02 Jarque-Bera (JB): 18394.33
Prob(Q): 0.89 Prob(JB): 0.00
Heteroskedasticity (H): 17.69 Skew: 5.20
Prob(H) (two-sided): 0.00 Kurtosis: 46.26
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
STARTED TO CONDUCT GRID SEARCHING FOR VANUATU
STARTED TO CONDUCT CROSS-VALIDATION FOR VANUATU
Best Models from Cross-validation is ARIMA(1,0,1)(2,1,2)[12] intercept
MSE | RMSE | MAE | SMAPE | |
---|---|---|---|---|
naive | 2.915180e+06 | 1707.389827 | 1292.829596 | 20.537447 |
mean | 1.125218e+07 | 3354.426412 | 2628.520727 | 47.767194 |
seasonal naive | 8.396013e+06 | 2897.587484 | 1568.424107 | 34.617696 |
SARIMAX | 1.238855e+06 | 1113.038657 | 795.607455 | 35.718702 |
date | yacht | all | total | ship | stringency_index | covid | tonga_flights | tonga_hotel | tonga_travel | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2010-01-01 | 4 | 3808 | 3158 | 646 | 0.0 | 0.0 | 1.118195 | 1.296399 | 0.212187 |
1 | 2010-02-01 | 5 | 2384 | 2379 | 0 | 0.0 | 0.0 | 0.811451 | 1.013375 | 0.480398 |
2 | 2010-03-01 | 5 | 3992 | 3134 | 853 | 0.0 | 0.0 | 0.872967 | 1.359485 | 0.233651 |
3 | 2010-04-01 | 30 | 5650 | 2818 | 2802 | 0.0 | 0.0 | 0.583649 | 1.142021 | 0.400232 |
4 | 2010-05-01 | 177 | 9150 | 3670 | 5303 | 0.0 | 0.0 | 0.906579 | 1.746620 | 0.749327 |
training size : 144, testing size : 0
THE BENCHMARK EVALUATION FOR TONGA
STARTED TO CONDUCT STEPWISE SEARCHING FOR TONGA
SARIMAX Results
===========================================================================================
Dep. Variable: y No. Observations: 144
Model: SARIMAX(0, 0, 1)x(2, 0, [], 12) Log Likelihood -196.126
Date: Wed, 14 Jun 2023 AIC 406.251
Time: 16:17:52 BIC 427.040
Sample: 0 HQIC 414.699
- 144
Covariance Type: opg
====================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------
covid -4.4685 0.936 -4.775 0.000 -6.303 -2.634
stringency_index -0.0572 0.020 -2.866 0.004 -0.096 -0.018
tonga_travel -0.3096 0.880 -0.352 0.725 -2.034 1.414
ma.L1 0.2152 0.056 3.831 0.000 0.105 0.325
ar.S.L12 0.2147 0.054 3.988 0.000 0.109 0.320
ar.S.L24 0.2993 0.058 5.150 0.000 0.185 0.413
sigma2 0.8666 0.036 24.334 0.000 0.797 0.936
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 10347.59
Prob(Q): 0.96 Prob(JB): 0.00
Heteroskedasticity (H): 6.03 Skew: 4.58
Prob(H) (two-sided): 0.00 Kurtosis: 43.51
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
STARTED TO CONDUCT GRID SEARCHING FOR TONGA
STARTED TO CONDUCT CROSS-VALIDATION FOR TONGA
Best Models from Cross-validation is ARIMA(0,0,1)(0,1,1)[12] intercept
MSE | RMSE | MAE | SMAPE | |
---|---|---|---|---|
naive | 2.543522e+06 | 1594.842251 | 1043.874126 | 35.445486 |
mean | 4.529700e+06 | 2128.309258 | 1637.978395 | 53.528230 |
seasonal naive | 4.760284e+06 | 2181.807619 | 1256.819444 | 53.730230 |
SARIMAX | 7.153640e+05 | 845.791954 | 556.793463 | 30.118200 |