import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import mode
from sklearn import linear_model
import matplotlib
import matplotlib.pyplot as plt
from sklearn import discriminant_analysis
from sklearn.decomposition import PCA
from sklearn import preprocessing
%matplotlib inline
import datetime
import statsmodels.api as sm
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.seasonal import seasonal_decompose
import seaborn as sb
sb.set_style('darkgrid')
from scipy import stats
from statsmodels.graphics.api import qqplot
from sklearn.metrics import r2_score
from sklearn import model_selection
import warnings
warnings.filterwarnings('ignore')
daily_data = pd.read_csv('../exchangeratedata/daily_full.csv', skiprows=7, header=0)
monthly_data = pd.read_csv('../exchangeratedata/monthly_rates.csv', skiprows=11, header=0)
daily_data['datetime'] = pd.to_datetime(daily_data['DATE'])
daily_data['dayofweek'] = daily_data['datetime'].dt.dayofweek
weekly_data = daily_data[daily_data['dayofweek'] == 4]  # keep Fridays only to form a weekly series
print(weekly_data.head(5))
Univariate time series models use only combinations of lags and moving averages of the outcome variable as predictors for the next time step. Multivariate models also use lags of other related predictors; in this case we consider lags of the Libor rate.
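To make the lag idea concrete, here is a minimal sketch of a lag-1 design matrix (illustration only; the ARIMA models below construct their lags internally, and the column names are introduced here):
example = pd.DataFrame({'rate': weekly_data['XUDLUSS'].values})
example['rate_lag1'] = example['rate'].shift(1)  # last week's value as the predictor
print(example.head())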
UK_US = pd.DataFrame()
UK_US['UK_US']=weekly_data['XUDLUSS']
UK_US['LIBOR']=weekly_data['IUDSOIA']
UK_US = UK_US.set_index(weekly_data['datetime'].values)  # the index must carry the date-time information
UK_EU = pd.DataFrame()
UK_EU['UK_EU']=weekly_data['XUDLERS']
UK_EU['LIBOR']=weekly_data['IUDSOIA']
UK_EU = UK_EU.set_index(weekly_data['datetime'].values)
ts = UK_US['UK_US']
libor = UK_US['LIBOR']
Plot of the UK/US exchange rate:
plt.figure(figsize=(10, 5))
plt.plot(UK_US['UK_US'])
Time series data can be indexed using dates like the following:
ts['2000-01-07']
libor['2000-01-07']
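Partial date strings also slice whole ranges:
ts['2000-01':'2000-06']  # all Fridays in the first half of 2000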
Before fitting a model it is necessary to ensure that the time series is stationary.
The time series can be assumed to be stationary if it has constant statistical properties over time, i.e. a constant mean, a constant variance, and an autocovariance that does not depend on time.
Stationarity can be checked by plotting rolling statistics (the rolling mean and standard deviation should remain roughly constant over time) and with the Dickey-Fuller test, both implemented in the function below.
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    # Determine rolling statistics
    rolmean = timeseries.rolling(window=12).mean()
    rolstd = timeseries.rolling(window=12).std()
    # Plot rolling statistics
    orig = plt.plot(timeseries, color='blue', label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    # Perform Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
test_stationarity(UK_US['UK_US'])
test_stationarity(UK_EU['UK_EU'])
For both series the test statistic is greater than the critical values, so the series are not stationary. Note that the signed values should be compared, not the absolute values.
There are two major reasons behind non-stationarity of a time series: trend (a mean that varies over time) and seasonality (variations at specific time frames).
The underlying principle is to model or estimate the trend and seasonality in the series and remove them to obtain a stationary series. Statistical forecasting techniques can then be applied to this series, and the final step is to convert the forecasted values back to the original scale by reapplying the trend and seasonality components, as sketched below.
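As a quick sketch of this idea, the seasonal_decompose helper imported above can split the series into trend, seasonal and residual components (assuming roughly 52 weekly observations per year and no missing values; in very old statsmodels versions the argument is named freq rather than period):
decomposition = seasonal_decompose(np.log(UK_US['UK_US']), period=52)  # 52 weekly points per year is an assumption
decomposition.plot()  # trend, seasonal and residual panels
plt.show()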
It is common practice to model the logarithm of the time series rather than the raw values:
ts = UK_US['UK_US']
ts_log = np.log(UK_US['UK_US'])
plt.plot(ts_log)
moving_avg = ts_log.rolling(window=12).mean()
plt.plot(ts_log)
plt.plot(moving_avg, color='red')
ts_log_moving_avg_diff = ts_log - moving_avg
ts_log_moving_avg_diff.dropna(inplace=True)
test_stationarity(ts_log_moving_avg_diff)
The test statistic is smaller than the 1% critical value, so with 99% confidence we can say that the modified series is stationary.
expweighted_avg = ts_log.ewm(halflife=12).mean()
plt.plot(ts_log)
plt.plot(expweighted_avg, color='red')
ts_log_ewma_diff = ts_log - expweighted_avg
test_stationarity(ts_log_ewma_diff)
This also yields a test statistic smaller than the 1% critical value.
ts_log_diff = ts_log - ts_log.shift()
plt.plot(ts_log_diff)
ts_log_diff.dropna(inplace=True)
test_stationarity(ts_log_diff)
The test statistic is much lower than the 1% critical value (roughly -16 compared to -3).
ARIMA stands for AutoRegressive Integrated Moving Average. ARIMA forecasting for a stationary time series is similar to a linear regression equation. The predictors depend on the parameters (p, d, q) of the ARIMA model: p is the number of autoregressive (AR) terms, i.e. lags of the dependent variable; d is the number of non-seasonal differences; and q is the number of moving-average (MA) terms, i.e. lagged forecast errors.
An important concern here is how to determine the values of p and q. We use two plots to determine these numbers.
Autocorrelation Function (ACF): a measure of the correlation between the TS and a lagged version of itself. For instance, at lag 5 the ACF compares the series at time instants t1…t2 with the series at instants t1-5…t2-5.
Partial Autocorrelation Function (PACF): measures the correlation between the TS and a lagged version of itself after eliminating the variation already explained by the intervening lags. E.g. at lag 5 it checks the correlation but removes the effects already explained by lags 1 to 4.
from statsmodels.tsa.stattools import acf, pacf
lag_acf = acf(ts_log_diff, nlags=20)
lag_pacf = pacf(ts_log_diff, nlags=20, method='ols')
#Plot ACF:
plt.figure()
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
#Plot PACF:
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
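As a cross-check, statsmodels also provides built-in helpers that draw the same correlograms together with their confidence bands:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
plot_acf(ts_log_diff, lags=20, ax=axes[0])
plot_pacf(ts_log_diff, lags=20, ax=axes[1])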
The two dotted lines on either side of 0 are the confidence intervals. These can be used to determine the p and q values for the ARIMA model: p is the lag at which the PACF crosses the upper confidence interval for the first time, and q is the lag at which the ACF does. There are three variants of the model we could try: a pure AR model (q = 0), a pure MA model (p = 0), and a combined ARIMA model.
from statsmodels.tsa.arima_model import ARIMA  # statsmodels < 0.13; newer versions use statsmodels.tsa.arima.model.ARIMA
model = ARIMA(ts_log, order=(6, 0, 0))  # pure AR model with 6 lags
results_AR = model.fit(disp=-1)
plt.plot(ts_log)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_AR.fittedvalues-ts_log)**2))
model = ARIMA(ts_log, order=(6, 0, 0), exog=libor)  # AR model with the Libor rate as an exogenous regressor
results_AR = model.fit(disp=-1)
plt.plot(ts_log)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_AR.fittedvalues-ts_log)**2))
model = ARIMA(ts_log, order=(0, 0, 2))  # pure MA model with 2 terms
results_MA = model.fit(disp=-1)
plt.plot(ts_log)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_MA.fittedvalues-ts_log)**2))
model = ARIMA(ts_log, order=(0, 0, 2), exog=libor)  # MA model with exogenous Libor
results_MA = model.fit(disp=-1)
plt.plot(ts_log)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_MA.fittedvalues-ts_log)**2))
model = ARIMA(ts_log, order=(6, 1, 5))  # combined ARIMA with first differencing
results_ARIMA = model.fit(disp=-1)
plt.plot(ts_log_diff)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.4f' % sum((results_ARIMA.fittedvalues - ts_log_diff)**2))  # fitted values are on the differenced scale
model = ARIMA(ts_log, order=(6, 0, 4), exog=libor)  # combined ARMA with exogenous Libor
results_ARIMA = model.fit(disp=-1)
plt.plot(ts_log)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues-ts_log)**2))
def transform_back(results):
    # fitted values are on the differenced log scale, so cumulatively sum them
    predictions_diff = pd.Series(results.fittedvalues, copy=True)
    predictions_diff_cumsum = predictions_diff.cumsum()
    # add the cumulative sums onto the first log value, then undo the log transform
    predictions_log = pd.Series(ts_log.iloc[0], index=ts_log.index)
    predictions_log = predictions_log.add(predictions_diff_cumsum, fill_value=0)
    return np.exp(predictions_log)
plt.figure(figsize=(20, 15))
plt.plot(ts)
plt.plot(transform_back(results_ARIMA), label='ARIMA')
# plt.plot(transform_back(results_MA), label='MA')
# plt.plot(transform_back(results_AR), label='AR')
plt.legend()
When the model is fit to log differences and transformed back into the original scale, the magnitudes of the fitted values are significantly dampened.
From this observation it was decided that the models will now be fit to the actual values rather than to the differenced values.
Several different forecasting methods were explored.
# Root mean squared error (the plot titles below report this as RMSE)
def score(y, yhat):
    return np.sqrt(np.mean((yhat - y)**2))
def forecast_cumulatively(ts):
    plt.figure(figsize=(50, 100))
    j = 0
    for year in range(2001, 2015):
        ts_train = ts[str(2000):str(year) + '-06']  # all data up to June of the current year
        ts_years = ts[str(2000):str(year)]
        ts_forecast_period = ts[str(year) + '-07': str(year) + '-12']
        model = ARIMA(ts_train, order=(6, 0, 0))
        results_ARIMA = model.fit(disp=-1)
        n_weeks = len(ts_forecast_period)
        forecast = results_ARIMA.forecast(n_weeks)[0]  # 6-month forecast
        forecast_series = pd.DataFrame({'forecast': forecast})
        forecast_series = forecast_series.set_index(ts_forecast_period.index)
        j += 1
        plt.subplot(8, 2, j)
        plt.plot(ts_years, label='us/uk exchange rate')
        plt.plot(results_ARIMA.fittedvalues, 'r--')
        plt.plot(forecast_series['forecast'], 'r--', label='forecast')
        plt.title('RMSE: %.4f' % score(ts_forecast_period, forecast), fontsize=30)
        plt.xticks(fontsize=30)
        plt.yticks(fontsize=30)
        plt.legend(fontsize=30, loc='best')

forecast_cumulatively(ts)
Forecasting 6 months ahead is hard! Although the RMSE is low for most years, the predictions are almost always just linear increases or decreases from the current value. The RMSE is highest for 2008, at the end of which there is a dramatic drop in the exchange rate due to the financial crisis.
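To put these numbers in context, a simple persistence (random-walk) baseline can be scored on the same splits. This is a sketch, and persistence_baseline is a name introduced here, not part of the original analysis:
def persistence_baseline(ts):
    # carry the last observed training value forward over the whole 6-month window
    for year in range(2001, 2015):
        ts_train = ts[str(2000):str(year) + '-06']
        ts_forecast_period = ts[str(year) + '-07': str(year) + '-12']
        naive = np.repeat(ts_train.iloc[-1], len(ts_forecast_period))
        print(year, 'persistence RMSE: %.4f' % score(ts_forecast_period.values, naive))

persistence_baseline(ts)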
def forecast_by_year(ts):
    plt.figure(figsize=(50, 100))
    j = 0
    for year in range(2001, 2015):
        ts_train = ts[str(year) + '-01':str(year) + '-09']  # first 9 months of the year
        ts_forecast_period = ts[str(year) + '-10': str(year) + '-12']
        model = ARIMA(ts_train, order=(5, 0, 0))
        results_ARIMA = model.fit(disp=-1)
        n_weeks = len(ts_forecast_period)
        forecast = results_ARIMA.forecast(n_weeks)[0]  # 3-month forecast
        forecast_series = pd.DataFrame({'forecast': forecast})
        forecast_series = forecast_series.set_index(ts_forecast_period.index)
        j += 1
        plt.subplot(8, 2, j)
        plt.plot(ts[str(year)], label='us/uk exchange rate')
        plt.plot(results_ARIMA.fittedvalues, 'g--', label='fitted values')
        plt.plot(forecast_series['forecast'], 'r--', label='forecast')
        plt.title('RMSE: %.4f' % score(ts_forecast_period, forecast), fontsize=30)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=30)
        plt.legend(fontsize=30, loc='best')

forecast_by_year(ts)
def forecast_by_year_libor(ts):
    plt.figure(figsize=(50, 100))
    j = 0
    for year in range(2001, 2015):
        ts_train = ts[str(year) + '-01':str(year) + '-09']  # first 9 months of the year
        ts_forecast_period = ts[str(year) + '-10': str(year) + '-12']
        # the exogenous Libor series must cover the same window as the training data
        libor_train = libor.loc[ts_train.index]
        model = ARIMA(ts_train, order=(5, 0, 0), exog=libor_train)
        results_ARIMA = model.fit(disp=-1)
        n_weeks = len(ts_forecast_period)
        # out-of-sample forecasts with an exog model require future exog values
        forecast = results_ARIMA.forecast(n_weeks, exog=libor.loc[ts_forecast_period.index])[0]  # 3-month forecast
        forecast_series = pd.DataFrame({'forecast': forecast})
        forecast_series = forecast_series.set_index(ts_forecast_period.index)
        j += 1
        plt.subplot(8, 2, j)
        plt.plot(ts[str(year)], label='us/uk exchange rate')
        plt.plot(results_ARIMA.fittedvalues, 'g--', label='fitted values')
        plt.plot(forecast_series['forecast'], 'r--', label='forecast')
        plt.title('RMSE: %.4f' % score(ts_forecast_period, forecast), fontsize=30)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=30)
        plt.legend(fontsize=30, loc='best')

forecast_by_year_libor(ts)
Inclusion of the Libor rate does not improve the RMSE.
The current model uses 5 lags and a first difference, with no moving-average terms.
# pass in the time series, the forecast horizon (in weeks) and the number of lags
def forecast_variable_horizon(ts, horizon, lags=5):
    # start predictions at 2006
    ts_2006 = ts['2006':]
    n_forecasts = len(ts_2006)
    actual_values = np.append(ts_2006.values, [0])
    # initialise empty arrays for the forecasts and their confidence intervals
    forecasts = np.zeros((n_forecasts + 1, horizon + 1))
    confidence_intervals = np.zeros((n_forecasts + 1, 2, horizon + 1))
    # use a 5-year rolling training window: current week minus 5 years
    five_years = datetime.timedelta(weeks=260)
    for week_i, week in enumerate(ts_2006.index):
        # take the past 5 years as the training set
        ts_train = ts[week - five_years: week]
        model = ARIMA(ts_train, order=(lags, 1, 0))
        results_ARIMA = model.fit(disp=-1)
        forecast = results_ARIMA.forecast(horizon)  # forecast over the full horizon
        forecast_vals = forecast[0]
        forecast_conf_intervals = forecast[2]
        for f in range(1, horizon + 1):
            if week_i <= n_forecasts - f:
                # store the f-step-ahead forecast against the week it refers to
                forecasts[week_i + f, f] = forecast_vals[f - 1]
                confidence_intervals[week_i + f, :, f] = forecast_conf_intervals[f - 1, :]
    # predictions start one week ahead, so the first forecast entry stays zero;
    # extend the index to hold the forecasts made beyond the last observation
    newtime = np.datetime64(ts_2006.index[-1] + datetime.timedelta(weeks=horizon))
    new_time_index = pd.DatetimeIndex(np.append(ts_2006.index, newtime))
    forecast_df = pd.DataFrame({'actual': actual_values}, index=new_time_index)
    for col in range(horizon):
        forecast_df[col + 1] = forecasts[:, col + 1]
    return forecast_df, confidence_intervals
forecast_variable_horizon_df, confidence_intervals = forecast_variable_horizon(ts, 2)
forecast_variable_horizon_df.head()
horizon = 2
plt.figure(figsize=(50, 80))
j = 0
for fc in range(1, horizon + 1):
    j += 1
    plt.subplot(3, 1, j)
    plt.plot(forecast_variable_horizon_df['actual'][:-1], 'g', linewidth=2, label='actual values')
    plt.plot(forecast_variable_horizon_df.index[fc:], forecast_variable_horizon_df[fc][fc:], 'r--', linewidth=2, label=str(fc) + ' week forecast')
    actual = forecast_variable_horizon_df['actual'][fc:]
    # confidence bounds for this horizon
    lower = confidence_intervals[:, :, fc][fc:, 0]
    upper = confidence_intervals[:, :, fc][fc:, 1]
    plt.plot(forecast_variable_horizon_df.index[fc:], lower, 'c--', label='lower confidence bound')
    plt.plot(forecast_variable_horizon_df.index[fc:], upper, 'c--', label='upper confidence bound')
    predicted = forecast_variable_horizon_df[fc][fc:]
    rmse = score(actual, predicted)
    r2 = r2_score(actual, predicted)
    plt.title('Forecast horizon ' + str(fc) + ' weeks: RMSE: %.4f' % rmse + ' R2: %.4f' % r2, fontsize=30)
    plt.xticks(fontsize=30)
    plt.yticks(fontsize=30)
    plt.legend(fontsize=30, loc='best')
The RMSE and R2 values for forecast horizons of 1 and 2 weeks show that the predictive accuracy decreases considerably as you try to predict more time steps into the future.
ts_train = ts['2002':'2007']
model = ARIMA(ts_train, order=(5, 1, 0))
results_ARIMA = model.fit(disp=-1)
results_ARIMA.summary()
Looking at the coefficients of the fitted model, the coefficient for the most recent lag is not much higher than those for the other lags, which suggests the model is not simply predicting from the previous value.
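One way to look at this is to plot the AR coefficients directly (assuming the ar.L# parameter naming used by the older statsmodels ARIMA; adjust the prefix if your version labels them differently):
ar_params = results_ARIMA.params[results_ARIMA.params.index.str.startswith('ar.')]
ar_params.plot(kind='bar')  # one bar per lag coefficient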
Specify a time window to plot so the prediction and the actual data can be compared more closely:
forecast_window = forecast_variable_horizon_df['2007':'2009']
forecast_window.head()
def plot_prediction_window(forecast_variable_horizon_df, start_year, end_year):
    plt.figure(figsize=(20, 10))
    start_index = len(forecast_variable_horizon_df['actual']) - len(forecast_variable_horizon_df['actual'][start_year:])
    end_index = start_index + len(forecast_variable_horizon_df['actual'][start_year:end_year])
    # slice the predictions dataframe
    forecast_window = forecast_variable_horizon_df[start_year:end_year]
    # set forecast horizon
    fc = 2
    # plot the actual values and the forecast
    plt.plot(forecast_window['actual'][:-1], 'g', linewidth=2, label='actual values')
    plt.plot(forecast_window.index, forecast_window[fc], 'r--', linewidth=2, label=str(fc) + ' week forecast')
    # plot the confidence bounds
    lower = confidence_intervals[:, :, fc][start_index:end_index, 0]
    upper = confidence_intervals[:, :, fc][start_index:end_index, 1]
    plt.plot(forecast_window.index, lower, 'c--', label='lower confidence bound')
    plt.plot(forecast_window.index, upper, 'c--', label='upper confidence bound')
    predicted = forecast_window[fc]
    actual = forecast_window['actual']
    # score the predictions
    rmse = score(actual, predicted)
    r2 = r2_score(actual, predicted)
    # set plot title and legend
    plt.title('Forecast horizon ' + str(fc) + ' weeks: RMSE: %.4f' % rmse + ' R2: %.4f' % r2, fontsize=20)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.legend(fontsize=20, loc='best')
The residuals should be normally distributed with mean 0, and their autocorrelation function should show no correlation between lagged residuals:
forecast_window = forecast_variable_horizon_df['2006':'2007']
plt.hist(forecast_window['actual'] - forecast_window[1], bins=100)
plt.xlim([-0.2, 0.2])
residuals = forecast_window['actual'] - forecast_window[1]
lag_acf = acf(residuals, nlags=20)
plt.figure()
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(residuals)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(residuals)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
There is no significant correlation between residuals at different time steps.
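The normality assumption can be checked with the qqplot helper imported at the top; points close to the reference line indicate approximately normal residuals:
qqplot(residuals, line='q')
plt.show()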
start_year = '2008'
end_year = '2010'
plot_prediction_window(forecast_variable_horizon_df, start_year, end_year)
start_year = '2016'
end_year = '2016'
plot_prediction_window(forecast_variable_horizon_df, start_year, end_year)
start_year = '2013-04'
end_year = '2015-02'
plot_prediction_window(forecast_variable_horizon_df, start_year, end_year)
The predictions appear to lag the series: each forecast sits close to the previous week's observed value.
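A quick way to quantify this, assuming the column layout produced by forecast_variable_horizon above, is to correlate the 1-week forecast with the actual series shifted by one week:
lagged_actual = forecast_variable_horizon_df['actual'].shift(1)
aligned = pd.concat([forecast_variable_horizon_df[1], lagged_actual], axis=1).dropna()
print(aligned.corr())  # a near-1 correlation suggests the forecast mostly echoes last week's value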
Same concept as before, but the number of lags is varied: the rolling-window model is refit for each candidate lag value and the R2 of the one-week-ahead forecasts is computed. The lag value with the highest R2 can then be selected.
def roll_forward_cross_validate(ts, num_lags):
    scores = np.zeros(num_lags)
    for lag in range(1, num_lags + 1):
        # one-week-ahead rolling forecasts with `lag` AR terms
        forecast_df, conf_int = forecast_variable_horizon(ts, 1, lag)
        scores[lag - 1] = r2_score(forecast_df['actual'][1:], forecast_df[1][1:])
        print('lag', lag, 'score', scores[lag - 1])
    return scores
scores = roll_forward_cross_validate(ts, 5)
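The best lag can then be read off directly:
best_lag = int(np.argmax(scores)) + 1  # scores[i] holds the R2 for lag i+1
print('best lag:', best_lag, 'R2: %.4f' % scores[best_lag - 1])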