import numpy as np
import pandas as pd
#Sci-Kit Learn Classification Trees
import sklearn as skl
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import tree
import matplotlib
from matplotlib import pyplot as plt
from google.colab import drive
# Mount Google Drive at /content/gdrive; the CSV read below lives under that path.
drive.mount('/content/gdrive')
I was able to pull all the data off of the Bank of Canada website found here.
# Load the raw yield export from the mounted Drive folder.
raw_dataset = pd.read_csv(
    '/content/gdrive/My Drive/Canadian Bond Yields/April2010-April2018.csv',
)
print('Successfully Imported :')
# Trailing expression: in a notebook cell this renders the first five rows.
raw_dataset.head(n=5)
#Reserved.
Let's try simply predicting the direction of the next day's price action on a single 2yr Bond.
This shouldn't return predictions that are accurate more than 65% of the time. Otherwise the Canadian Bond Market really is a joke.
Below are the visualizations of the yield curve as well as the individual return of the 2yr(?) bond. Not to scale.
Things look miserable here in Canada. Bonds are selling like mad. 2yr down to +0.25%. Yikes.
# Keep the 'Date' column plus the five V3905x series, flip the row order by
# sorting the index descending, then renumber the rows. reset_index() keeps the
# old labels in a new 'index' column (removed again further down).
input_data = (
    pd.DataFrame(raw_dataset[['Date', 'V39051', 'V39052', 'V39053', 'V39054', 'V39055']])
    .sort_index(axis=0, ascending=False)
    .reset_index()
)
# Trailing expression: in a notebook cell this renders the first rows.
input_data.head()
# Coerce each series column to numeric; anything unparseable becomes NaN
# (errors='coerce') and the rows containing NaN are then discarded.
for series_id in ('V39051', 'V39052', 'V39053', 'V39054', 'V39055'):
    input_data[series_id] = pd.to_numeric(input_data[series_id], errors='coerce')
input_data = input_data.dropna()
# Build the class label in place of the (no longer needed) 'Date' column:
# +1 by default, -1 where the previous row's V39051 value is higher than this
# row's. The first row (no previous value) and unchanged rows stay at +1.
# NOTE(review): whether "previous row" is the earlier or the later calendar day
# depends on the row order of the source CSV -- confirm against the data.
input_data['Date'] = 1
previous_row_yield = input_data.V39051.shift(1)
# Write through one .loc[rows, column] call. Chained indexing of the form
# frame[column].loc[rows] = value may modify a temporary copy instead of the
# frame itself, so the labels could silently stay at their default.
input_data.loc[previous_row_yield > input_data.V39051, 'Date'] = -1
input_data.loc[previous_row_yield < input_data.V39051, 'Date'] = 1

# Separate the labels from the features and drop the helper columns.
validation = input_data['Date']
input_data = input_data.drop(['Date'], axis=1)
input_data = input_data.drop(['index'], axis=1)

# Ordered (unshuffled) split: first 85% of rows for training, the rest held out.
split = int(0.85 * (len(input_data)))
x_train, x_test, y_train, y_test = input_data[:split], input_data[split:], validation[:split], validation[split:]
print(x_train[30:45])
print(y_train[30:45])
# Figure 0: all feature columns, training rows followed by held-out rows.
plt.figure(0)
plt.plot(x_train)
plt.plot(x_test)

# Figure 1: running sum of the row-to-row percentage change of V39051.
r_class = 100 * (1 - (input_data['V39051'] / input_data['V39051'].shift(1)))
plt.figure(1)
plt.plot(r_class.cumsum())
# Fit three classifiers on the training rows: a single decision tree,
# a random forest, and a passive-aggressive linear model with default settings.
dtc = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=10,
    min_samples_split=110,
    max_features='log2',
)
dtc.fit(x_train, y_train)

rfc = RandomForestClassifier(
    n_estimators=10000,
    criterion='gini',
    max_depth=10,
    min_samples_split=90,
    max_features='log2',
)
rfc.fit(x_train, y_train)

pac = PassiveAggressiveClassifier()
pac.fit(x_train, y_train)

# Report accuracy on the held-out rows for each fitted model.
for label, fitted_model in (
    ("PassiveAggressiveClassifier Accuracy:", pac),
    ("RandomForestClassifier Accuracy:", rfc),
    ("DecisionTreeClassifier Accuracy:", dtc),
):
    print(label, accuracy_score(y_test, fitted_model.predict(x_test)))
# Combine the three models into a voting ensemble (default voting settings),
# fit it on the training rows and report its held-out accuracy.
vc = VotingClassifier(
    estimators=[
        ('rfc', rfc),
        ('pac', pac),
        ('dtc', dtc),
    ],
)
vc.fit(x_train, y_train)
print("VotingClassifier Accuracy:", accuracy_score(y_test, vc.predict(x_test)))
print(x_test.dtypes)

# Predicted labels for the held-out rows from the voting ensemble and from the
# single decision tree. (Both are classifiers, despite the "regress" naming.)
predict_class = vc.predict(x_test)
predict_regress = dtc.predict(x_test)
p_class = pd.DataFrame(predict_class)
p_regress = pd.DataFrame(predict_regress)
print('Here we start')
print(p_class.head(5))
# Column 0 holds the raw predictions; 'true' is the same signal as float.
p_class['true'] = p_class[0].astype('float64')
p_regress['true'] = p_regress[0].astype('float64')

# Row-to-row percentage change of V39051 over the held-out rows only.
# The prediction frames are indexed 0..len(x_test)-1 and pandas multiplies by
# index label, so the returns must come from x_test and carry the same 0-based
# index; building them from the full dataset would pair each prediction with a
# training-period row instead.
# NOTE(review): the labels were derived with shift(1) while this return uses
# shift(-1) -- confirm the two refer to the same day-to-day move.
test_yield = x_test.V39051.reset_index(drop=True)
test_return = (1 - (test_yield / test_yield.shift(-1))) * 100
return_class = test_return * p_class['true']
return_regress = test_return * p_regress['true']
print(return_regress.head(25))

# Each chart gets a fresh figure so nothing is drawn over earlier plots.
# Cumulative signal-weighted return: ensemble (blue) vs decision tree (cyan).
plt.figure()
plt.plot(return_class.cumsum(), color='blue')
plt.plot(return_regress.cumsum(), color='cyan')

# Cumulative unweighted change of V39051 over the held-out rows, for comparison.
bond_root_return = (1 - (x_test.V39051 / x_test.V39051.shift(1))) * 100
plt.figure()
plt.plot(bond_root_return.cumsum())

# Raw predicted signals over the held-out rows.
plt.figure()
plt.plot(p_class['true'])
plt.figure()
plt.plot(p_regress['true'])
#tree.plot_tree(cls)
Seems like classifiers won't produce reliable prediction models for the movement of the 2yr bond when given the previous curves.
It is interesting that it can consistently outperform a buy and hold strategy when Bonds are tanking. I wonder if it'll underperform as the Bonds get bid up.
I will leave this notebook here. I intend to use the Canadian Bond Yield Data in conjunction with some other economic data in a future endeavour. For now this was a fun way to brush up on SKLearn and make sure that the data I had procured was workable.