In [15]:
import numpy as np
import h5py
from sklearn import svm, cross_validation
from sklearn.naive_bayes import MultinomialNB
In [16]:
# First we load the file
file_location = '../results_database/text_wall_street_big.hdf5'
f = h5py.File(file_location, 'r')
# Now we need to get the letters and align them
text_directory = '../data/wall_street_letters.npy'
letters_sequence = np.load(text_directory)
Nletters = len(letters_sequence)
symbols = set(letters_sequence)
# Load the particular example
Nspatial_clusters = 5
Ntime_clusters = 15
Nembedding = 3
run_name = '/low-resolution'
parameters_string = '/' + str(Nspatial_clusters)
parameters_string += '-' + str(Ntime_clusters)
parameters_string += '-' + str(Nembedding)
nexa = f[run_name + parameters_string]
# Now we load the time and the code vectors
time = nexa['time']
code_vectors = nexa['code-vectors']
code_vectors_distance = nexa['code-vectors-distance']
code_vectors_softmax = nexa['code-vectors-softmax']
code_vectors_winner = nexa['code-vectors-winner']
Make the prediction for each delay. This takes a bit
In [18]:
N = 50000 # Amount of data
delays = np.arange(0, 10)
accuracy = []
# Make prediction with scikit-learn
for delay in delays:
X = code_vectors_winner[:(N - delay)]
y = letters_sequence[delay:N]
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)
clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) * 100.0
accuracy.append(score)
print('delay', delay)
print('score', score)
In [19]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.plot(delays, accuracy, 'o-', lw=2, markersize=10)
plt.xlabel('Delays')
plt.ylim([0, 105])
plt.xlim([-0.5, 10])
plt.ylabel('Accuracy %')
plt.title('Delays vs Accuracy')
fig = plt.gcf()
fig.set_size_inches((12, 9))
In [20]:
from sklearn import preprocessing
In [21]:
N = 50000 # Amount of data
delays = np.arange(0, 10)
accuracy_std = []
# Make prediction with scikit-learn
for delay in delays:
X = code_vectors_winner[:(N - delay)]
y = letters_sequence[delay:N]
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)
clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) * 100.0
accuracy_std.append(score)
print('delay', delay)
print('score', score)
In [22]:
plt.plot(delays, accuracy, 'o-', lw=2, markersize=10., label='Accuracy')
plt.plot(delays, accuracy_std, 'o-', lw=2, markersize=10, label='Standarized Representations')
plt.xlabel('Delays')
plt.ylim([0, 105])
plt.xlim([-0.5, 10])
plt.ylabel('Accuracy %')
plt.title('Delays vs Accuracy')
fig = plt.gcf()
fig.set_size_inches((12, 9))
plt.legend()
Out[22]:
In [23]:
accuracy_softmax = []
# Make prediction with scikit-learn
for delay in delays:
X = code_vectors_softmax[:(N - delay)]
y = letters_sequence[delay:N]
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)
clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) * 100.0
accuracy_softmax.append(score)
print('delay', delay)
print('score', score)
In [24]:
accuracy_softmax_std = []
# Make prediction with scikit-learn
for delay in delays:
X = code_vectors_winner[:(N - delay)]
y = letters_sequence[delay:N]
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)
clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) * 100.0
accuracy_softmax_std.append(score)
print('delay', delay)
print('score', score)
In [26]:
plt.plot(delays, accuracy_softmax, 'o-', lw=2, markersize=10., label='Accuracy')
plt.plot(delays, accuracy_softmax_std, 'o-', lw=2, markersize=10, label='Standarized Representations')
plt.xlabel('Delays')
plt.ylim([0, 105])
plt.xlim([-0.5, 10])
plt.ylabel('Accuracy %')
plt.title('Delays vs Accuracy (Softmax)')
fig = plt.gcf()
fig.set_size_inches((12, 9))
plt.legend()
Out[26]: