In [1]:
import numpy as np
import h5py
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
In [2]:
# First we load the file
file_location = '../results_database/text_wall_street_big.hdf5'
f = h5py.File(file_location, 'r')
# Now we need to get the letters and align them
text_directory = '../data/wall_street_letters.npy'
letters_sequence = np.load(text_directory)
Nletters = len(letters_sequence)
symbols = set(letters_sequence)
# Load the particular example
Nspatial_clusters = 5
Ntime_clusters = 15
Nembedding = 3
run_name = '/low-resolution'
parameters_string = '/' + str(Nspatial_clusters)
parameters_string += '-' + str(Ntime_clusters)
parameters_string += '-' + str(Nembedding)
nexa = f[run_name + parameters_string]
# Now we load the time and the code vectors
time = nexa['time']
code_vectors = nexa['code-vectors']
code_vectors_distance = nexa['code-vectors-distance']
code_vectors_softmax = nexa['code-vectors-softmax']
code_vectors_winner = nexa['code-vectors-winner']
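Before training anything, it helps to check what the selected run actually contains. The cell below is a minimal sketch that only inspects the names and shapes of the datasets loaded above; the exact values depend on the stored run.
In [ ]:
# Quick sanity check of the loaded run (a minimal sketch; shapes depend on the stored data)
print('Datasets in this run:', list(nexa.keys()))
print('code-vectors-winner shape:', code_vectors_winner.shape)
print('Number of letters in the text:', Nletters)
print('Alphabet size:', len(symbols))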
In [14]:
# Log-spaced amounts of data, 30 values from 10^2 to 10^5 samples
data_amounts = np.logspace(2, 5, 30, dtype='int')
delay = 4
accuracy = []
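The delay parameter shifts the targets forward in time: the code vector at position t is used to predict the letter at position t + delay. A minimal illustration of that alignment follows; N_example is a hypothetical value chosen only for display.
In [ ]:
# Illustration of the delay alignment used in the loop below (N_example is hypothetical)
N_example = 10
X_example = code_vectors_winner[:(N_example - delay)]   # code vectors at positions 0..5
y_example = letters_sequence[delay:N_example]           # letters at positions 4..9
print('feature rows:', X_example.shape[0], 'target letters:', len(y_example))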
In [15]:
# Make predictions with scikit-learn for increasing amounts of data
for N in data_amounts:
    # Pair the code vector at position t with the letter that appears delay steps later
    X = code_vectors_winner[:(N - delay)]
    y = letters_sequence[delay:N]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)
    clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test) * 100.0
    print('N_data', N)
    print('SVM score', score)
    accuracy.append(score)
print('Analysis done')
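MultinomialNB is imported above but never used. As an optional comparison, one could repeat the evaluation at the largest data amount with a naive Bayes baseline. This is only a sketch, not part of the original analysis, and it assumes the winner-take-all code vectors are non-negative, which MultinomialNB requires.
In [ ]:
# Optional naive Bayes baseline (a sketch, not part of the original analysis)
N = data_amounts[-1]
X = code_vectors_winner[:(N - delay)]
y = letters_sequence[delay:N]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)
nb = MultinomialNB()
nb.fit(X_train, y_train)
print('Naive Bayes score', nb.score(X_test, y_test) * 100.0)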
In [25]:
# Plot this
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.plot(data_amounts, accuracy, 'o-', lw=2, markersize=10)
plt.xlabel('Total Data')
plt.ylabel('Accuracy %')
plt.title('Accuracy vs Amount of Data')
plt.ylim([0, 110])
plt.xscale('log')
fig = plt.gcf()
fig.set_size_inches((12, 9))
In [ ]: