In [1]:
import numpy as np
import h5py
from sklearn import svm, cross_validation
In [2]:
# Open the HDF5 results database read-only. The handle `f` is kept open at
# notebook level because the sweep cell reads its groups from it.
file_location = '../results_database/text_wall_street_big.hdf5'
f = h5py.File(file_location, mode='r')

# Load the letter sequence that the stored code vectors are aligned against.
text_directory = '../data/wall_street_letters.npy'
letters_sequence = np.load(text_directory)

# Summary of the text: the distinct symbols and the total number of letters.
symbols = set(letters_sequence)
Nletters = len(letters_sequence)
In [3]:
# Experiment configuration
N = 5000  # Number of samples of the sequence that are used
delay = 5  # Offset, in letters, between an input sample and its target letter
Nembedding = 3  # Embedding value that forms part of the stored run names

# Parameter grid to sweep
time_clustering_collection = np.arange(5, 55, 5)
spatial_clustering_collection = np.arange(3, 11, 1)
N_time_clusters = len(time_clustering_collection)
N_spatial_clusters = len(spatial_clustering_collection)

# One accuracy entry per (time clusters, spatial clusters) combination
accuracy = np.zeros(shape=(N_time_clusters, N_spatial_clusters))
In [4]:
# Sweep the parameter grid. For every stored run a linear SVM is trained to
# predict the letter `delay` steps ahead from the winner code vectors, and the
# held-out accuracy (in percent) is written into `accuracy`.
run_name = '/low-resolution'

# Fixed seed so the train/test split is reproducible between runs and is not
# an extra source of noise when comparing grid points.
split_seed = 0

# The targets do not depend on the clustering parameters, so build them once.
y = letters_sequence[delay:N]

for spatial_index, Nspatial_clusters in enumerate(spatial_clustering_collection):
    for time_index, Ntime_clusters in enumerate(time_clustering_collection):
        # Group name: '/<spatial clusters>-<time clusters>-<embedding>'
        parameters_string = '/' + str(Nspatial_clusters)
        parameters_string += '-' + str(Ntime_clusters)
        parameters_string += '-' + str(Nembedding)
        nexa = f[run_name + parameters_string]

        # Now we load the time and the code vectors.
        # Only the winner representation is used for the prediction below;
        # the other handles are kept so they stay available for inspection.
        code_vectors = nexa['code-vectors']
        code_vectors_distance = nexa['code-vectors-distance']
        code_vectors_softmax = nexa['code-vectors-softmax']
        code_vectors_winner = nexa['code-vectors-winner']

        # Make prediction with scikit-learn: sample i is paired with letter i + delay
        X = code_vectors_winner[:(N - delay)]
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            X, y, test_size=0.10, random_state=split_seed)

        clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test) * 100.0

        print(parameters_string)
        print('SVM score', score)

        # Rows are indexed by time clusters, columns by spatial clusters
        accuracy[time_index, spatial_index] = score
In [5]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
colormap = 'Blues'
origin = 'lower'
interpolation = 'none'
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(12, 9))
ax = plt.subplot(gs[:, :])
im = ax.imshow(accuracy, origin=origin, interpolation=interpolation, aspect='auto',
extent=[0, Ntime_clusters, 0, Nspatial_clusters], vmin=0, vmax=100,
cmap=colormap)
fig.colorbar(im)
ax.set_xlabel('Data Clusters')
ax.set_ylabel('Spatio Temporal Clusters')
ax.set_title('Accuracy as a function of Nexa parameters')
Out[5]:
In [22]:
import seaborn as sns

# Column indices of `accuracy` (i.e. positions in spatial_clustering_collection)
# whose curves are drawn.
value1 = 0
value2 = 3
value3 = 7
print(accuracy.shape)

fig = plt.figure(figsize=(16, 12))
ax = fig.add_subplot(111)
for column in (value1, value2, value3):
    # Take the legend value from the grid itself so the label stays correct
    # if spatial_clustering_collection is changed.
    ax.plot(time_clustering_collection, accuracy[:, column], 'o-', lw=2, markersize=10,
            label='Nst_clusters=' + str(spatial_clustering_collection[column]))

ax.set_xlim(-5, 55)
ax.set_ylim(0, 110)
ax.set_xlabel('Data Clusters')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Sample curves from the matrix as a function of data clusters')
ax.legend()
Out[22]:
In [30]:
# Row indices of `accuracy` (i.e. positions in time_clustering_collection)
# whose curves are drawn.
value1 = 1
value2 = 3
value3 = 7
print(accuracy.shape)

fig = plt.figure(figsize=(16, 12))
ax = fig.add_subplot(111)
for row in (value1, value2, value3):
    # Take the legend value from the grid itself so the label stays correct
    # if time_clustering_collection is changed.
    ax.plot(spatial_clustering_collection, accuracy[row, :], 'o-', lw=2, markersize=10,
            label='Ndata_clusters=' + str(time_clustering_collection[row]))

# Leave room to the right of the largest value so its marker is not cut off
# by the axis border.
ax.set_xlim(0, spatial_clustering_collection[-1] + 1)
ax.set_ylim(0, 110)
ax.set_xlabel('Spatio Temporal Clusters')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Sample curves from the matrix as a function of number of spatio temporal clusters')
ax.legend()
Out[30]:
In [9]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
sns.set(rc={'image.cmap': 'inferno'})
%matplotlib inline
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(12, 9))
ax = plt.subplot(gs[:, :])
im = ax.imshow(accuracy, origin='lower', interpolation='none', aspect='auto',
extent=[0, Ntime_clusters, 0, Nspatial_clusters], vmin=0, vmax=100)
fig.colorbar(im)
ax.set_xlabel('Data Clusters')
ax.set_ylabel('Spatio Temporal Clusters')
ax.set_title('Accuracy as a function of Nexa parameters')
Out[9]:
In [ ]: