Prediction of text with Nexa Letter Latency.

This notbook is for seeing how much the delay between the code vector and the code is related to the accuaracy of the prediciton.


In [15]:
import numpy as np
import h5py
from sklearn import svm, cross_validation
from sklearn.naive_bayes import MultinomialNB

Load the data


In [16]:
# First we load the file 
file_location = '../results_database/text_wall_street_big.hdf5'
f = h5py.File(file_location, 'r')

# Now we need to get the letters and align them
text_directory = '../data/wall_street_letters.npy'
letters_sequence = np.load(text_directory)
Nletters = len(letters_sequence)
symbols = set(letters_sequence)

# Load the particular example
Nspatial_clusters = 5
Ntime_clusters = 15
Nembedding = 3

run_name = '/low-resolution'
parameters_string = '/' + str(Nspatial_clusters)
parameters_string += '-' + str(Ntime_clusters)
parameters_string += '-' + str(Nembedding)

nexa = f[run_name + parameters_string]

# Now we load the time and the code vectors
time = nexa['time']
code_vectors = nexa['code-vectors']
code_vectors_distance = nexa['code-vectors-distance']
code_vectors_softmax = nexa['code-vectors-softmax']
code_vectors_winner = nexa['code-vectors-winner']

Study the Latency of the Data by Accuracy

Make prediction with winner takes all

Make the prediction for each delay. This takes a bit


In [18]:
N = 50000  # Amount of data
delays = np.arange(0, 10)
accuracy = []

# Make prediction with scikit-learn
for delay in delays:
    X = code_vectors_winner[:(N - delay)]
    y = letters_sequence[delay:N]
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)

    clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test) * 100.0
    accuracy.append(score)
    print('delay', delay)
    print('score', score)


delay 0
score 25.22
delay 1
score 61.0
delay 2
score 64.86
delay 3
score 71.44
delay 4
score 70.72
delay 5
score 85.86
delay 6
score 28.86
delay 7
score 19.66
delay 8
score 19.0
delay 9
score 17.94

Plot it


In [19]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.plot(delays, accuracy, 'o-', lw=2, markersize=10)
plt.xlabel('Delays')
plt.ylim([0, 105])
plt.xlim([-0.5, 10])
plt.ylabel('Accuracy %')
plt.title('Delays vs Accuracy')
fig = plt.gcf()
fig.set_size_inches((12, 9))


Make predictions with representation standarization


In [20]:
from sklearn import preprocessing

In [21]:
N = 50000  # Amount of data
delays = np.arange(0, 10)
accuracy_std = []

# Make prediction with scikit-learn
for delay in delays:
    X = code_vectors_winner[:(N - delay)]
    y = letters_sequence[delay:N]
    X = preprocessing.scale(X)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)

    clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test) * 100.0
    accuracy_std.append(score)
    print('delay', delay)
    print('score', score)


delay 0
score 26.6
delay 1
score 62.22
delay 2
score 65.38
delay 3
score 70.92
delay 4
score 69.74
delay 5
score 85.82
delay 6
score 30.32
delay 7
score 20.94
delay 8
score 18.04
delay 9
score 18.1

Plot it


In [22]:
plt.plot(delays, accuracy, 'o-', lw=2, markersize=10., label='Accuracy')
plt.plot(delays, accuracy_std, 'o-', lw=2, markersize=10, label='Standarized Representations')
plt.xlabel('Delays')
plt.ylim([0, 105])
plt.xlim([-0.5, 10])
plt.ylabel('Accuracy %')
plt.title('Delays vs Accuracy')
fig = plt.gcf()
fig.set_size_inches((12, 9))
plt.legend()


Out[22]:
<matplotlib.legend.Legend at 0x7f6200f37c88>

Make prediction with softmax


In [23]:
accuracy_softmax = []

# Make prediction with scikit-learn
for delay in delays:
    X = code_vectors_softmax[:(N - delay)]
    y = letters_sequence[delay:N]
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)

    clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test) * 100.0
    accuracy_softmax.append(score)
    print('delay', delay)
    print('score', score)


delay 0
score 26.28
delay 1
score 60.68
delay 2
score 65.08
delay 3
score 70.54
delay 4
score 68.08
delay 5
score 85.8
delay 6
score 28.9
delay 7
score 19.4
delay 8
score 18.0
delay 9
score 18.9

Standarized predictions with softmax


In [24]:
accuracy_softmax_std = []

# Make prediction with scikit-learn
for delay in delays:
    X = code_vectors_winner[:(N - delay)]
    y = letters_sequence[delay:N]
    X = preprocessing.scale(X)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)

    clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test) * 100.0
    accuracy_softmax_std.append(score)
    print('delay', delay)
    print('score', score)


delay 0
score 26.38
delay 1
score 61.46
delay 2
score 65.96
delay 3
score 71.38
delay 4
score 70.12
delay 5
score 86.02
delay 6
score 29.34
delay 7
score 19.5
delay 8
score 17.92
delay 9
score 18.48

In [26]:
plt.plot(delays, accuracy_softmax, 'o-', lw=2, markersize=10., label='Accuracy')
plt.plot(delays, accuracy_softmax_std, 'o-', lw=2, markersize=10, label='Standarized Representations')
plt.xlabel('Delays')
plt.ylim([0, 105])
plt.xlim([-0.5, 10])
plt.ylabel('Accuracy %')
plt.title('Delays vs Accuracy (Softmax)')
fig = plt.gcf()
fig.set_size_inches((12, 9))
plt.legend()


Out[26]:
<matplotlib.legend.Legend at 0x7f620129c940>