Predictiong with SLM

In this notebook we compare how accurate is a SVM prediction using SLM as its features when the latter are normalized (its values are only 0, 1) in opposition to the current state where its values are 0 and 256.


In [1]:
import numpy as np
import h5py
from sklearn import svm, cross_validation, preprocessing

We load the file


In [2]:
# First we load the file 
file_location = '../results_database/text_wall_street_big.hdf5'
run_name = '/low-resolution'
f = h5py.File(file_location, 'r')


# Now we need to get the letters and align them
text_directory = '../data/wall_street_letters.npy'
letters_sequence = np.load(text_directory)
Nletters = len(letters_sequence)
symbols = set(letters_sequence)

# Nexa parameters
Nspatial_clusters = 5
Ntime_clusters = 15
Nembedding = 3

parameters_string = '/' + str(Nspatial_clusters)
parameters_string += '-' + str(Ntime_clusters)
parameters_string += '-' + str(Nembedding)

nexa = f[run_name + parameters_string]

In [3]:
delay = 4
N = 5000
cache_size = 1000

Accuracy with non-normalized SLM


In [4]:
# Exctrat and normalized SLM
SLM = np.array(f[run_name]['SLM'])

In [5]:
print('Standarized')
X = SLM[:,:(N - delay)].T
y = letters_sequence[delay:N]
# We now scale X
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)

clf_linear = svm.SVC(C=1.0, cache_size=cache_size, kernel='linear')
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)
print('Score in linear', score)

clf_rbf = svm.SVC(C=1.0, cache_size=cache_size, kernel='rbf')
clf_rbf.fit(X_train, y_train)
score = clf_rbf.score(X_test, y_test) * 100.0
print('Score in rbf', score)

print('Not standarized')
X = SLM[:,:(N - delay)].T
y = letters_sequence[delay:N]

# We now scale X
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)

clf_linear = svm.SVC(C=1.0, cache_size=cache_size, kernel='linear')
clf_linear.fit(X_train, y_train)
score = clf_linear.score(X_test, y_test) * 100.0
print('Score in linear', score)

clf_rbf = svm.SVC(C=1.0, cache_size=cache_size, kernel='linear')
clf_rbf.fit(X_train, y_train)
score = clf_rbf.score(X_test, y_test) * 100.0
print('Score in rbf', score)


Standarized
Score in linear 99.0
Score in rbf 98.4
Not standarized
Score in linear 99.2
Score in rbf 99.2

Accuracy with normalized SLM


In [6]:
# Exctrat and normalized SLM
SLM = np.array(f[run_name]['SLM'])
SLM[SLM < 200] = 0
SLM[SLM >= 200] = 1

In [7]:
print('Standarized')
X = SLM[:,:(N - delay)].T
y = letters_sequence[delay:N]
# We now scale X
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)

clf_linear = svm.SVC(C=1.0, cache_size=cache_size, kernel='linear')
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)
print('Score in linear', score)

clf_rbf = svm.SVC(C=1.0, cache_size=cache_size, kernel='rbf')
clf_rbf.fit(X_train, y_train)
score = clf_rbf.score(X_test, y_test) * 100.0
print('Score in rbf', score)

print('Not standarized')
X = SLM[:,:(N - delay)].T
y = letters_sequence[delay:N]

# We now scale X
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.10)

clf_linear = svm.SVC(C=1.0, cache_size=cache_size, kernel='linear')
clf_linear.fit(X_train, y_train)
score = clf_linear.score(X_test, y_test) * 100.0
print('Score in linear', score)

clf_rbf = svm.SVC(C=1.0, cache_size=cache_size, kernel='linear')
clf_rbf.fit(X_train, y_train)
score = clf_rbf.score(X_test, y_test) * 100.0
print('Score in rbf', score)


Standarized
Score in linear 99.4
Score in rbf 98.2
Not standarized
Score in linear 99.0
Score in rbf 99.0

In [ ]: