In [20]:
import h5py
import numpy
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
In [12]:
# Load the feature matrices from the chorales HDF5 file.
# `dataset[()]` reads a whole dataset into memory; the older `dataset.value`
# accessor is deprecated and is no longer available in h5py 3.x.
with h5py.File('data/chorales.hdf5', "r", libver='latest') as f:
    Xtrain = f['Xtrain'][()]
    Xdev = f['Xdev'][()]
    ydev = f['ydev'][()]
    Xtest = f['Xtest'][()]
    # 'ytrain' / 'ytest' are deliberately not read from this file: both names
    # are rebound from chorales_sm.hdf5 below before anything uses them.

# Keep only the first 10 columns of every split.
Xtrain, Xdev, Xtest = Xtrain[:, :10], Xdev[:, :10], Xtest[:, :10]

# The dev rows are folded into the training matrix; Xall additionally stacks
# the test rows underneath (train + dev + test).
Xtrain = numpy.vstack((Xtrain, Xdev))
Xall = numpy.vstack((Xtrain, Xtest))

# Labels come from a second file.
# NOTE(review): presumably 'ytrainfeat' already covers train + dev rows, since
# it is paired with the stacked Xtrain -- the assert below checks the lengths.
with h5py.File('data/chorales_sm.hdf5', "r", libver='latest') as f:
    ytrain = f['ytrainfeat'][()]
    ytest = f['ytestfeat'][()]
    yall = f['yallfeat'][()]

# Fail here, next to the loading code, rather than later inside a model fit.
assert len(Xtrain) == len(ytrain), "Xtrain and ytrain row counts differ"

Xtrain.shape, Xall.shape
Out[12]:
In [13]:
# Build the "seen labels only" test subset: keep the test rows whose label
# also occurs somewhere in the training labels.
assert len(Xtest) == len(ytest)

# One vectorised membership test instead of a Python-level `x in ytrain` scan
# per test row. ravel() assumes there is exactly one label per test row --
# TODO confirm against the layout of 'ytestfeat'.
label_seen_in_train = numpy.isin(numpy.ravel(ytest), ytrain)

# Boolean-mask indexing keeps plain ndarrays (NumPy discourages numpy.matrix)
# and yields a well-formed empty (0, n_columns) array when no label matches.
Xtestex = numpy.asarray(Xtest)[label_seen_in_train]
ytestex = numpy.asarray(ytest)[label_seen_in_train]
In [16]:
# One-hot encode the categorical feature columns. The encoder is fitted on
# Xall so that every value occurring in any split gets its own output column.
encoder = OneHotEncoder()
encoder.fit(Xall)
Xtrainsparse, Xtestsparse, Xtestexsparse = (
    encoder.transform(split) for split in (Xtrain, Xtest, Xtestex)
)

# Multinomial logistic regression solved with L-BFGS; C=1000 is the inverse
# regularisation strength passed to scikit-learn.
logistic = linear_model.LogisticRegression(
    C=1000,
    solver='lbfgs',
    multi_class='multinomial',
)
In [48]:
# Fit the logistic regression on the one-hot encoded training features.
logistic.fit(Xtrainsparse, ytrain)
Out[48]:
In [49]:
# Mean accuracy of the fitted model on the data it was trained on.
logistic.score(Xtrainsparse, ytrain)
Out[49]:
In [50]:
# Mean accuracy on the full test split.
logistic.score(Xtestsparse, ytest)
Out[50]:
In [51]:
# Mean accuracy on the test rows whose label also occurs in the training labels.
logistic.score(Xtestexsparse, ytestex)
Out[51]:
In [18]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest comparison on the same one-hot features: 10 trees, entropy
# split criterion. Hyper-parameters are passed by keyword because newer
# scikit-learn releases accept `criterion` only as a keyword argument, and
# random_state is fixed so the scores below are reproducible between runs.
RF = RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0)
RF.fit(Xtrainsparse, ytrain)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(RF.score(Xtrainsparse, ytrain))    # training data
print(RF.score(Xtestsparse, ytest))      # full test split
print(RF.score(Xtestexsparse, ytestex))  # test rows with labels seen in training
In [91]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
def plotNLLandConfidence(filename, dataset="test set"):
    """Plot histograms of per-example confidence and NLL read from a text file.

    Every non-blank line of ``filename`` must contain at least two
    tab-separated numbers: the first is used as the NLL value, the second as
    the probability assigned to the correct decision. A trailing tab before
    the line end is tolerated, blank lines are skipped, and the last line does
    not need a terminating newline.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    dataset : str, optional
        Name of the data split, used only in the x-axis label of the NLL
        histogram. Defaults to ``"test set"``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If one of the first two fields of a line is not a valid float.
    IndexError
        If a non-blank line has fewer than two fields.
    """
    nll, per = [], []
    with open(filename, 'r') as f:
        for line in f:
            # Strip only the line terminator and trailing tabs; slicing off a
            # fixed number of characters would eat a digit on a final line
            # that has no newline.
            row = line.rstrip('\r\n').rstrip('\t')
            if not row.strip():
                continue
            fields = row.split('\t')
            # Explicit float() calls instead of indexing a map() result, which
            # is a lazy iterator (not subscriptable) on Python 3.
            nll.append(float(fields[0]))
            per.append(float(fields[1]))

    fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    ax[0].hist(per)
    ax[0].set_xlabel("Probability assigned to the correct decision")
    ax[1].hist(nll)
    ax[1].set_xlabel("NLL of the parameters given each outcome in the " + dataset)
    # Lay out after the labels exist so they are included in the spacing.
    fig.tight_layout()
    plt.show()
In [92]:
# Confidence / NLL histograms for the records stored in smerror.txt.
plotNLLandConfidence("smerror.txt")
In [94]:
# Same histograms for smerror_train.txt (presumably the training-set
# counterpart, judging by the file name -- confirm).
# NOTE(review): the NLL axis label drawn by this call still reads
# "... in the test set".
plotNLLandConfidence("smerror_train.txt")