Sklearn on the full dataset



In [20]:

    
import h5py
import numpy
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder



In [12]:

    
# Load the raw feature matrices for the train/dev/test splits.
# `Dataset.value` is deprecated and was removed in h5py 3.0; indexing with `[()]`
# reads the whole dataset into a numpy array on both old and new h5py versions.
# The `ytrain`/`ytest` datasets of this file are intentionally not read here:
# both names are re-assigned from 'data/chorales_sm.hdf5' below before any use.
with h5py.File('data/chorales.hdf5', "r", libver='latest') as f:
    Xtrain = f['Xtrain'][()]
    Xdev = f['Xdev'][()]
    ydev = f['ydev'][()]
    Xtest = f['Xtest'][()]

# Keep only the first 10 feature columns of every split.
Xtrain, Xdev, Xtest = Xtrain[:, :10], Xdev[:, :10], Xtest[:, :10]

# The dev rows are folded into the training matrix; Xall (train + dev + test)
# stacks every row of the three splits.
Xtrain = numpy.vstack((Xtrain, Xdev))
Xall = numpy.vstack((Xtrain, Xtest))

# Labels come from a second file.
# NOTE(review): Xtrain now contains the dev rows, so 'ytrainfeat' presumably holds
# train+dev labels in the same order -- confirm against the script that wrote it.
with h5py.File('data/chorales_sm.hdf5', "r", libver='latest') as f:
    ytrain = f['ytrainfeat'][()]
    ytest = f['ytestfeat'][()]
    yall = f['yallfeat'][()]

Xtrain.shape, Xall.shape









    Out[12]:





((18788, 10), (20773, 10))



In [13]:

    
# Build the restricted test set (Xtestex, ytestex): only the test examples whose
# label also occurs somewhere in the training labels.
assert len(Xtest) == len(ytest)

# One vectorised membership test instead of scanning `ytrain` once per test
# example. `in1d` flattens its inputs and returns a 1-D boolean mask with one
# entry per test label.
# NOTE(review): assumes one label per example (1-D or single-column y) -- confirm.
seen_in_train = numpy.in1d(ytest, ytrain)

# Boolean-mask indexing yields plain ndarrays (numpy.matrix is discouraged by
# NumPy and rejected by recent scikit-learn) and keeps the (0, n_features) shape
# when no test label was seen in training.
Xtestex = Xtest[seen_in_train]
ytestex = ytest[seen_in_train]



In [16]:

    
# Fit the one-hot encoder on every row (train + dev + test) so that all three
# transforms below share a single output column layout.
encoder = OneHotEncoder().fit(Xall)
Xtrainsparse, Xtestsparse, Xtestexsparse = map(
    encoder.transform, (Xtrain, Xtest, Xtestex))

# Multinomial logistic regression solved with L-BFGS; in scikit-learn C is the
# inverse regularisation strength, so C=1000 means weak regularisation.
logistic = linear_model.LogisticRegression(
    solver='lbfgs', multi_class='multinomial', C=1000)



In [48]:

    
# Train the logistic regression on the one-hot encoded training features.
logistic.fit(Xtrainsparse, ytrain)









    Out[48]:





LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0)



In [49]:

    
# Mean accuracy on the data the model was fitted on (training accuracy).
logistic.score(Xtrainsparse, ytrain)









    Out[49]:





0.59724292101341281



In [50]:

    
# Mean accuracy on the complete test set; this may include examples whose label
# never occurs in `ytrain`, which the model cannot predict correctly.
logistic.score(Xtestsparse, ytest)









    Out[50]:





0.2554156171284635



In [51]:

    
# Mean accuracy on the restricted test set (only examples whose label occurs in
# the training labels).
logistic.score(Xtestexsparse, ytestex)









    Out[51]:





0.27011188066062869



In [18]:

    
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline: 10 trees split on information gain (entropy).
# Hyper-parameters are passed by keyword because current scikit-learn releases
# make `criterion` keyword-only. `random_state` is fixed so that re-running the
# notebook reproduces the same forest and therefore the same scores.
RF = RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0)
RF.fit(Xtrainsparse, ytrain)

# Mean accuracy on: training data, the full test set, and the test subset whose
# labels occur in training. Single-argument print(...) behaves identically under
# Python 2 and Python 3.
print(RF.score(Xtrainsparse, ytrain))
print(RF.score(Xtestsparse, ytest))
print(RF.score(Xtestexsparse, ytestex))









    



0.89360229934
0.291687657431
0.308470964305



In [91]:

    
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

def plotNLLandConfidence(filename):
    """Plot histograms of the two numeric columns stored in a tab-separated file.

    Each non-blank line of ``filename`` must hold at least two tab-separated
    numbers: column 0 is treated as a negative log-likelihood and column 1 as
    the probability assigned to the correct decision. Trailing tabs, a missing
    final newline and blank lines are tolerated.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    ValueError
        If one of the first two fields of a line is not a valid float.
    IndexError
        If a non-blank line has fewer than two fields.
    """
    nll, per = [], []
    with open(filename, 'r') as f:
        for line in f:
            # rstrip() drops the newline and any trailing tab without eating a
            # data character when the last line has no terminating newline.
            stripped = line.rstrip()
            if not stripped:
                continue
            fields = stripped.split('\t')
            # Explicit float() calls build real lists, so this also works on
            # Python 3 where map() returns a non-subscriptable iterator.
            nll.append(float(fields[0]))
            per.append(float(fields[1]))

    fig, ax = plt.subplots(1, 2, figsize=(10,5))
    ax[0].hist(per)
    ax[0].set_xlabel("Probability assigned to the correct decision")
    ax[1].hist(nll)
    ax[1].set_xlabel("NLL of the parameters given each outcome in the test set")
    # Run the layout pass after the labels exist so they are included in it.
    fig.tight_layout()
    plt.show()



In [92]:

    
# Histograms for the values stored in smerror.txt.
# NOTE(review): presumably per-example results on the test set -- confirm which
# run produced this file.
plotNLLandConfidence("smerror.txt")



In [94]:

    
# Same histograms for smerror_train.txt (presumably the training-set counterpart
# -- confirm). NOTE(review): the second x-axis label inside plotNLLandConfidence
# hard-codes "test set", so it is misleading if this file holds training data.
plotNLLandConfidence("smerror_train.txt")