This notebook is very close to ModelSampler.ipynb, but it adds the k-Nearest Neighbors class probabilities as features. As shown below, it reaches about 68% prediction accuracy, which is actually below the accuracy of k-Nearest Neighbors alone; perhaps the combined model does not generalize well.
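Concretely, the model sampled below is a logistic function of a linear combination of five features (age, gps_height, the kNN probabilities for 'functional' and 'non functional', and a dry-quantity indicator). Note the sign convention in the code: the curve is

$$f(x;\beta) = \frac{1}{1 + e^{\theta}}, \qquad \theta = \beta_0 + \sum_{i=1}^{5} \beta_i x_i,$$

rather than the usual $1/(1+e^{-\theta})$, so a positive coefficient pushes predictions toward 0. The target is encoded as 1 for functional, 0.5 for functional-needs-repair, and 0 for non-functional.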
In [3]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
%matplotlib inline
In [101]:
def logprior(b, sigma2):
    # flat prior on the betas, 1/sigma^2 (Jeffreys-style) prior on sigma^2
    return -np.log(sigma2)

def logistic(x, b):
    # note: as written this is 1/(1+exp(theta)), so large positive theta -> probability near 0
    theta = linear(x, b)
    return 1.0/(1 + np.exp(theta))

def linear(x, b):
    # x has shape (nFeat, nSamples); b[0] is the intercept
    nFeat = x.shape[0]
    y = np.zeros(x.shape[1])
    for i in range(nFeat):
        y += x[i]*b[i+1]
    return b[0] + y
    #return b[0] + x[0]*b[1] + x[1]*b[2] + x[2]*b[3] + x[3]*b[4]

def loglikelihood(b, sigma2, x, y, model):
    # Gaussian-style log-likelihood with constants dropped;
    # x has shape (nFeat, nSamples), so the number of observations is x.shape[1]
    n = x.shape[1]
    return -n*np.log(sigma2) - 0.5*((y - model(x, b))**2).sum()/sigma2

def logpost(b, sigma2, x, y, model):
    return logprior(b, sigma2) + loglikelihood(b, sigma2, x, y, model)
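For reference, the sampler targets this unnormalized log-posterior (a $1/\sigma^2$ prior on the variance, flat priors on the betas, and a Gaussian-style likelihood with constants dropped):

$$\log p(\beta, \sigma^2 \mid y) \;=\; -\log \sigma^2 \;-\; n \log \sigma^2 \;-\; \frac{1}{2\sigma^2} \sum_{i=1}^{n} \bigl(y_i - f(x_i;\beta)\bigr)^2 \;+\; \text{const},$$

where $n$ is the number of observations and $f$ is the model passed in (logistic here).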
In [48]:
# Metropolis MCMC with a block (all-parameters-at-once) random-walk proposal
def mcmc(b_init, sig2, x, y, N, burnin, be, sig2e, model):
    # burnin is accepted for interface symmetry but not applied here;
    # discard the first `burnin` rows of the returned chains yourself
    nFeat = len(b_init) - 1
    B = np.zeros((N, nFeat + 1))
    Sig2 = np.zeros(N)
    b_prev = b_init
    sig2_prev = sig2
    count = 0
    r = np.random.random(N)
    for i in range(N):
        b_star = np.random.normal(b_prev, be)
        sig2_star = abs(np.random.normal(sig2_prev, sig2e))  # reflect to keep sigma^2 positive
        # Metropolis accept/reject in log space: accept if log(u) < log posterior ratio
        p = logpost(b_star, sig2_star, x, y, model) - logpost(b_prev, sig2_prev, x, y, model)
        if np.log(r[i]) < p:
            b_prev = b_star
            sig2_prev = sig2_star
            count += 1
        B[i] = b_prev
        Sig2[i] = sig2_prev
    print("The acceptance rate is " + str(float(count)/N) + ".")
    return B, Sig2
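Before running on the waterpump data, the sampler can be sanity-checked on a small synthetic regression problem; this is an illustrative sketch (the toy data and step sizes are made up), using the linear model:
In [ ]:
# illustrative toy run on synthetic data (not part of the waterpump pipeline)
xToy = np.random.randn(2, 200)                       # 2 features, 200 points
yToy = 1.0 + 2.0*xToy[0] - 1.0*xToy[1] + 0.3*np.random.randn(200)
Btoy, Sig2toy = mcmc([0.0, 0.0, 0.0], 1.0, xToy, yToy, 5000, 0, [0.05, 0.05, 0.05], 0.05, linear)
For a random-walk Metropolis sampler, an acceptance rate roughly in the 0.2-0.4 range is a common tuning target; the proposal scales be and sig2e are the knobs to adjust.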
In [115]:
def mcmcComp(b_init, sig2, x, y, N, burnin, be, sig2e, model):
    # MCMC with componentwise updating instead of block updating;
    # each beta gets its own proposal scale be[j]. burnin is not applied here.
    nParam = len(b_init)
    B = np.zeros((N, nParam))
    Sig2 = np.zeros(N)
    b_prev = b_init
    sig2_prev = sig2
    count = 0
    r = np.random.random((N, nParam + 1))
    for i in range(N):
        # update each beta parameter in turn
        for j in range(nParam):
            b_star = np.copy(b_prev)
            b_star[j] = np.random.normal(b_prev[j], be[j])
            p = logpost(b_star, sig2_prev, x, y, model) - logpost(b_prev, sig2_prev, x, y, model)
            if np.log(r[i, j]) < p:
                b_prev = b_star
                count += 1
        # update sig2
        sig2_star = abs(np.random.normal(sig2_prev, sig2e))
        p = logpost(b_prev, sig2_star, x, y, model) - logpost(b_prev, sig2_prev, x, y, model)
        if np.log(r[i, -1]) < p:
            sig2_prev = sig2_star
            count += 1
        B[i] = b_prev
        Sig2[i] = sig2_prev
    print("The acceptance rate is " + str(float(count)/(N*(nParam + 1))) + ".")
    return B, Sig2
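Componentwise updating proposes one parameter at a time, so each beta can get its own step size be[j]; this usually improves the acceptance rate relative to a block proposal, at the cost of nParam+1 accept/reject steps (and correspondingly more log-posterior evaluations) per iteration.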
In [6]:
def import_data():
    # import data and labels
    train_data = pd.read_csv('Waterpump-training-values.csv')
    train_labels = pd.read_csv('Waterpump-training-labels.csv')
    # random 90/10 train/test split for cross-validation
    train_idx = np.random.uniform(0, 1, len(train_data)) <= 0.9
    train = train_data[train_idx]
    trainLabels = train_labels[train_idx]
    test = train_data[~train_idx]
    testLabels = train_labels[~train_idx]
    return train, trainLabels, test, testLabels
In [93]:
def trainKNN(train, trainLabels):
    # trains a kNN classifier on longitude/latitude
    features = ['longitude', 'latitude']
    trainLoc = train[features]
    hasLocIdx = trainLoc['longitude'] > 1   # longitude near 0 marks missing location data
    trainLoc = trainLoc[hasLocIdx]          # remove rows with empty location data
    trainLabelsLoc = trainLabels[hasLocIdx] # keep only the labels for rows with location data
    # train kNN classifier
    nNeighbors = 60
    clf = KNeighborsClassifier(n_neighbors=nNeighbors, weights='distance', algorithm='auto')
    clf.fit(trainLoc[features], trainLabelsLoc['status_group'])
    return clf

def addKNNCol(data, clf):
    # adds the trained classifier's per-class probabilities as new columns (in place)
    features = ['longitude', 'latitude']
    probaPredict = clf.predict_proba(data[features])
    data['knnFunctional'] = probaPredict[:, 0]
    data['knnNeedsRepair'] = probaPredict[:, 1]
    data['knnNonFunctional'] = probaPredict[:, 2]
    return data

def processAllData(train, trainLabels, test, testLabels):
    clf = trainKNN(train, trainLabels)
    addKNNCol(train, clf)
    addKNNCol(test, clf)
    train, nFeatures = processData(train)
    test, _ = processData(test)
    trainLabelsVect = pd.get_dummies(trainLabels['status_group'])
    # encode status as 1 (functional), 0.5 (needs repair), 0 (non functional)
    trainLabelsVect['functionality'] = trainLabelsVect['functional'] + 0.5*trainLabelsVect['functional needs repair']
    return train, trainLabelsVect, test, testLabels, nFeatures

def processData(data):
    features = ['age', 'gps_height', 'knnFunctional', 'knnNonFunctional', 'dry']
    nFeatures = len(features)
    data['age'] = 2015 - data['construction_year']
    data['dry'] = data['quantity'] == 'dry'
    # return shape (nFeatures, nSamples) so linear() can iterate over feature rows
    return np.transpose(data[features].values), nFeatures
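One detail addKNNCol relies on: scikit-learn orders the columns of predict_proba by clf.classes_, which sorts the string labels alphabetically, so columns 0, 1, 2 are 'functional', 'functional needs repair', and 'non functional', matching the three columns added above. A quick check (assuming the data below has been loaded):
In [ ]:
clf = trainKNN(train, trainLabels)  # refit only to inspect the class order
print(clf.classes_)                 # expect ['functional' 'functional needs repair' 'non functional']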
In [94]:
train, trainLabels, test, testLabels = import_data()
train, trainLabels, test, testLabels, nFeatures = processAllData(train, trainLabels, test, testLabels)
In [95]:
numBeta = nFeatures + 1  # one more for the intercept term
In [105]:
model = logistic
b_init = [0.5, 0, 0, -4, 0.5, 0]
be = [0.05, 0.001, 0.001, 0.1, 0.01, 0.1]
B, sig2 = mcmc(b_init, 10, train, trainLabels['functionality'], 10000, 0, be, 0.1, model)
In [106]:
plt.plot(B[:,0], label='b0')
plt.plot(B[:,1], label='b1')
plt.plot(B[:,2], label='b2')
plt.plot(B[:,3], label='b3')
plt.plot(B[:,4], label='b4')
plt.plot(B[:,5], label='b5')
plt.legend()
Out[106]: [trace plots of the six beta chains]
In [107]:
plt.plot(sig2)
Out[107]: [trace plot of sig2]
In [108]:
b_final = B[-1, :]  # use the final draw as the point estimate
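Taking the last draw is a quick point estimate; averaging the post-burn-in samples is usually steadier. A sketch, with a made-up burn-in of 2000 iterations:
In [ ]:
# alternative point estimate: posterior mean after discarding burn-in
b_mean = B[2000:].mean(axis=0)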
In [86]:
b_final
Out[86]: [array of the final sampled coefficients]
In [109]:
# predictions: continuous values; converted to labels in the next cell
yPredict = model(test, b_final)
In [110]:
n = len(testLabels)
correct = 0.0
upBound = 0.6
lowBound = 0.4
for yPred, label in zip(yPredict, testLabels['status_group']):
    if yPred >= upBound and label == 'functional':
        correct += 1
    elif yPred <= lowBound and label == 'non functional':
        correct += 1
    elif lowBound < yPred < upBound and label == 'functional needs repair':
        correct += 1
print(correct/n)
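The same thresholding can be written in vectorized form, which makes the three-way mapping explicit; an equivalent sketch:
In [ ]:
pred = np.where(yPredict >= upBound, 'functional',
                np.where(yPredict <= lowBound, 'non functional',
                         'functional needs repair'))
print((pred == testLabels['status_group'].values).mean())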
In [111]:
plt.hist(yPredict)
Out[111]: [histogram of yPredict]
In [121]:
# same model, now sampled with componentwise updating
model = logistic
b_init = [2.0, 0, 0, -4, 1, 2]
be = [0.1, 0.001, 0.001, 0.1, 0.01, 0.1]
B, sig2 = mcmcComp(b_init, 10, train, trainLabels['functionality'], 10000, 0, be, 0.1, model)
In [122]:
plt.plot(B[:,0], label='b0')
plt.plot(B[:,1], label='b1')
plt.plot(B[:,2], label='b2')
plt.plot(B[:,3], label='b3')
plt.plot(B[:,4], label='b4')
plt.plot(B[:,5], label='b5')
plt.legend()
Out[122]: [trace plots of the six beta chains]
In [123]:
b_final = B[-1, :]
# predictions: continuous values; convert to labels with the same thresholds as before
yPredict = model(test, b_final)
n = len(testLabels)
correct = 0.0
upBound = 0.6
lowBound = 0.4
for yPred, label in zip(yPredict, testLabels['status_group']):
    if yPred >= upBound and label == 'functional':
        correct += 1
    elif yPred <= lowBound and label == 'non functional':
        correct += 1
    elif lowBound < yPred < upBound and label == 'functional needs repair':
        correct += 1
print(correct/n)