This notebook is very similar to ModelSampler.ipynb, but it adds the k-Nearest Neighbors class probabilities as extra features: the model below is a logistic curve applied to a linear combination of age, gps_height, two of the kNN class probabilities, and a 'dry' indicator, with the parameters sampled by a Metropolis algorithm. We see below that it reaches roughly 65% prediction accuracy with block updates and roughly 68% with componentwise updates, which is still below the prediction accuracy of k-Nearest Neighbors alone. Perhaps our model does not generalize well.


In [3]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
%matplotlib inline

In [101]:
def logprior(b, sigma2):
    #flat prior on the coefficients b, 1/sigma2-style prior on the variance
    return -np.log(sigma2)

def logistic(x, b):
    #logistic model; note the sign convention 1/(1+exp(theta)), so the fitted
    #coefficients have the opposite sign of the usual 1/(1+exp(-theta)) parameterization
    theta = linear(x, b)
    return 1.0/(1 + np.exp(theta))

def linear(x, b):
    #linear predictor; x has shape (nFeatures, nSamples) and b has length nFeatures+1
    nFeat = x.shape[0]
    y = np.zeros(x.shape[1])
    for i in xrange(nFeat):
        y += x[i]*b[i+1]
    return b[0] + y

def loglikelihood(b, sigma2, x, y, model):
    #Gaussian-style log-likelihood of the residuals (note n = x.shape[0], the number of feature rows)
    n = len(x)
    return -n*np.log(sigma2) - 0.5*((y - model(x, b))**2).sum()/sigma2

def logpost(b, sigma2, x, y, model):
    #unnormalized log-posterior = log-prior + log-likelihood
    return logprior(b, sigma2) + loglikelihood(b, sigma2, x, y, model)
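
As a quick sanity check (not part of the original notebook), the log-posterior can be evaluated on a tiny made-up dataset; the feature array is laid out as (nFeatures, nSamples) to match linear above, and all values here are purely illustrative.

#illustrative sanity check of logpost on made-up data (not an original cell)
xToy = np.array([[0.1, 0.5, 0.9],   #feature 1 across 3 samples
                 [1.0, 0.0, 1.0]])  #feature 2 across 3 samples
yToy = np.array([1.0, 0.5, 0.0])    #targets between 0 and 1
bToy = [0.0, -1.0, 0.5]             #intercept plus one weight per feature
print(logpost(bToy, 1.0, xToy, yToy, logistic))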

In [48]:
# mcmc algorithm: random-walk Metropolis with block updates of all parameters at once
def mcmc(b_init, sig2, x, y, N, burnin, be, sig2e, model):
    #note: the burnin argument is accepted but not applied; all N samples are returned
    nFeat = len(b_init) - 1
    
    B = np.zeros((N,nFeat + 1))
    Sig2 = np.zeros(N)
    
    b_prev = b_init
    sig2_prev = sig2
    count = 0
    r = np.random.random(N)
    
    for i in xrange(N):
        #propose all coefficients at once (Gaussian random walk with step sizes be)
        b_star = np.random.normal(b_prev, be)

        #propose sig2, reflecting negative values back to positive
        sig2_star = abs(np.random.normal(sig2_prev, sig2e))
        
        #log Metropolis acceptance ratio (symmetric proposals, so only the posterior terms appear)
        p = logpost(b_star, sig2_star, x, y, model) - logpost(b_prev, sig2_prev, x, y, model)
        if np.log(r[i]) < p:
            b_prev = b_star
            sig2_prev = sig2_star
            count += 1
                           
        B[i] = b_prev
        Sig2[i] = sig2_prev
    print "The acceptance rate is "+ str(float(count)/N)+"."
    return B, Sig2
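
As a short smoke test (again, not part of the original run), the sampler can be exercised on the toy data from the earlier sketch; the chain length and step sizes below are arbitrary.

#smoke-test run of mcmc on the toy data defined above (illustrative only)
bToyInit = [0.0, 0.0, 0.0]
beToy = [0.1, 0.1, 0.1]
BToy, Sig2Toy = mcmc(bToyInit, 1.0, xToy, yToy, 500, 0, beToy, 0.1, logistic)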

In [115]:
def mcmcComp(b_init, sig2, x, y, N, burnin, be, sig2e, model):
    #MCMC with componentwise updating instead of block updating
    #(as in mcmc above, the burnin argument is accepted but not applied)
    nParam = len(b_init)
    
    B = np.zeros((N,nParam))
    Sig2 = np.zeros(N)
    
    b_prev = b_init
    sig2_prev = sig2
    count = 0
    r = np.random.random((N,nParam+1))
    
    for i in xrange(N):
        #updating all the beta parameters
        for j in xrange(nParam):
            b_star = np.copy(b_prev)
            b_star[j] = np.random.normal(b_prev[j],be[j])
            p = logpost(b_star, sig2_prev, x, y, model) - logpost(b_prev, sig2_prev, x, y, model)
            if np.log(r[i,j]) < p:
                b_prev = b_star
                count += 1
        
        #updating sig2
        sig2_star = abs(np.random.normal(sig2_prev, sig2e))
        p = logpost(b_prev, sig2_star, x, y, model) - logpost(b_prev, sig2_prev, x, y, model)
        if np.log(r[i,-1]) < p:
            sig2_prev = sig2_star
            count += 1
        B[i] = b_prev
        Sig2[i] = sig2_prev
        
    print "The acceptance rate is "+ str(float(count)/(N*(nParam+1)))+"."
    return B, Sig2

In [6]:
def import_data():
    #import data and labels
    train_data = pd.read_csv('Waterpump-training-values.csv')
    train_labels = pd.read_csv('Waterpump-training-labels.csv')
    
    #randomly split into training and testing sets for cross-validation - roughly 90% into training
    train_idx = np.random.uniform(0, 1, len(train_data)) <= 0.9
    train = train_data[train_idx]
    trainLabels = train_labels[train_idx]
    test = train_data[~train_idx]
    testLabels = train_labels[~train_idx]
    
    return train, trainLabels, test, testLabels

In [93]:
def trainKNN(train, trainLabels):
    #trains classifier using longitude and latitude data
    features = ['longitude','latitude']
    trainLoc = train[features]
    hasLocIdx = trainLoc['longitude']>1
    trainLoc = trainLoc[hasLocIdx] #remove rows with empty location data
    trainLabelsLoc = trainLabels[hasLocIdx] #only keep labels corresponding to rows with non-empty location data
    
    #train kNN classifier
    nNeighbors = 60
    clf = KNeighborsClassifier(n_neighbors=nNeighbors,weights='distance',algorithm='auto')
    clf.fit(trainLoc[features], trainLabelsLoc['status_group'])
    
    return clf

def addKNNCol(data, clf):
    #uses the trained classifier to add class-probability columns;
    #predict_proba columns follow clf.classes_ (alphabetical: functional, functional needs repair, non functional)
    features = ['longitude','latitude']
    probaPredict = clf.predict_proba(data[features])
    data['knnFunctional'] = probaPredict[:,0]
    data['knnNeedsRepair'] = probaPredict[:,1]
    data['knnNonFunctional'] = probaPredict[:,2]
    return data
    
def processAllData(train, trainLabels, test, testLabels):
    clf = trainKNN(train, trainLabels)
    addKNNCol(train, clf)
    addKNNCol(test, clf)
    
    train, nFeatures = processData(train)
    test, _ = processData(test)
    
    #collapse the one-hot labels into a single score:
    #1 = functional, 0.5 = functional needs repair, 0 = non functional
    trainLabelsVect = pd.get_dummies(trainLabels['status_group'])
    trainLabelsVect['functionality'] = trainLabelsVect['functional'] + 0.5*trainLabelsVect['functional needs repair']
    
    return train, trainLabelsVect, test, testLabels, nFeatures

def processData(data):
    #select the model features and return them as a (nFeatures, nSamples) array
    features = ['age', 'gps_height', 'knnFunctional', 'knnNonFunctional', 'dry']
    nFeatures = len(features)
    data['age'] = 2015 - data['construction_year']
    data['dry'] = data['quantity'] == 'dry'
    return np.transpose(data[features].values), nFeatures

In [94]:
train, trainLabels, test, testLabels = import_data()
train, trainLabels, test, testLabels, nFeatures = processAllData(train, trainLabels, test, testLabels)
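
A quick sanity check on the processed data (not part of the original run): train should now be a (nFeatures, nSamples) array and the 'functionality' target should only take the values 1, 0.5, and 0.

#illustrative shape/target check (not an original cell)
print(train.shape)                             #expected: (5, number of training rows)
print(trainLabels['functionality'].unique())   #expected: 1.0, 0.5, 0.0 in some order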

In [95]:
numBeta = nFeatures + 1  #1 more for the constant

In [105]:
model = logistic
#parameter order: intercept, age, gps_height, knnFunctional, knnNonFunctional, dry
b_init = [0.5, 0, 0, -4, 0.5, 0]
#per-parameter proposal step sizes, in the same order
be = [0.05, 0.001, 0.001, 0.1, 0.01, 0.1]
B, sig2= mcmc(b_init, 10, train, trainLabels['functionality'], 10000, 0, be, 0.1, model)


The acceptance rate is 0.0431.

In [106]:
plt.plot(B[:,0], label='b0')
plt.plot(B[:,1], label='b1')
plt.plot(B[:,2], label='b2')
plt.plot(B[:,3], label='b3')
plt.plot(B[:,4], label='b4')
plt.plot(B[:,5], label='b5')
plt.legend()


Out[106]:
<matplotlib.legend.Legend at 0x22c29828>

In [107]:
plt.plot(sig2)


Out[107]:
[<matplotlib.lines.Line2D at 0x23289710>]

In [108]:
b_final = B[-1,:]

In [86]:
b_final


Out[86]:
array([  1.71770911e+00,   2.18640660e-04,  -4.22197508e-04,
        -6.64113238e+00,   1.10874746e+00])
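
Taking only the final draw of the chain is one option; a common alternative (not done in this notebook) is to discard a warm-up portion and summarize with the posterior mean of the remaining draws. A minimal sketch, with an arbitrary 2000-draw burn-in:

#hypothetical alternative summary: posterior mean after an arbitrary 2000-draw burn-in
b_mean = B[2000:].mean(axis=0)
sig2_mean = sig2[2000:].mean()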

In [109]:
#predictions - these are the continuous values, need to convert to labels
yPredict = model(test, b_final)

In [110]:
n = len(testLabels)
correct = 0.0
#cutoffs for mapping the continuous predictions onto the three classes
upBound = 0.6
lowBound = 0.4
for yPred, label in zip(yPredict, testLabels['status_group']):
    if yPred >= upBound and label=='functional':
        correct += 1
    elif yPred <= lowBound and label == 'non functional':
        correct += 1
    elif yPred > lowBound and yPred < upBound and label == 'functional needs repair':
        correct += 1
print correct/n


0.651818485152

In [111]:
plt.hist(yPredict)


Out[111]:
(array([  387.,   346.,   297.,   259.,   275.,   314.,   507.,   527.,
          761.,  2321.]),
 array([ 0.00702402,  0.10604868,  0.20507335,  0.30409801,  0.40312268,
         0.50214734,  0.601172  ,  0.70019667,  0.79922133,  0.89824599,
         0.99727066]),
 <a list of 10 Patch objects>)
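
The histogram shows most predictions piling up near 1; to see how that translates into predicted classes under the cutoffs above, one can count the predictions in each band (a small sketch, not in the original):

#counts of predictions in each decision band (illustrative only)
print((yPredict <= lowBound).sum())                          #predicted non functional
print(((yPredict > lowBound) & (yPredict < upBound)).sum())  #predicted functional needs repair
print((yPredict >= upBound).sum())                           #predicted functional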

In [121]:
#using componentwise updating
model = logistic
b_init = [2.0, 0, 0, -4, 1, 2]
be = [0.1, 0.001, 0.001, 0.1, 0.01, 0.1]
B, sig2= mcmcComp(b_init, 10, train, trainLabels['functionality'], 10000, 0, be, 0.1, model)


The acceptance rate is 0.769914285714.

In [122]:
plt.plot(B[:,0], label='b0')
plt.plot(B[:,1], label='b1')
plt.plot(B[:,2], label='b2')
plt.plot(B[:,3], label='b3')
plt.plot(B[:,4], label='b4')
plt.plot(B[:,5], label='b5')
plt.legend()


Out[122]:
<matplotlib.legend.Legend at 0x1f5db240>

In [123]:
b_final = B[-1,:]
#predictions - these are the continuous values, need to convert to labels
yPredict = model(test, b_final)
n = len(testLabels)
correct = 0.0
upBound = 0.6
lowBound = 0.4
for yPred, label in zip(yPredict, testLabels['status_group']):
    if yPred >= upBound and label=='functional':
        correct += 1
    elif yPred <= lowBound and label == 'non functional':
        correct += 1
    elif yPred > lowBound and yPred < upBound and label == 'functional needs repair':
        correct += 1
print correct/n


0.680680680681
