k-nearest neighbors (k-NN)

Exploring the k-NN algorithm on the dataset, using just the latitude and longitude data. This algorithm decides the classification of a point using its k nearest neighbors, either taking the majority classification of those neighbors or weighting their classifications by their distance.



In [1]:

    
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import matplotlib.pylab as plt 
%matplotlib inline



In [2]:

    
# Input CSV paths, resolved relative to the notebook's working directory.
train_file = "WaterPump-training-values.csv"
train_labels = "WaterPump-training-labels.csv"
test_file = "WaterPump-test-values.csv"

# Load the training feature table with the 'id' column as the row index.
# NOTE(review): per the pandas docs, parse_dates=True only attempts to parse the
# index as dates; it does not convert any other column.
df = pd.read_csv(
    train_file,
    index_col='id',
    parse_dates=True,
)



In [3]:

    
# Load the target labels, indexed by the same 'id' column used for the feature
# table so the two frames can be matched by index.
labels = pd.read_csv(
    train_labels,
    index_col='id',
)



In [4]:

    
# Restrict the model inputs to the two geographic coordinates.
features = ['longitude', 'latitude']

# Boolean mask of rows with a usable location.
# NOTE(review): the `> 1` threshold presumably targets a placeholder value used
# for missing longitude (e.g. 0) — confirm against the data. NaN also fails it.
hasLoc = df['longitude'] > 1

# Derive dfCut straight from df so re-running this cell always yields the same frame.
dfCut = df.loc[hasLoc, features]

# Keep only the labels for the surviving rows, selected explicitly by id.
# Selecting with dfCut.index (instead of a boolean mask) guarantees that labels
# ends up in the same row order as dfCut, which the positional train/test mask
# applied later depends on, and raises a KeyError if any row has no label.
labels = labels.loc[dfCut.index]



In [7]:

    
# Hold-out split: roughly 90% of the rows go to training, the rest to evaluation.
# (This is a single random split, not k-fold cross-validation.)
SPLIT_SEED = 42        # fixed seed so Restart & Run All reproduces the same split
TRAIN_FRACTION = 0.9

# A private RandomState keeps the split reproducible without touching the
# global numpy random state.
split_rng = np.random.RandomState(SPLIT_SEED)
train_mask = split_rng.uniform(0, 1, len(dfCut)) <= TRAIN_FRACTION
test_idx = train_mask  # legacy name kept for compatibility; True marks TRAINING rows

train = dfCut[train_mask]
test = dfCut[~train_mask]

# Pick labels by id rather than by position so features and labels stay paired
# even if the two frames are not stored in the same row order.
trainLabels = labels.loc[train.index]
testLabels = labels.loc[test.index]



In [8]:

    
# Sweep odd values of k from 21 to 89 and record the hold-out accuracy of a
# distance-weighted k-NN classifier for each.
# Note: after the loop, `clf` and `preds` belong to the LAST k tried (89),
# not to the best-scoring k.
y_train = trainLabels['status_group']
y_test = testLabels['status_group']

records = []
for n in range(21, 91, 2):
    clf = KNeighborsClassifier(n_neighbors=n, weights='distance', algorithm='auto')
    clf.fit(train[features], y_train)
    preds = clf.predict(test[features])
    # Fraction of hold-out rows whose predicted label matches the true label.
    accuracy = (preds == y_test).mean()
    records.append([n, accuracy])

results = pd.DataFrame(records, columns=["n", "accuracy"])

plt.plot(results.n, results.accuracy)
plt.title("Accuracy with Increasing K")
plt.xlabel("k (number of neighbors)")
plt.ylabel("Hold-out accuracy")
plt.show()



In [31]:

    
# Per-class probability estimates for the hold-out rows: one row per sample,
# one column per class (column order follows clf.classes_).
# `clf` is whichever model the k sweep fitted last, not necessarily the best k.
# The call form of print behaves the same under Python 2 and Python 3, whereas
# the statement form is a SyntaxError on Python 3.
print(clf.predict_proba(test[features]))









    



[[ 0.90090996  0.02149957  0.07759046]
 [ 0.64791557  0.01630249  0.33578194]
 [ 0.53760758  0.08828581  0.37410662]
 ..., 
 [ 0.51515503  0.29372389  0.19112107]
 [ 0.53723527  0.04761919  0.41514554]
 [ 0.60515664  0.04413806  0.3507053 ]]



In [19]:

    
# Inspect the predicted labels for the hold-out rows. `preds` is reassigned on
# every iteration of the k sweep, so this shows the predictions of whichever
# model was fitted last (n_neighbors=89 if the sweep ran to completion).
preds









    Out[19]:





array(['functional', 'functional', 'functional', ..., 'functional',
       'non functional', 'non functional'], dtype=object)



In [20]:

    
# Row count of the full training-values frame. `df` is not filtered in the
# cells shown here, so this includes rows excluded from dfCut by the location filter.
len(df)









    Out[20]:





59400



In [ ]: