In [1]:
import pandas as pd
import pylab as pl
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_csv("https://s3.amazonaws.com/demo-datasets/wine.csv")
 
test_idx = np.random.uniform(0, 1, len(df)) <= 0.3
train = df[test_idx==True]
test = df[test_idx==False]
 
features = ['density', 'sulphates', 'residual_sugar']
 
results = []
for n in range(1, 51, 2):
    clf = KNeighborsClassifier(n_neighbors=n)
    clf.fit(train[features], train['high_quality'])
    preds = clf.predict(test[features])
    accuracy = np.where(preds==test['high_quality'], 1, 0).sum() / float(len(test))
    print("Neighbors: %d, Accuracy: %3f" % (n, accuracy))
 
    results.append([n, accuracy])
 
results = pd.DataFrame(results, columns=["n", "accuracy"])
 
pl.plot(results.n, results.accuracy)
pl.title("Accuracy with Increasing K")
pl.show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-163e3a07e8e6> in <module>()
      1 df = pd.read_csv("https://s3.amazonaws.com/demo-datasets/wine.csv")
      2 
----> 3 test_idx = np.random.uniform(0, 1, len(df)) <= 0.3
      4 train = df[test_idx==True]
      5 test = df[test_idx==False]

NameError: name 'np' is not defined

In [ ]:
import numpy as np
df = pd.read_csv("https://s3.amazonaws.com/demo-datasets/wine.csv")
 
test_idx = np.random.uniform(0, 1, len(df)) <= 0.3
train = df[test_idx==True]
test = df[test_idx==False]
 
features = ['density', 'sulphates', 'residual_sugar']
 
results = []
for n in range(1, 51, 2):
    clf = KNeighborsClassifier(n_neighbors=n)
    clf.fit(train[features], train['high_quality'])
    preds = clf.predict(test[features])
    accuracy = np.where(preds==test['high_quality'], 1, 0).sum() / float(len(test))
    print("Neighbors: %d, Accuracy: %3f" % (n, accuracy))
 
    results.append([n, accuracy])
 
results = pd.DataFrame(results, columns=["n", "accuracy"])
 
pl.plot(results.n, results.accuracy)
pl.title("Accuracy with Increasing K")
pl.show()


Neighbors: 1, Accuracy: 0.755211
Neighbors: 3, Accuracy: 0.769845
Neighbors: 5, Accuracy: 0.772284
Neighbors: 7, Accuracy: 0.777384
Neighbors: 9, Accuracy: 0.784922
Neighbors: 11, Accuracy: 0.788248
Neighbors: 13, Accuracy: 0.794457
Neighbors: 15, Accuracy: 0.795565
Neighbors: 17, Accuracy: 0.795122
Neighbors: 19, Accuracy: 0.796674
Neighbors: 21, Accuracy: 0.796896
Neighbors: 23, Accuracy: 0.799335
Neighbors: 25, Accuracy: 0.800443
Neighbors: 27, Accuracy: 0.800887
Neighbors: 29, Accuracy: 0.801330
Neighbors: 31, Accuracy: 0.801774
Neighbors: 33, Accuracy: 0.801774
Neighbors: 35, Accuracy: 0.801996
Neighbors: 37, Accuracy: 0.802439
Neighbors: 39, Accuracy: 0.802217
Neighbors: 41, Accuracy: 0.802217
Neighbors: 43, Accuracy: 0.802217
Neighbors: 45, Accuracy: 0.802217
Neighbors: 47, Accuracy: 0.802217
Neighbors: 49, Accuracy: 0.802217

In [ ]:
%matplotlib inline

In [ ]:
df = pd.read_csv("https://s3.amazonaws.com/demo-datasets/wine.csv")
 
test_idx = np.random.uniform(0, 1, len(df)) <= 0.3
train = df[test_idx==True]
test = df[test_idx==False]

features = ['density', 'sulphates', 'residual_sugar']
 
results = []
for n in range(1, 51, 2):
    clf = KNeighborsClassifier(n_neighbors=n)
    clf.fit(train[features], train['high_quality'])
    preds = clf.predict(test[features])
    accuracy = np.where(preds==test['high_quality'], 1, 0).sum() / float(len(test))
    print("Neighbors: %d, Accuracy: %3f" % (n, accuracy))
 
    results.append([n, accuracy])
 
results = pd.DataFrame(results, columns=["n", "accuracy"])
 
pl.plot(results.n, results.accuracy)
pl.title("Accuracy with Increasing K")
pl.show()

In [ ]: