In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

# load data
# The result will be two lists of tuples, processed_training_data and processed_testing_data.
# Each tuple in the list will have only two parts: data and label (in that order)
# The data is a list of numerical attributes stored as floats.
# The label is a single float that represents the label.
with open("iris_train.data", 'r') as raw_training_data:
    processed_training_data = [[float(x) for x in str.split(raw_datum, ",")] for raw_datum in raw_training_data]
    processed_training_data = [(datapoint[0:-2], datapoint[-1]) for datapoint in processed_training_data]

with open("iris_test.data", 'r') as raw_test_data:
    processed_test_data = [[float(x) for x in str.split(raw_datum, ",")] for raw_datum in raw_test_data]
    processed_test_data = [(datapoint[0:-2], datapoint[-1]) for datapoint in processed_test_data]

In [3]:
def distance(vec1, vec2):
    # calculate the distance between two vectors, or data points. 
    pass;

def majority_vote(labels):
    # labels is expected to be a list of labels (in this case floats) from nearest to furtherest.
    # Figure out which label is most represented and what to do if there are ties. 
    # Hint: import Counter
    pass;

def knn_classify(k, labeled_points, new_point):
    # k is an int
    # labeled_points will most likely be the training_data, whose format is described above.
    # new_point will be a list of numerical attributes that represents a new, unlabelled data point.
    pass;

In [4]:
def run_test(test_data_entry, k=1):
    predicted_label = knn_classify(k, processed_training_data, test_data_entry[0])
    given_label = test_data_entry[1]
    return (1 if given_label == predicted_label else 0, predicted_label, given_label);

full_test_results = [run_test(datum) for datum in processed_test_data]
test_results = [result[0] for result in full_test_results]
accuracy = float(sum(test_results)) / len(test_results)
print """Accuracy = %0.2f"""%(accuracy)


Accuracy = 0.00

In [5]:
K_RANGE = range(1, 120)
accuracy_for_k = range(0, len(K_RANGE)) # Only as placeholder
for idx, k in enumerate(K_RANGE):
    test_results = [run_test(datum, k)[0] for datum in processed_test_data]
    accuracy_for_k[idx] = float(sum(test_results)) / len(test_results)
    
plt.plot(K_RANGE, accuracy_for_k)


Out[5]:
[<matplotlib.lines.Line2D at 0x7b4df28>]