In [12]:
import math
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline
# Set to False to run against the full shuttle data set instead of the toy subset.
USE_TOY_DATA = True
TRAINING_DATA_FILE = """shuttle_%strain.data"""%("toy_" if USE_TOY_DATA else "")
TESTING_DATA_FILE = """shuttle_%stest.data"""%("toy_" if USE_TOY_DATA else "")


def _load_labeled_points(path):
    """Parse a whitespace-separated numeric file into (features, label) pairs.

    Every non-blank line is converted to a list of floats. The label is the
    last value on the line; the features are all values except the last two.

    Blank lines are skipped and runs of whitespace are treated as a single
    separator, so a trailing newline or a doubled space no longer raises
    ``ValueError`` from ``float("")``.

    NOTE(review): the feature slice ``[0:-2]`` discards the second-to-last
    column as well as the label. It is preserved here unchanged; if that
    column is a real attribute this is an off-by-one and should be ``[0:-1]``
    -- confirm against the data file format.

    Args:
        path: path of the data file to read.

    Returns:
        A list of ``(features, label)`` tuples, where ``features`` is a list
        of floats and ``label`` is a float.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a field on a non-blank line is not a valid float.
    """
    labeled_points = []
    with open(path, 'r') as raw_data:
        for raw_datum in raw_data:
            fields = raw_datum.split()
            if not fields:
                # Tolerate blank lines (commonly a trailing one at end of file).
                continue
            datapoint = [float(field) for field in fields]
            labeled_points.append((datapoint[0:-2], datapoint[-1]))
    return labeled_points


# load data
processed_training_data = _load_labeled_points(TRAINING_DATA_FILE)
processed_test_data = _load_labeled_points(TESTING_DATA_FILE)
In [13]:
def distance(vec1, vec2):
    """Return the Euclidean distance between two numeric vectors.

    Args:
        vec1: sequence of numbers.
        vec2: sequence of numbers with the same length as ``vec1``.

    Returns:
        The Euclidean (L2) distance as a float.

    Raises:
        ValueError: if the vectors differ in length (``zip`` would otherwise
            silently ignore the extra components).
    """
    if len(vec1) != len(vec2):
        raise ValueError(
            "vectors must have the same length (got %d and %d)"
            % (len(vec1), len(vec2))
        )
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(vec1, vec2)))


def majority_vote(labels):
    """Return the most common label, preferring nearer neighbours on ties.

    Args:
        labels: non-empty list of labels ordered from nearest to furthest.

    Returns:
        The label with the highest count. When several labels share the
        highest count, the furthest label is discarded and the vote is
        repeated, so the result is biased towards the nearest neighbours.

    Raises:
        ValueError: if ``labels`` is empty.
    """
    if not labels:
        raise ValueError("majority_vote requires at least one label")
    remaining = list(labels)  # copy so the caller's list is never mutated
    while True:
        counts = Counter(remaining)
        top_count = max(counts.values())
        winners = [label for label, count in counts.items() if count == top_count]
        if len(winners) == 1:
            return winners[0]
        # Tie: drop the furthest neighbour and vote again. This terminates
        # because a single remaining label always has a unique winner.
        remaining.pop()


def knn_classify(k, labeled_points, new_point):
    """Classify ``new_point`` by a vote among its ``k`` nearest labelled points.

    Args:
        k: number of neighbours to consult; must be an int >= 1. If ``k``
            exceeds ``len(labeled_points)`` every point takes part in the vote.
        labeled_points: non-empty sequence of ``(features, label)`` pairs.
        new_point: list of numerical attributes for the unlabelled point.

    Returns:
        The predicted label, as chosen by ``majority_vote``.

    Raises:
        ValueError: if ``k < 1``, if ``labeled_points`` is empty, or if a
            feature vector's length differs from ``new_point``'s.
    """
    if k < 1:
        raise ValueError("k must be at least 1 (got %r)" % (k,))
    if not labeled_points:
        raise ValueError("knn_classify requires at least one labeled point")
    # sorted() is stable, so equidistant points keep their input order.
    by_distance = sorted(labeled_points, key=lambda pair: distance(pair[0], new_point))
    nearest_labels = [label for _, label in by_distance[:k]]
    return majority_vote(nearest_labels)
In [14]:
def run_test(test_data_entry, k=1):
    """Classify one test entry and compare the prediction to its given label.

    Args:
        test_data_entry: a ``(features, label)`` pair.
        k: number of neighbours passed through to ``knn_classify``.

    Returns:
        A tuple ``(hit, predicted_label, given_label)`` where ``hit`` is 1
        when the prediction equals the given label and 0 otherwise.
    """
    features = test_data_entry[0]
    predicted_label = knn_classify(k, processed_training_data, features)
    given_label = test_data_entry[1]
    hit = 0
    if given_label == predicted_label:
        hit = 1
    return (hit, predicted_label, given_label)
# Score the classifier on every test entry and report the overall accuracy.
# print(...) is called with a single pre-formatted string so the output is
# identical under Python 2 and Python 3.
if USE_TOY_DATA:
    # The toy set is small, so keep the full (hit, predicted, given) tuples
    # around for inspection.
    full_test_results = [run_test(datum) for datum in processed_test_data]
    test_results = [result[0] for result in full_test_results]
else:
    test_results = []
    correct_so_far = 0
    for idx, datum in enumerate(processed_test_data):
        outcome = run_test(datum)[0]
        if idx != 0 and idx % 100 == 0:
            # Progress report over the first `idx` entries; the entry just
            # classified is not yet included.
            print("""Accuracy at iteration %d = %0.2f""" % (idx, float(correct_so_far) / idx))
        test_results.append(outcome)
        correct_so_far += outcome

if not test_results:
    # Fail with a clear message instead of a bare ZeroDivisionError.
    raise ValueError("processed_test_data is empty; accuracy is undefined")

accuracy = float(sum(test_results)) / len(test_results)
print("""Accuracy = %0.2f""" % (accuracy))
In [15]:
# Show the label distribution of the test and training sets next to the
# accuracy, so the score can be read against the class balance.
# Single-argument print(...) produces the same output on Python 2 and 3.
print(Counter(datum[1] for datum in processed_test_data))
print(Counter(datum[1] for datum in processed_training_data))
print(accuracy)