In [2]:
from pml.api import *
In [7]:
# Load the extended dataset and narrow it in one step to the label values
# "s" and "f". label_filter comes from pml; presumably it keeps only the
# samples carrying one of the listed labels -- confirm against pml.
data = load("../dataset_ext2.csv").label_filter(["s", "f"])
In [8]:
# Build the two-feature working set (MATH101, PHYS125) and discard samples
# that pml reports as empty.
# NOTE(review): the name `slice` shadows the Python builtin; it is kept
# because the plotting and clustering code refer to the set by this name.
slice = data.slice_features(["MATH101", "PHYS125"]).drop_empty_samples()

# The return value is discarded, so this relies on the fill being applied in
# place -- presumably it is; confirm against the pml DataSet implementation.
slice.fill_missing_with_feature_means()
In [9]:
# Summarise the label-filtered dataset: sample count, then per-label counts.
# Single-argument print(...) is used because it produces the same output
# under Python 2 and is also valid Python 3 syntax.
# NOTE(review): this reports on `data`, not on the two-feature `slice` that
# had its empty samples dropped -- confirm that is the intended population.
print("Number of samples: %d" % data.num_samples())
print("Label value counts:")
print(data.get_label_value_counts())
In [12]:
import matplotlib.pyplot as plt

# Marker shape and colour used for each label value.
markers = {"s": "o", "p": "^", "f": "x"}
colours = {"s": "g", "p": "y", "f": "r"}

fig = plt.figure()
ax = fig.add_subplot(111)

# Draw one scatter series per label present in the slice so each label gets
# its own colour and marker: MATH101 on the x axis, PHYS125 on the y axis.
for label in slice.get_label_set():
    filtered_slice = slice.label_filter(label)
    xs = filtered_slice.get_column("MATH101")
    ys = filtered_slice.get_column("PHYS125")
    ax.scatter(xs, ys, color=colours[label], marker=markers[label])

# Axis labels are set once, after every series has been drawn.
ax.set_xlabel("MATH101")
ax.set_ylabel("PHYS125")
plt.show()
In [13]:
def print_results(cluster_results):
    """Print the Rand index and purity of a clustering result.

    Args:
        cluster_results: object exposing ``calculate_rand_index()`` and
            ``calculate_purity()``; each value is formatted with ``%f``.
    """
    # Single-argument print(...) gives the same output on Python 2 and is
    # valid Python 3 syntax as well.
    print("Rand index: %f" % cluster_results.calculate_rand_index())
    print("Purity: %f" % cluster_results.calculate_purity())
In [23]:
# Cluster the two-feature slice into k=2 groups with Euclidean distance.
# No starting centroids are passed, so kmeans chooses them itself (randomly,
# per the original note on this cell); presumably the scores printed below
# can therefore differ from run to run.
cluster_rand = kmeans(
    slice,
    k=2,
    distance=euclidean,
)
print_results(cluster_rand)
In [ ]: