In [2]:
from pml.api import *

In [7]:
# Load the extended data set and restrict it to the "s" and "f" labels.
data = load("../dataset_ext2.csv").label_filter(["s", "f"])

In [8]:
# Narrow the data set to the two course features, then drop empty samples.
# NOTE: `slice` shadows the Python builtin of the same name; the name is kept
# because the plotting and clustering cells below refer to it.
slice = data.slice_features(["MATH101", "PHYS125"]).drop_empty_samples()

# NOTE(review): the return value is discarded, so this presumably fills the
# remaining missing values in place -- confirm against the pml API.
slice.fill_missing_with_feature_means()

In [9]:
# Summarise the label-filtered data set. These figures describe `data`,
# not the two-feature `slice` derived from it.
# Parenthesised single-argument print behaves identically on Python 2 and
# also parses on Python 3 (the bare print statement does not).
print("Number of samples: %d" % data.num_samples())
print("Label value counts:")
print(data.get_label_value_counts())


Number of samples: 59
Label value counts:
f    30
s    29

In [12]:
import matplotlib.pyplot as plt

# Marker shape and colour used for each label value in the scatter plot.
markers = {"s": "o", "p": "^", "f": "x"}
colours = {"s": "g", "p": "y", "f": "r"}

fig, ax = plt.subplots()

# Draw one scatter series per label so every label gets its own style and
# its own legend entry.
for label in slice.get_label_set():
    filtered_slice = slice.label_filter(label)

    xs = filtered_slice.get_column("MATH101")
    ys = filtered_slice.get_column("PHYS125")

    ax.scatter(xs, ys, color=colours[label],
               marker=markers[label], label=label)

ax.set_xlabel("MATH101")
ax.set_ylabel("PHYS125")
# Without a legend the colour/marker coding cannot be read from the figure.
ax.legend(title="Label")

plt.show()

In [13]:
def print_results(cluster_results):
    """Print the Rand index and purity of a clustering result.

    Args:
        cluster_results: object exposing ``calculate_rand_index()`` and
            ``calculate_purity()``; each return value is formatted with ``%f``.

    Returns:
        None. The two metrics are written to standard output, one per line.
    """
    # Parenthesised single-argument print works on both Python 2 and Python 3.
    print("Rand index: %f" % cluster_results.calculate_rand_index())
    print("Purity:     %f" % cluster_results.calculate_purity())

In [23]:
# Cluster the two-feature slice into k=2 groups using Euclidean distance.
# No initial centroids are supplied, so kmeans is expected to choose them at random.
# NOTE(review): no random seed is set anywhere visible, so the scores printed
# below may differ between runs -- confirm how pml seeds its centroid choice.
cluster_rand = kmeans(slice, distance=euclidean, k=2)
print_results(cluster_rand)


Rand index: 0.683258
Purity:     0.807692

In [ ]: