In [2]:
from pml.api import *

In [6]:
# Load ../dataset_ext2.csv, then narrow it with label_filter(["s", "f"]) and
# drop_empty_samples(); each step rebinds `data` to the value it returns.
data = load("../dataset_ext2.csv")
data = data.label_filter(["s", "f"])
data = data.drop_empty_samples()
# NOTE(review): unlike the calls above, the return value is discarded here, so
# this presumably fills the missing values in place -- confirm against pml.
data.fill_missing_with_feature_means()
# Disabled experiment: combine the "p" and "f" labels under "f".
# NOTE(review): positioned after label_filter(["s", "f"]); presumably it would
# have to run before the filter to have any effect -- confirm before enabling.
#data.combine_labels(["p", "f"], "f")
# Last expression of the cell, so the notebook displays the label counts.
data.get_label_value_counts()


Out[6]:
f    30
s    26

In [7]:
def print_results(cluster_results):
    """Print the rand index and purity of a clustering result.

    cluster_results must provide calculate_rand_index() and
    calculate_purity(); each returned value is formatted with %f and
    written to standard output on its own line. Returns None.
    """
    # A parenthesised single-argument print emits the same text under the
    # Python 2 print statement and the Python 3 print function, so this cell
    # no longer raises SyntaxError on a Python 3 kernel.
    print("Rand index: %f" % cluster_results.calculate_rand_index())
    print("Purity:     %f" % cluster_results.calculate_purity())

In [15]:
# Run kmeans on `data` with k=2 and the euclidean distance, then report the
# rand index and purity of the result through print_results.
# NOTE(review): presumably kmeans starts from randomly chosen centroids, so the
# printed scores may differ between runs -- confirm, and seed if pml allows it.
cluster_euc = kmeans(data, k=2, distance=euclidean)
print_results(cluster_euc)


Rand index: 0.701299
Purity:     0.821429

In [9]:
# Run kmeans on `data` with k=2 and cosine_distance as the distance function,
# then report the rand index and purity of the result through print_results.
# NOTE(review): presumably kmeans starts from randomly chosen centroids, so the
# printed scores may differ between runs -- confirm, and seed if pml allows it.
cluster_cos = kmeans(data, k=2, distance=cosine_distance)
print_results(cluster_cos)


Rand index: 0.569481
Purity:     0.696429

In [ ]: