In [2]:
from pml.api import *
In [6]:
# Build the working data set in one pipeline: read the CSV, restrict it by the
# labels "s" and "f", then drop empty samples. Each step's return value feeds
# the next, and the final result is bound to the notebook-global `data`.
# NOTE(review): presumably label_filter *keeps* only the listed labels and
# drop_empty_samples removes samples with no feature values -- confirm
# against pml.
data = (
    load("../dataset_ext2.csv")
    .label_filter(["s", "f"])
    .drop_empty_samples()
)

# The return value is not used here, so this call is presumably relied on to
# modify `data` in place -- TODO confirm; if it returns a new data set instead,
# the result is being discarded.
data.fill_missing_with_feature_means()

# Disabled experiment, kept for reference:
#data.combine_labels(["p", "f"], "f")

# Last expression of the cell: shown as the cell's output.
data.get_label_value_counts()
Out[6]:
In [7]:
def print_results(cluster_results):
    """Print the Rand index and purity of a clustering result.

    Args:
        cluster_results: any object exposing ``calculate_rand_index()`` and
            ``calculate_purity()``; each return value is formatted with
            ``%f``, so it must be a real number.

    Returns:
        None. The two metrics are written to standard output, one per line.
    """
    # A single parenthesised argument makes these lines valid as both a
    # Python 2 print statement and a Python 3 print() call, with the same text.
    print("Rand index: %f" % cluster_results.calculate_rand_index())
    print("Purity: %f" % cluster_results.calculate_purity())
In [15]:
# Cluster `data` into k=2 groups using the `euclidean` distance function, then
# print the Rand index and purity of the result via print_results.
# k=2 lines up with the two labels ("s", "f") passed to label_filter earlier.
# NOTE(review): if kmeans chooses its initial centroids randomly, these numbers
# will vary between runs -- confirm whether pml lets a seed be fixed.
cluster_euc = kmeans(data, k=2, distance=euclidean)
print_results(cluster_euc)
In [9]:
# Same clustering as the Euclidean run (same `data`, k=2) but with
# `cosine_distance` as the distance function, so the two sets of printed
# metrics can be compared directly.
# NOTE(review): if kmeans chooses its initial centroids randomly, these numbers
# will vary between runs -- confirm whether pml lets a seed be fixed.
cluster_cos = kmeans(data, k=2, distance=cosine_distance)
print_results(cluster_cos)
In [ ]: