In [6]:
from pml.api import *
import pandas as pd
In [7]:
# Read the extended data set from disk and immediately discard the samples
# that drop_empty_samples() considers empty, keeping only the cleaned result.
data = load("../dataset_ext2.csv").drop_empty_samples()
In [8]:
# Keep only the samples labelled "f" or "s"; a later cell's comment describes
# this configuration as "p filtered out".
# NOTE(review): this re-binds `data`, so re-running the cell filters the
# already-filtered data set rather than the freshly loaded one.
data = data.label_filter(["f", "s"])
In [9]:
# Quick sanity summary of the filtered data set: total sample count followed
# by the per-label counts.
# Each print takes exactly one argument and is written with parentheses so the
# cell produces the same output under both Python 2 and Python 3.
print("Number of samples: %d" % data.num_samples())
print("Label value counts:")
print(data.get_label_value_counts())
In [10]:
# Replace missing feature values with per-feature means (going by the method
# name).
# NOTE(review): the return value is discarded here, whereas the
# drop_empty_samples() and label_filter() calls above re-bind their results to
# `data` -- confirm that this method really modifies `data` in place.
data.fill_missing_with_feature_means()
In [11]:
# Bare expression: the notebook shows the returned value as this cell's output.
# Presumably the percentage of variance attributed to each principal component
# of `data` (going by the function name) -- TODO confirm against pml.api.
get_pct_variance_per_principal_component(data)
Out[11]:
In [12]:
# Run PCA on the cleaned data set and keep the result for inspection below.
# The second argument is presumably the number of principal components to
# retain -- TODO confirm against pml.api.pca.
pca_data = pca(data, 2)
In [13]:
# Bare expression: the notebook shows the returned value as this cell's output.
# Presumably reports how strongly each original feature contributes to the
# first principal component (going by the method name) -- TODO confirm.
pca_data.get_first_component_impacts()
Out[13]:
In [16]:
# Restrict the data set to the three course features that the following cells
# plot and cluster on.
# NOTE(review): the name `slice` shadows Python's built-in `slice` type for the
# rest of the notebook; renaming it would require updating every later cell
# that reads it.
slice = data.slice_features(["MATH110", "MECH141", "CSC115"])
In [17]:
import matplotlib.pyplot as plt
# Axes3D is never referenced by name: importing it registers the "3d"
# projection that add_subplot() is asked for below (needed on older matplotlib).
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter plot of the sliced features, one marker/colour series per label,
# used to eyeball how separable the labels are before clustering.
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")

# Marker shape and colour for each label value; "p" is kept so the cell still
# works when that label has not been filtered out of the data.
markers = {"s": "o", "p": "^", "f": "x"}
colours = {"s": "g", "p": "y", "f": "r"}

# Draw each label's samples as a separate series so it gets its own style.
for label in slice.get_label_set():
    filtered_slice = slice.label_filter(label)
    xs = filtered_slice.get_column("MATH110")
    ys = filtered_slice.get_column("MECH141")
    zs = filtered_slice.get_column("CSC115")
    ax.scatter(xs, ys, zs, color=colours[label],
               marker=markers[label])

# Axis labels only need to be set once, after all series are drawn.
ax.set_xlabel("MATH110")
ax.set_ylabel("MECH141")
ax.set_zlabel("CSC115")
plt.show()
In [20]:
# Variant for clustering with all 3 labels present.
# Initial centroids picked by eye from the 3D plot, one per label, listed in
# the order s, p, f.
# NOTE(review): these use the key "CSC111", while the visible slice_features()
# call selects "MECH141" instead, and this assignment is overwritten by the
# `centroids` cells further down -- confirm whether this variant is still
# meant to be runnable.
centroids = [
    pd.Series(coords)
    for coords in (
        {"MATH110": 6.5, "CSC111": 6, "CSC115": 6},    # s
        {"MATH110": 3, "CSC111": 2, "CSC115": 4},      # p
        {"MATH110": 1.5, "CSC111": 3.5, "CSC115": 2},  # f
    )
]
In [43]:
# Variant for clustering with the p and f labels merged into one group.
# Initial centroids picked by eye from the 3D plot, in the order s, f.
# NOTE(review): these use the key "CSC111", while the visible slice_features()
# call selects "MECH141" instead, and this assignment is overwritten by the
# `centroids` cell that follows -- confirm whether this variant is still meant
# to be runnable.
centroids = list(map(pd.Series, [
    {"MATH110": 7, "CSC111": 6, "CSC115": 7},  # s
    {"MATH110": 2, "CSC111": 3, "CSC115": 3},  # f
]))
In [18]:
# Variant for the data set with the p label filtered out (only s and f remain).
# Initial centroids picked by eye from the 3D plot, in the order s, f; the keys
# are the three features selected by the slice_features() call.
centroids = [
    pd.Series(dict(MATH110=6, MECH141=4.5, CSC115=7)),  # s
    pd.Series(dict(MATH110=2, MECH141=1, CSC115=3)),    # f
]
In [19]:
def print_results(cluster_results):
    """Print the quality scores of a clustering run.

    Writes two lines to standard output: the Rand index and the purity, each
    formatted with ``%f`` (six decimal places).

    Parameters:
        cluster_results: any object exposing ``calculate_rand_index()`` and
            ``calculate_purity()``, each returning a number (here, the value
            returned by ``kmeans``).

    Returns:
        None.
    """
    # Single-argument print(...) gives identical output on Python 2 and 3.
    print("Rand index: %f" % cluster_results.calculate_rand_index())
    print("Purity: %f" % cluster_results.calculate_purity())
In [26]:
# Use random centroids
# No `centroids` argument is passed, so kmeans picks its own starting points
# (random, per the comment above) for k=2 clusters.
# NOTE(review): no random seed is set in the visible cells, so the printed
# scores may differ between runs -- confirm whether pml offers a way to fix it.
cluster_rand = kmeans(slice, k=2)
print_results(cluster_rand)
In [22]:
# K-means with k=2 again, this time started from the hand-picked `centroids`,
# so its scores can be compared with the random-start run.
# NOTE(review): `centroids` is assigned in three separate cells; when the
# notebook is run top to bottom the last assignment (the
# MATH110/MECH141/CSC115 pair) is the one used here, but running cells out of
# order would silently pick a different set.
cluster_cent = kmeans(slice, k=2, centroids=centroids)
print_results(cluster_cent)
In [ ]: