In [6]:
from pml.api import *
import pandas as pd

In [7]:
# Load the extended data set and discard empty samples in one step.
data = load("../dataset_ext2.csv").drop_empty_samples()

In [8]:
# Keep only samples labelled "f" or "s"; the label counts printed by the next
# cell show no other labels remaining.
data = data.label_filter(["f", "s"])

In [9]:
# Summarise the data set: total sample count, then the count per label.
# Single-argument print(...) produces the same output under the Python 2
# print statement and the Python 3 print function.
print("Number of samples: %d" % data.num_samples())
print("Label value counts:")
print(data.get_label_value_counts())


Number of samples: 56
Label value counts:
f    30
s    26

In [10]:
# Replace missing values with the mean of their feature (per the method name).
# NOTE(review): unlike drop_empty_samples() and label_filter(), whose results
# are assigned back to `data`, this return value is discarded - confirm the
# method fills in place, otherwise later cells still see the missing values.
data.fill_missing_with_feature_means()

In [11]:
# Share of variance per principal component; the recorded output sums to 1.0
# and shows the first component alone carrying about 41%.
get_pct_variance_per_principal_component(data)


Out[11]:
0     0.414377
1     0.125420
2     0.111666
3     0.085183
4     0.058361
5     0.051809
6     0.042735
7     0.038501
8     0.030552
9     0.023278
10    0.015827
11    0.002119
12    0.000172

In [12]:
# Run PCA on the data set; the second argument is presumably the number of
# principal components to keep - TODO confirm against pml's pca() signature.
pca_data = pca(data, 2)

In [13]:
# Per-feature impact on the first principal component; in the recorded output
# MATH110, MECH141 and CSC115 rank highest.
pca_data.get_first_component_impacts()


Out[13]:
MATH110    0.495145
MECH141    0.330735
CSC115     0.309409
PHYS122    0.292875
CSC111     0.292110
PHYS125    0.287645
ELEC199    0.283053
MATH101    0.270553
MATH100    0.270476
CHEM150    0.248746
ENGL135    0.086606
ENGR120    0.003390
ENGR110    0.002649

In [16]:
# Keep only the three features with the largest first-component impacts.
# NOTE(review): the name `slice` shadows Python's builtin slice type for the
# rest of the notebook; later cells read this variable, so renaming it means
# updating all of them together.
slice = data.slice_features(["MATH110", "MECH141", "CSC115"])

In [17]:
import matplotlib.pyplot as plt
# Axes3D is not referenced directly, but on older matplotlib releases this
# import is what registers the "3d" projection used below - keep it.
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of the sliced features, one colour/marker per label.
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")

# Plot style per label value.
markers = {"s": "o", "p": "^", "f": "x"}
colours = {"s": "g", "p": "y", "f": "r"}

# Name each axis feature once so the plotted columns and the axis labels
# cannot drift apart.
x_feature, y_feature, z_feature = "MATH110", "MECH141", "CSC115"

for label in slice.get_label_set():
    # Pass a list, the same form label_filter is called with elsewhere in
    # this notebook.
    filtered_slice = slice.label_filter([label])

    xs = filtered_slice.get_column(x_feature)
    ys = filtered_slice.get_column(y_feature)
    zs = filtered_slice.get_column(z_feature)

    ax.scatter(xs, ys, zs, color=colours[label],
               marker=markers[label])

ax.set_title("Samples by label")
ax.set_xlabel(x_feature)
ax.set_ylabel(y_feature)
ax.set_zlabel(z_feature)

plt.show()

In [20]:
# Using all 3 labels.
# Based on the 3D plot, choose some decent initial centroids.
# NOTE(review): these centroids are keyed on CSC111, which is not one of the
# features kept in `slice` (MATH110, MECH141, CSC115), and `centroids` is
# reassigned by later cells before kmeans is called - this looks like a
# leftover experiment; confirm whether it can be removed.
centroids = [pd.Series({"MATH110": 6.5, "CSC111": 6,   "CSC115": 6}), # s
             pd.Series({"MATH110": 3,   "CSC111": 2,   "CSC115": 4}), # p
             pd.Series({"MATH110": 1.5, "CSC111": 3.5, "CSC115": 2})  # f
            ]

In [43]:
# Using p and f combined.
# Based on the 3D plot, choose some decent initial centroids.
# NOTE(review): keyed on CSC111, which is not one of the features kept in
# `slice` (MATH110, MECH141, CSC115), and `centroids` is reassigned by a later
# cell before kmeans is called - this looks like a leftover experiment;
# confirm whether it can be removed.
centroids = [pd.Series({"MATH110": 7, "CSC111": 6, "CSC115": 7}), # s
             pd.Series({"MATH110": 2, "CSC111": 3, "CSC115": 3})  # f
            ]

In [18]:
# Initial centroids for the case with "p" filtered out, eyeballed from the
# 3D plot. One pd.Series per cluster, keyed by feature name.
centroids = list(map(pd.Series, [
    {"MATH110": 6, "MECH141": 4.5, "CSC115": 7},  # s
    {"MATH110": 2, "MECH141": 1,   "CSC115": 3},  # f
]))

In [19]:
def print_results(cluster_results):
    """Print the Rand index and purity of a clustering result.

    Args:
        cluster_results: object exposing calculate_rand_index() and
            calculate_purity(), each returning a number (formatted with %f),
            such as the value returned by kmeans() in this notebook.

    Uses single-argument print(...) so the output is identical under the
    Python 2 print statement and the Python 3 print function.
    """
    print("Rand index: %f" % cluster_results.calculate_rand_index())
    print("Purity:     %f" % cluster_results.calculate_purity())

In [26]:
# Use random centroids
# NOTE(review): no `centroids` argument is passed and no random seed is set
# anywhere in this notebook, so this result may change between runs - confirm
# how kmeans seeds its random initialisation.
cluster_rand = kmeans(slice, k=2)
print_results(cluster_rand)


Rand index: 0.637013
Purity:     0.767857

In [22]:
# Same clustering, but initialised with the hand-picked `centroids`.
# NOTE(review): `centroids` is assigned in several cells, so this uses
# whichever of them ran last. The recorded Rand index and purity match the
# random-centroid run exactly.
cluster_cent = kmeans(slice, k=2, centroids=centroids)
print_results(cluster_cent)


Rand index: 0.637013
Purity:     0.767857

In [ ]: