notebook.community

Edit and run



In [6]:

    
from pml.api import *
import pandas as pd



In [7]:

    
# Read the CSV and discard empty samples in a single expression, so `data`
# is bound exactly once in this cell.
data = load("../dataset_ext2.csv").drop_empty_samples()



In [8]:

    
# Restrict the data set to the labels "f" and "s"; the recorded counts in the
# next cell show only those two label values remaining.
# NOTE(review): `data` is rebound to its own filtered version, so this cell
# depends on the load cell having run first.
data = data.label_filter(["f", "s"])



In [9]:

    
# Report the sample count and the per-label counts of the filtered data set.
# Each print uses the parenthesised single-argument form, which is valid both
# as a Python 2 print statement and as a Python 3 function call.
print("Number of samples: %d" % data.num_samples())
print("Label value counts:")
print(data.get_label_value_counts())









    



Number of samples: 56
Label value counts:
f    30
s    26



In [10]:

    
# Called for its effect only: the return value is not assigned and the
# recorded run shows no Out[] for this cell.
# NOTE(review): presumably replaces missing feature values with that
# feature's mean, in place on `data` -- confirm against the pml API.
data.fill_missing_with_feature_means()



In [11]:

    
# Bare expression so the notebook displays the result. The recorded output
# has one value per principal component (13 of them); presumably each is that
# component's share of the total variance -- confirm against the pml API.
get_pct_variance_per_principal_component(data)









    Out[11]:





0     0.414377
1     0.125420
2     0.111666
3     0.085183
4     0.058361
5     0.051809
6     0.042735
7     0.038501
8     0.030552
9     0.023278
10    0.015827
11    0.002119
12    0.000172



In [12]:

    
# Run PCA on the cleaned data set.
# NOTE(review): the second argument (2) is presumably the number of principal
# components to keep -- confirm against pml.api.pca.
pca_data = pca(data, 2)



In [13]:

    
# Displayed as the cell result. The recorded output lists one value per course
# feature in descending order, with MATH110, MECH141 and CSC115 at the top.
# NOTE(review): presumably these are the features' weights in the first
# principal component -- confirm against the pml API.
pca_data.get_first_component_impacts()









    Out[13]:





MATH110    0.495145
MECH141    0.330735
CSC115     0.309409
PHYS122    0.292875
CSC111     0.292110
PHYS125    0.287645
ELEC199    0.283053
MATH101    0.270553
MATH100    0.270476
CHEM150    0.248746
ENGL135    0.086606
ENGR120    0.003390
ENGR110    0.002649



In [16]:

    
# Keep only the three features that rank highest in the recorded
# first-component impacts output (MATH110, MECH141, CSC115).
# NOTE(review): the name `slice` shadows Python's builtin `slice` for the rest
# of the kernel session; later cells refer to this variable by that name.
slice = data.slice_features(["MATH110", "MECH141", "CSC115"])



In [17]:

    
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of the three sliced course features, one colour/marker per label.
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")

# Label value -> matplotlib marker / colour code.
markers = {"s": "o", "p": "^", "f": "x"}
colours = {"s": "g", "p": "y", "f": "r"}

# Feature drawn on the x, y and z axis respectively.
x_name, y_name, z_name = "MATH110", "MECH141", "CSC115"

for label in slice.get_label_set():
    # `slice` is the feature-sliced data set assigned earlier, not the builtin.
    label_samples = slice.label_filter(label)
    ax.scatter(label_samples.get_column(x_name),
               label_samples.get_column(y_name),
               label_samples.get_column(z_name),
               color=colours[label],
               marker=markers[label])

ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
ax.set_zlabel(z_name)

plt.show()



In [20]:

    
# Hand-picked starting centroids for the case where all 3 labels are used,
# chosen by eye from the 3D plot.
# NOTE(review): in file order this assignment is overwritten by the later
# `centroids = ...` cells. It is keyed on CSC111, whereas `slice` selects
# MECH141 instead, and it holds three centroids while the kmeans calls in
# this notebook use k=2.
centroids = [
    pd.Series({"MATH110": 6.5, "CSC111": 6, "CSC115": 6}),    # s
    pd.Series({"MATH110": 3, "CSC111": 2, "CSC115": 4}),      # p
    pd.Series({"MATH110": 1.5, "CSC111": 3.5, "CSC115": 2}),  # f
]



In [43]:

    
# Hand-picked starting centroids for the case where p and f are combined,
# chosen by eye from the 3D plot.
# NOTE(review): in file order this assignment is overwritten by the next
# `centroids = ...` cell, and it is keyed on CSC111 whereas `slice` selects
# MECH141 instead.
centroids = [
    pd.Series({"MATH110": 7, "CSC111": 6, "CSC115": 7}),  # s
    pd.Series({"MATH110": 2, "CSC111": 3, "CSC115": 3}),  # f
]



In [18]:

    
# Hand-picked starting centroids for the case where label p is filtered out,
# chosen by eye from the 3D plot. The keys match the three features held in
# `slice` (MATH110, MECH141, CSC115).
centroids = [
    pd.Series({"MATH110": 6, "MECH141": 4.5, "CSC115": 7}),  # s
    pd.Series({"MATH110": 2, "MECH141": 1, "CSC115": 3}),    # f
]



In [19]:

    
def print_results(cluster_results):
    """Print the Rand index and purity of a clustering result.

    Args:
        cluster_results: object exposing ``calculate_rand_index()`` and
            ``calculate_purity()``; each returned value is formatted
            with ``%f``.
    """
    # Parenthesised single-argument print is valid both as a Python 2 print
    # statement and as a Python 3 function call.
    print("Rand index: %f" % cluster_results.calculate_rand_index())
    print("Purity:     %f" % cluster_results.calculate_purity())



In [26]:

    
# Use random centroids
# Cluster the three-feature `slice` into k=2 groups without passing starting
# centroids, then print the Rand index and purity.
# NOTE(review): no random seed is set anywhere in this notebook, so if kmeans
# picks its starting centroids randomly the scores may differ between runs --
# confirm against the pml API.
cluster_rand = kmeans(slice, k=2)
print_results(cluster_rand)









    



Rand index: 0.637013
Purity:     0.767857



In [22]:

    
# Same k=2 clustering of `slice`, but started from the hand-picked `centroids`.
# NOTE(review): `centroids` is assigned in three separate cells above, so the
# value used here is whichever of them ran last in the kernel. Only the
# MECH141-keyed version matches the features held in `slice`.
cluster_cent = kmeans(slice, k=2, centroids=centroids)
print_results(cluster_cent)









    



Rand index: 0.637013
Purity:     0.767857



In [ ]: