In [1]:
from pml.api import *
import matplotlib.pyplot as plt
In [2]:
# Load the dataset and drop its empty samples in one step.
data = load("../dataset_ext2.csv").drop_empty_samples()
# The return value is not captured, so this presumably fills the remaining
# missing values in place with each feature's mean -- confirm against pml.
data.fill_missing_with_feature_means()
In [3]:
# Bare expression: the notebook displays the label value counts of the
# cleaned dataset as this cell's output.
data.get_label_value_counts()
Out[3]:
In [4]:
# Get first principal component: run PCA on the cleaned dataset, passing 1
# (presumably the number of components to keep -- confirm against pml).
princomp = pca(data, 1)
# Bare trailing expression so the notebook displays the first component's
# impacts as this cell's output.
princomp.get_first_component_impacts()
Out[4]:
In [5]:
# Bare expression: the notebook displays the feature list of the PCA result
# as this cell's output.
princomp.feature_list()
Out[5]:
In [14]:
# Plot configuration shared by the scatter-plot cells.
# Both lookups are keyed by label value.
colours = dict(s="g", p="y", f="r")
markers = dict(s="o", p="^", f="x")
# Figure size as (width, height), in inches.
figsize = (15, 10)
In [9]:
# Plot 1st PC Weights For Each Sample
# One scatter series per label (own colour and marker), sample id on the
# x axis and column 0 of the PCA result on the y axis; saved as a PNG.
fig = plt.figure(figsize=figsize)
for label in data.get_label_set():
    sample_ids = data.label_filter(label).get_sample_ids()
    pc_weights = princomp.get_rows(sample_ids).get_column(0)
    plt.scatter(sample_ids, pc_weights,
                marker=markers[label], color=colours[label])
plt.title("1st PC Weights For Each Sample")
plt.xlabel("Sample Id")
plt.ylabel("First Principal Component")
fig.savefig("scatter_pca_weights.png")
In [11]:
# For each course, plot grades for each sample (student).
# Every course gets its own figure: one scatter series per label (own colour
# and marker), sample id on the x axis and that course's column on the y axis.
for course in data.feature_list():
    fig = plt.figure(figsize=figsize)
    for label in data.get_label_set():
        filtered = data.label_filter(label)
        xs = filtered.get_sample_ids()
        ys = filtered.get_rows(xs).get_column(course)
        plt.scatter(xs, ys, color=colours[label],
                    marker=markers[label])
    plt.xlabel("Sample Id")
    plt.ylabel("%s Grade" % course)
    plt.title("%s Grade For Each Sample" % course)
    # The course name is used verbatim in the output file name.
    fig.savefig("scatter_%s.png" % course)
In [ ]: