In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import time
import gzip
import shutil
import seaborn as sns
from collections import Counter
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, AgglomerativeClustering
from sklearn.metrics import silhouette_score #, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import kneighbors_graph
Do some preprocessing: group the data by 'Anon Student Id' and extract per-student features for further analysis
In [2]:
def hdf_fixed_write_compress(df):
df.to_hdf('data1-step1.hdf','test',mode='w',complib='blosc')
return
def hdf_fixed_read_compress():
df = pd.read_hdf('data.hdf','test')
return df
In [3]:
with gzip.open('data1.hdf.gz', 'rb') as f_in, open('data.hdf', 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
!ls -lh data.hdf
data = hdf_fixed_read_compress()
data.head()
Out[3]:
Note to reviewers: this algorithm is quite slow (~45 minutes), so you may prefer to process only a subset of the data (e.g. the first 500,000 rows take only ~1 minute).
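The per-student loop below re-filters the full frame for every student, which is the main reason it is slow. As a rough alternative, a vectorized pandas groupby could compute several of the simple count features in one pass. A minimal sketch, assuming the same column names used in the function below ('Day', 'Session Id', 'Problem Name', 'Student Response Type'); the helper columns is_att / is_hint are introduced only for this example:

resp = data['Student Response Type']
counts = (data.assign(is_att=(resp == 0), is_hint=(resp == 1))
              .groupby('Anon Student Id')
              .agg(num_days=('Day', 'nunique'),          # distinct days in the system
                   num_sess=('Session Id', 'nunique'),   # distinct sessions opened
                   num_probs=('Problem Name', 'nunique'),# distinct problems entered
                   num_atts=('is_att', 'sum'),           # number of attempts
                   num_hints=('is_hint', 'sum')))        # number of hint requests

This only covers the count features; the fraction and duration features would still need extra aggregations.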
In [4]:
def prepare_stud_data_new(df):
start_time = time.time()
stud_list = df['Anon Student Id'].unique()
cols=['num_sess', \
'num_days', \
'num_probs', \
'num_atts', \
'num_hints', \
'frac_corr_atts', \
'frac_3s_atts', \
'frac_1s_hints', \
'time_atts', \
'time_hints', \
'max_probl_views', \
'max_atts']
numbers = []
#stud_data = pd.DataFrame(columns=cols)
stud_info_df = pd.DataFrame()
i = 0
for stud_name in stud_list:
stud_info_df = df[df['Anon Student Id'] == stud_name].copy()
# total number of days on which the student used the system
num_days = len(set(stud_info_df['Day']))
# total number of sessions opened
num_sessions = len(set(stud_info_df['Session Id']))
# total number of problems entered
num_problems = len(set(stud_info_df['Problem Name']))
# total number of attempts made by the student
num_attempts = stud_info_df[stud_info_df['Student Response Type'] == 0].shape[0]
# total number of hints requested by the student
num_hints = stud_info_df[stud_info_df['Student Response Type'] == 1].shape[0]
# fraction of short attempts (with time <= 3 sec)
if (num_attempts > 0):
frac_3s_atts = stud_info_df[(stud_info_df['Student Response Type'] == 0) & (stud_info_df['Duration (sec)'] <= 3.0)].shape[0] / num_attempts
else:
frac_3s_atts = 0
# fraction of short hints (with time <= 1 sec)
if (num_hints > 0):
frac_1s_hints = stud_info_df[(stud_info_df['Student Response Type'] == 1) & (stud_info_df['Duration (sec)'] <= 1.0)].shape[0] / num_hints
else:
frac_1s_hints = 0
# fraction of correct attempts
if (num_attempts > 0):
fraction_correct_attempts = stud_info_df[(stud_info_df['Student Response Type'] == 0) & (stud_info_df['Outcome'] == 0)].shape[0] / num_attempts
else:
fraction_correct_attempts = 0
# total time spent on attempts (in seconds)
total_time_attempts = stud_info_df[stud_info_df['Student Response Type'] == 0]['Duration (sec)'].sum()
# total time spent on hints (in seconds)
total_time_hints = stud_info_df[stud_info_df['Student Response Type'] == 1]['Duration (sec)'].sum()
# average of the per-problem maximum 'Problem View'
avg_max_problem_views = stud_info_df[['Problem Name', 'Problem View']].groupby(['Problem Name']).agg(np.max).mean()[0]
# average of the per-problem maximum number of attempts ('x')
avg_max_attempts = stud_info_df[['Problem Name', 'x']].groupby(['Problem Name']).agg(np.max).mean()[0]
stud_name = i # assign unique numerical ID to each student
if num_attempts != 0:
avg_time_att = total_time_attempts / num_attempts
else:
avg_time_att = 0
if num_hints != 0:
avg_time_hint = total_time_hints / num_hints
else:
avg_time_hint = 0
numbers.append([num_sessions, \
num_days, \
num_problems, \
num_attempts, \
num_hints, \
fraction_correct_attempts, \
frac_3s_atts, \
frac_1s_hints, \
total_time_attempts, \
total_time_hints, \
avg_max_problem_views, \
avg_max_attempts])
print("\r\t>>> Progress\t:{:.4%}".format((i + 1)/len(stud_list)), end='')
i += 1
stud_data = pd.DataFrame(data=numbers, columns=cols)
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
return stud_data
Instead of recomputing from scratch, read the precomputed stud_data:
In [5]:
#stud_data = prepare_stud_data_new(data.head(500000).copy())
#stud_data = prepare_stud_data_new(data.copy())
stud_data = pd.read_hdf('stud_data.hdf','test')
Making a backup of stud_data in HDF5 format:
In [6]:
#stud_data.to_hdf('stud_data.hdf','test',mode='w',complib='blosc')
In [7]:
stud_data.shape
Out[7]:
In [8]:
stud_data.describe()
Out[8]:
Our baseline cluster model should show 2 clusters: one with "gaming" and one with "non-gaming" behaviour.
In [9]:
def calculate_gaming_score():
i = 0
start_time = time.time()
best_score = -1 # minimal possible silhouette_score
best_atts_threshold = 0.001
best_hints_threshold = 0.001
n_bins = 50
for atts_threshold in np.linspace(0.001, 0.999, n_bins):
for hints_threshold in np.linspace(0.001, 0.999, n_bins):
#print(atts_threshold, hints_threshold)
gaming_preds = np.array((stud_data['frac_3s_atts'] > atts_threshold) | (stud_data['frac_1s_hints'] > hints_threshold), dtype=int)
gaming_score = silhouette_score(log_scaled_data, gaming_preds)
if best_score < gaming_score:
best_score = gaming_score
best_atts_threshold = atts_threshold
best_hints_threshold = hints_threshold
print("\r\t>>> Progress\t:{:.4%}".format((i + 1)/(n_bins**2)), end='')
i += 1
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
return best_score, best_atts_threshold, best_hints_threshold
#calculate_gaming_score()
print("The calculate_gaming_score() runs ~45 minutes, so I give the final score:\n \
(0.4473167950112576, 0.89716326530612245, 0.73422448979591837)")
#gaming_preds = np.array((stud_data['frac_3s_atts'] > 0.2) | (stud_data['frac_1s_hints'] > 0.2), dtype=int)
#gaming_score = silhouette_score(log_scaled_data, gaming_preds)
#print(gaming_score)
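For reference, the benchmark labels and their silhouette can be reproduced from the cached thresholds without rerunning the 45-minute grid search. A minimal sketch; it assumes log_scaled_data is the scaled feature DataFrame that calculate_gaming_score() also relies on (produced by the transform_data() function defined further below):

atts_thr, hints_thr = 0.89716326530612245, 0.73422448979591837
gaming_preds = np.array((stud_data['frac_3s_atts'] > atts_thr) |
                        (stud_data['frac_1s_hints'] > hints_thr), dtype=int)
print(silhouette_score(log_scaled_data, gaming_preds))  # expected ~0.447, as quoted above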
However, the benchmark model chooses only 14 students (out of 8,980) for a "gaming" cluster:
In [10]:
stud_data[(stud_data['frac_3s_atts'] > 0.89716326530612245) | (stud_data['frac_1s_hints'] > 0.73422448979591837)].shape[0]
Out[10]:
Now, write a new clustering algorithm:
In [11]:
# old name: process_data
def transform_data(selected_columns, data):
'''
Apply a log-transform to the selected columns that are not fractions (frac_*), then MinMaxScaler() to all selected columns
Parameters
==========
selected_columns : list
list of columns to leave in processed data
data : pandas.DataFrame
data to process (note that data should contain all selected_columns)
Returns
=======
log_scaled_data : pandas.DataFrame
log-transformed and scaled data selected by selected_columns
'''
data.reset_index(drop=True, inplace=True)
log_data = data[selected_columns].copy()
skewed = log_data.columns.tolist()
skewed = [item for item in skewed if not item.startswith('frac_')]
log_data[skewed] = log_data[skewed].apply(lambda x: np.log10(x + 1))
scaler = MinMaxScaler().fit(log_data)
log_scaled_data = scaler.transform(log_data)
log_scaled_data = pd.DataFrame(log_scaled_data, columns=log_data.columns)
return log_scaled_data
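A minimal usage sketch: transform two columns of stud_data and check that both end up scaled to [0, 1]:

example = transform_data(['frac_1s_hints', 'max_probl_views'], stud_data)
print(example.describe().loc[['min', 'max']])  # both columns should lie in [0, 1]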
In [12]:
def replace_group_numbers(best_preds):
'''
Replace group numbers in best_preds with sorting by group size
(so that the largest group is 0, the second largest is 1 etc.)
Parameters
==========
best_preds : numpy array
unsorted array of predictions
Returns
=======
best_preds_sorted : numpy array
sorted array of predictions
'''
pp = pd.DataFrame(best_preds, columns = ["old_group"])
dict_pp = {item[0]: i for i, item in enumerate(Counter(best_preds).most_common())}
pp['new_group'] = pp['old_group'].replace(dict_pp)
best_preds_sorted = np.array(pp['new_group'])
return best_preds_sorted
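A small worked example: the most frequent label becomes 0, the next most frequent becomes 1, and so on:

print(replace_group_numbers(np.array([2, 0, 0, 2, 2, 1])))
# label 2 occurs three times -> 0, label 0 twice -> 1, label 1 once -> 2
# expected output: [0 1 1 0 0 2]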
In [13]:
def kmeans(log_scaled_data):
'''
Apply the KMeans clustering algorithm with 2 <= n_clusters <= 5 to log_scaled_data
(transformed and scaled by transform_data() function)
Parameters
==========
log_scaled_data : pandas.DataFrame
data log-transformed and MinMaxScaler()-ed for KMeans clustering
Returns
=======
best_clusterer : sklearn Model
clustering algorithm with the largest Silhouette Coefficient
best_score : float
the largest value of the Silhouette Coefficient
best_preds_sorted : numpy.array
array with clustering predictions for log_scaled_data
(0 is the largest cluster, 1 is the second largest etc.)
'''
best_score = 0
for n_clusters in range(2,6):
clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
clusterer.fit(log_scaled_data)
preds = clusterer.predict(log_scaled_data)
# Calculate the mean silhouette coefficient for the number of clusters chosen
score = silhouette_score(log_scaled_data, preds)
if best_score < score:
best_clusterer = clusterer
# Predict the cluster for each data point
best_preds = best_clusterer.predict(log_scaled_data)
best_score = score
best_clusters = n_clusters
best_preds_sorted = replace_group_numbers(best_preds)
return best_clusterer, best_score, best_preds_sorted
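Because the column search below calls kmeans() for every pair of features, most of the runtime goes into silhouette_score over all ~9,000 students. One way to speed up the search (a sketch, trading exactness for speed via scikit-learn's sample_size argument; fast_silhouette is a hypothetical helper, not used below) is to score on a random subsample and only re-score the winning pair on the full data:

def fast_silhouette(X, labels, n=2000):
    # approximate silhouette coefficient computed on a random subsample of n points
    return silhouette_score(X, labels, sample_size=min(n, len(X)), random_state=0)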
Choose the pair of columns with the best score:
In [14]:
all_columns = ['num_sess', 'num_days', 'num_probs', 'num_atts', 'num_hints', 'frac_corr_atts', \
'frac_3s_atts', 'frac_1s_hints', 'time_atts', 'time_hints', 'max_probl_views', 'max_atts']
In [15]:
def choose_pair_columns_kmeans(all_columns, log_scaled_all_data):
'''
Selects pair of columns in data that produces clusters with the largest score.
In this function, only KMeans clustering algorithm is used
Parameters
==========
all_columns : list
list of columns to look for the pair with the largest score
log_scaled_all_data : pandas DataFrame
properly scaled DataFrame with all columns
Returns
=======
best_columns : list
pair of data columns with the largest score
best_score : float
the largest value of the score
best_clusterer : sklearn Model
clustering algorithm with the largest score
best_preds : numpy.array
array with clustering predictions for log_scaled_data
(0 is the largest cluster, 1 is the second largest etc.)
'''
best_score = 0
best_columns = []
j = 0
l = len(all_columns)
num_pairs = (l-1)*l/2
for column in all_columns:
selected_columns = [column]
columns_to_add = [a for a in all_columns if (a not in selected_columns)]
for column1 in columns_to_add:
if all_columns.index(column) < all_columns.index(column1):
selected_columns = [column, column1]
print("\r\t>>> Progress\t:{:.4%}".format((j+1)/num_pairs), end='')
j += 1
#log_scaled_data = transform_data(selected_columns, stud_data)
clusterer, score, preds = kmeans(log_scaled_all_data[selected_columns])
if score > best_score:
best_score = score
best_clusterer = clusterer
best_preds = preds
best_columns = selected_columns.copy()
return best_columns, best_score, best_clusterer, best_preds
In [16]:
start_time = time.time()
# consider skipping the commented-out search below because it takes some time (~8.5 minutes)
log_scaled_all_data = transform_data(all_columns, stud_data)
#best_columns, best_kmeans_score, best_kmeans_clusterer, best_kmeans_preds = choose_pair_columns_kmeans(all_columns, log_scaled_all_data)
# Instead, run KMeans once on the known best pair (~6 seconds)
best_columns = ['frac_1s_hints', 'max_probl_views']
best_kmeans_clusterer, best_kmeans_score, best_kmeans_preds = kmeans(log_scaled_all_data[best_columns])
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
print("\t>>> Best pair of cols:", best_columns)
print("\t>>> Best score:", best_kmeans_score)
print("\t>>> Best clusterer:", best_kmeans_clusterer)
print("\t>>> Best preds:", best_kmeans_preds)
In [17]:
def preds_to_indices(preds): # takes a 0/1 prediction array and returns the indices of the 1s
new_list = []
for i, val in enumerate(preds):
if val == 1:
new_list.append(i)
return np.array(new_list)
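For reference, the same mask can be produced in one line with NumPy. A sketch (preds_to_indices_np is a hypothetical name; np.flatnonzero returns the indices of the non-zero entries):

def preds_to_indices_np(preds):
    # one-line equivalent of preds_to_indices() for 0/1 predictions
    return np.flatnonzero(np.asarray(preds) == 1)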
Visualising the KMeans clusters:
In [18]:
log_scaled_all_data.describe()
Out[18]:
In [19]:
best_kmeans_preds_mask = preds_to_indices(best_kmeans_preds)
log_scaled_all_data_kmeans_0 = log_scaled_all_data.copy()[~log_scaled_all_data.index.isin(best_kmeans_preds_mask)]
log_scaled_all_data_kmeans_1 = log_scaled_all_data.copy()[log_scaled_all_data.index.isin(best_kmeans_preds_mask)]
plt.scatter(log_scaled_all_data_kmeans_0['frac_1s_hints'], \
log_scaled_all_data_kmeans_0['max_probl_views'], \
alpha=0.6, s=15, c='lightgreen')
plt.scatter(log_scaled_all_data_kmeans_1['frac_1s_hints'], \
log_scaled_all_data_kmeans_1['max_probl_views'], \
alpha=0.6, s=15, c='grey')
plt.xlim([0.0, 0.6])
plt.ylim([0.0, 0.4])
plt.figtext(x=0.64, y=0.56, s='Group 1', ha='center', size=14, color='black')
plt.figtext(x=0.20, y=0.69, s='Group 0', ha='center', size=14, color='darkgreen')
ax = plt.gca()
ax.set_xlabel('frac_1s_hints', size=14)
ax.set_ylabel('max_probl_views (log-transformed, scaled)', size=14)
plt.plot((0.14, 0.14), (0.001, 0.399), 'k--', c='blue')
plt.show()
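An equivalent, more compact way to colour the same scatter directly by cluster label (a sketch; it reuses log_scaled_all_data and best_kmeans_preds from the cells above and matplotlib's c=/cmap= colouring):

plt.scatter(log_scaled_all_data['frac_1s_hints'],
            log_scaled_all_data['max_probl_views'],
            c=best_kmeans_preds, cmap='Set2', alpha=0.6, s=15)
plt.xlabel('frac_1s_hints', size=14)
plt.ylabel('max_probl_views (log-transformed, scaled)', size=14)
plt.show()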
Then, consider adding one more column to further increase the score:
In [20]:
def cols_iterate_kmeans(selected_columns, best_score, best_clusterer, best_preds):
all_columns = ['num_sess', 'num_days', 'num_probs', 'num_atts', \
'num_hints', 'frac_corr_atts', 'frac_3s_atts', 'frac_1s_hints', \
'time_atts', 'time_hints', 'max_probl_views', 'max_atts']
columns_to_add = [a for a in all_columns if (a not in selected_columns)]
#print(columns_to_add)
for column in columns_to_add:
print("*"*40)
print("*** Trying to add column", column)
print("*"*40)
selected_columns.append(column)
log_scaled_data = transform_data(selected_columns, stud_data)
clusterer, score, preds = kmeans(log_scaled_data)
if score > best_score:
print("!!! Success !!!")
best_score = score
best_clusterer = clusterer
best_preds = preds
print("!!! New score is", best_score)
print("!!! New best clusterer is", best_clusterer)
print("!!! New best selected_columns are", selected_columns)
columns_to_add.remove(column)
else:
print("!!! Last score is equal or worse then our best one")
print("!!! According to Occam's razor, remove the column", column)
selected_columns.remove(column)
print("!!! Still the best selected columns are", selected_columns)
return selected_columns, best_score, best_clusterer, best_preds
In [21]:
# Just skip this step, it does not give new results:
# kmeans_clusterer = best_kmeans_clusterer
# kmeans_score = best_kmeans_score
# kmeans_preds = best_kmeans_preds
# selected_columns = best_columns # ['frac_1s_hints', 'max_probl_views']
# new_columns, new_kmeans_score, new_kmeans_clusterer, new_kmeans_preds = cols_iterate_kmeans(selected_columns, kmeans_score, kmeans_clusterer, kmeans_preds)
# if new_kmeans_score > kmeans_score:
# print("+++ SUCCESS")
# selected_columns = new_columns
# best_kmeans_score = new_kmeans_score
# best_kmeans_clusterer = new_kmeans_clusterer
# best_kmeans_preds = new_kmeans_preds
# else:
# print("--- GIVE UP")
As expected, the pair ['frac_1s_hints', 'max_probl_views'] still gives the best score.
Now, trying with different clusterers.
MeanShift:
In [22]:
def largest_cluster_fraction(preds):
'''
calculates the fraction of students that are in the largest group
Parameters
==========
preds : list
list of predictions
Returns
=======
fraction : float
largest fraction of students
best_i : integer
number of the largest group
'''
fraction = 0
ll = len(preds)
for i in np.unique(preds):
frac = len(preds[preds == i])/ll
if frac > fraction:
fraction = frac
best_i = i
return fraction, best_i
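# Vectorized alternative to largest_cluster_fraction() (a sketch; hypothetical helper, not used below):
# np.unique with return_counts=True yields every cluster label and its size in one call.
def largest_cluster_fraction_fast(preds):
    labels, counts = np.unique(preds, return_counts=True)
    return counts.max() / len(preds), labels[counts.argmax()]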
# TODO: rewrite this in the same style as the kmeans() procedure
def meanshift(log_scaled_data):
'''
Apply MeanShift clustering algorithm to log_scaled_data
(transformed and scaled by transform_data() function)
The number of clusters is selected via the estimate_bandwidth procedure
with quantiles in np.linspace(0.01, 0.99, 99)
Parameters
==========
log_scaled_data : pandas.DataFrame
data log-transformed and MinMaxScaler()-ed for MeanShift clustering
Returns
=======
best_clusterer : sklearn Model
clustering algorithm with the largest Silhouette Coefficient
best_score : float
the largest value of the Silhouette Coefficient
best_preds_sorted : numpy.array
array with clustering predictions for log_scaled_data
(0 is the largest cluster, 1 is the second largest etc.)
cluster_frac : float
fraction of students inside the largest group
'''
start_time = time.time()
best_score = 0
best_cluster_frac = 0
for alpha in np.linspace(0.01, 0.99, 99):
bandwidth = estimate_bandwidth(log_scaled_data, quantile=alpha, n_samples=None, random_state=0)
clusterer = MeanShift(bandwidth=bandwidth, bin_seeding=True)
preds = clusterer.fit_predict(log_scaled_data)
cluster_frac = largest_cluster_fraction(preds)[0]
# Calculate the mean silhouette coefficient for the number of clusters chosen
try:
score = silhouette_score(log_scaled_data, preds)
except ValueError:
score = 0
print(alpha, clusterer.cluster_centers_.shape[0], score, cluster_frac)
# require cluster_frac < 0.85, roughly the largest-cluster fraction obtained with KMeans for ['frac_1s_hints', 'max_probl_views']
if (best_score < score) and (cluster_frac < 0.85):
best_clusterer = clusterer
best_preds = preds
best_score = score
best_clusters = clusterer.cluster_centers_.shape[0]
best_cluster_frac = cluster_frac
print('*'*68)
print("Our best model has", best_clusters, "clusters and sihlouette is", best_score)
end_time = time.time()
print("Running time is {}s".format(end_time-start_time))
print('>'*68)
best_preds_sorted = replace_group_numbers(best_preds)
cluster_frac = best_cluster_frac
return best_clusterer, best_score, best_preds_sorted, cluster_frac
# Running MeanShift is too slow: it takes about 12 min for a single pair,
# and produces poor results (largest score ~0.55 for reasonable max fractions < 0.85)
# start_time = time.time()
# log_scaled_data = transform_data(best_columns, stud_data)
# best_meanshift_clusterer, best_meanshift_score, best_meanshift_preds, _ = meanshift(log_scaled_data)
# print(best_meanshift_clusterer, best_meanshift_score, best_meanshift_preds)
# end_time = time.time()
# print("Running time is {}s".format(end_time-start_time))
GaussianMixture:
In [23]:
def gaussmix(log_scaled_data): # GaussianMixture
start_time = time.time()
max_score = 0
for n_clusters in range(2,6):
clusterer = GaussianMixture(random_state=0, n_init=50, n_components=n_clusters).fit(log_scaled_data)
preds = clusterer.predict(log_scaled_data)
# Calculate the mean silhouette coefficient for the number of clusters chosen
score = silhouette_score(log_scaled_data, preds)
print("For our model with", clusterer.n_components, "clusters, the sihlouette score is", score)
if max_score < score:
best_clusterer = clusterer
# Predict the cluster for each data point
best_preds = best_clusterer.predict(log_scaled_data)
max_score = score
best_clusters = n_clusters
print('*'*68)
print("Our best model has", best_clusters, "clusters and sihlouette is", max_score)
end_time = time.time()
print("Running time is {}s".format(end_time-start_time))
print('>'*68)
best_preds_sorted = replace_group_numbers(best_preds)
return best_clusterer, max_score, best_preds_sorted
def run_clustering_gaussmix(log_scaled_data):
best_score = 0
print(">>> GaussianMixture:")
clusterer, score, preds = gaussmix(log_scaled_data)
if score > best_score:
best_clusterer = clusterer
best_score = score
best_preds = preds
print("Best clusterer is", best_clusterer)
print("Max score is", best_score)
print("Best preds is", best_preds)
return best_clusterer, best_score, best_preds
# ~0.6 min running time but very small score (~0.15)
# start_time = time.time()
# log_scaled_data = transform_data(best_columns, stud_data)
# gaussmix_best_clusterer, gaussmix_best_score, gaussmix_best_preds = run_clustering_gaussmix(log_scaled_data)
# print(gaussmix_best_clusterer, gaussmix_best_score, gaussmix_best_preds)
# end_time = time.time()
# print("Running time is {}s".format(end_time-start_time))
AgglomerativeClustering:
In [24]:
def agglom(log_scaled_data): # AgglomerativeClustering with 'ward' connectivity
start_time = time.time()
max_score = 0
for n_clusters in range(2,3): # use only 2 clusters
connectivity = kneighbors_graph(log_scaled_data, n_neighbors=100, include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
clusterer = AgglomerativeClustering(n_clusters=n_clusters, \
linkage='ward', \
connectivity=connectivity)
preds = clusterer.fit_predict(log_scaled_data)
# Calculate the mean silhouette coefficient for the number of clusters chosen
score = silhouette_score(log_scaled_data, preds)
print("For our model with", clusterer.n_clusters, "clusters, and the sihlouette score is", score)
if max_score < score:
best_clusterer = clusterer
# Predict the cluster for each data point
best_preds = preds
max_score = score
best_clusters = n_clusters
print('*'*68)
print("Our best model has", best_clusters, "clusters and sihlouette is", max_score)
end_time = time.time()
print("Running time is {}s".format(end_time-start_time))
print('>'*68)
best_preds_sorted = replace_group_numbers(best_preds)
return best_clusterer, max_score, best_preds_sorted
def run_clustering_agglom(log_scaled_data):
best_score = 0
print(">>> AgglomerativeClustering:")
clusterer, score, preds = agglom(log_scaled_data)
if score > best_score:
best_clusterer = clusterer
best_score = score
best_preds = preds
print("Best clusterer is", best_clusterer)
print("Max score is", best_score)
print("Best preds is", best_preds)
return best_clusterer, best_score, best_preds
# Gives results very similar to KMeans but takes about 4 times longer to run
start_time = time.time()
log_scaled_data = transform_data(best_columns, stud_data)
best_agglom_clusterer, best_agglom_score, best_agglom_preds = run_clustering_agglom(log_scaled_data)
print(best_agglom_clusterer, best_agglom_score, best_agglom_preds)
end_time = time.time()
print("Running time is {}s".format(end_time-start_time))
Visualising the AgglomerativeClustering clusters:
In [25]:
best_agglom_preds_mask = preds_to_indices(best_agglom_preds)
log_scaled_data_agglom_0 = log_scaled_data.copy()[~log_scaled_data.index.isin(best_agglom_preds_mask)]
log_scaled_data_agglom_1 = log_scaled_data.copy()[log_scaled_data.index.isin(best_agglom_preds_mask)]
plt.scatter(log_scaled_data_agglom_0['frac_1s_hints'], \
log_scaled_data_agglom_0['max_probl_views'], \
alpha=0.6, s=15, c='lightgreen')
plt.scatter(log_scaled_data_agglom_1['frac_1s_hints'], \
log_scaled_data_agglom_1['max_probl_views'], \
alpha=0.6, s=15, c='grey')
plt.xlim([0.0, 0.6])
plt.ylim([0.0, 0.4])
plt.figtext(x=0.64, y=0.56, s='Group 1', ha='center', size=14, color='black')
plt.figtext(x=0.20, y=0.69, s='Group 0', ha='center', size=14, color='darkgreen')
ax = plt.gca()
ax.set_xlabel('frac_1s_hints', size=14)
ax.set_ylabel('max_probl_views (log-transformed, scaled)', size=14)
plt.plot((0.145, 0.145), (0.001, 0.399), 'k--', c='blue')
plt.show()
I start from group 0, which contains 7,686 students:
In [26]:
best_kmeans_preds_mask = preds_to_indices(best_kmeans_preds)
log_scaled_all_data_kmeans_0 = log_scaled_all_data.copy()[~log_scaled_all_data.index.isin(best_kmeans_preds_mask)]
# In this particular split, keep drop=False to preserve the original index
# (this simplifies recovering the students in Step 2)
log_scaled_all_data_kmeans_0.reset_index(inplace=True, drop=False)
log_scaled_all_data_kmeans_0.index
Out[26]:
In [27]:
start_time = time.time()
# best_kmeans_columns_0, \
# best_kmeans_score_0, \
# best_kmeans_clusterer_0, \
# best_kmeans_preds_0 = choose_pair_columns_kmeans(all_columns, log_scaled_all_data_kmeans_0)
best_kmeans_columns_0 = ['frac_3s_atts', 'max_probl_views']
best_kmeans_clusterer_0, best_kmeans_score_0, best_kmeans_preds_0 = kmeans(log_scaled_all_data_kmeans_0[best_kmeans_columns_0])
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
print("\t>>> Best pair of cols:", best_kmeans_columns_0)
print("\t>>> Best score:", best_kmeans_score_0)
print("\t>>> Best clusterer:", best_kmeans_clusterer_0)
print("\t>>> Best preds:", best_kmeans_preds_0)
In [28]:
print(sum(best_kmeans_preds_0), len(best_kmeans_preds_0), len(best_kmeans_preds_0[best_kmeans_preds_0 == 0]))
In [29]:
log_scaled_all_data_kmeans_0.reset_index(inplace=True, drop=True)
Visualising the obtained clusters:
In [30]:
best_kmeans_preds_mask_0 = preds_to_indices(best_kmeans_preds_0)
log_scaled_all_data_kmeans_00 = log_scaled_all_data_kmeans_0.copy()[~log_scaled_all_data_kmeans_0.index.isin(best_kmeans_preds_mask_0)]
log_scaled_all_data_kmeans_01 = log_scaled_all_data_kmeans_0.copy()[log_scaled_all_data_kmeans_0.index.isin(best_kmeans_preds_mask_0)]
plt.scatter(log_scaled_all_data_kmeans_00[best_kmeans_columns_0[0]], \
log_scaled_all_data_kmeans_00[best_kmeans_columns_0[1]], \
alpha=0.6, s=15, c='lightgreen')
plt.scatter(log_scaled_all_data_kmeans_01[best_kmeans_columns_0[0]], \
log_scaled_all_data_kmeans_01[best_kmeans_columns_0[1]], \
alpha=0.6, s=15, c='grey')
plt.xlim([0.0, 0.6])
plt.ylim([0.0, 0.4])
plt.figtext(x=0.64, y=0.56, s='Group 01', ha='center', size=14, color='black')
plt.figtext(x=0.20, y=0.69, s='Group 00', ha='center', size=14, color='darkgreen')
ax = plt.gca()
ax.set_xlabel(best_kmeans_columns_0[0], size=14)
ax.set_ylabel(best_kmeans_columns_0[1], size=14)
plt.plot((0.13, 0.13), (0.001, 0.499), 'k--', c='blue')
plt.show()
As we see, group 01 contains more students with "gaming" behaviour, so I proceed with group 00:
In [31]:
len(best_kmeans_preds_0)
Out[31]:
In [32]:
#best_kmeans_preds_mask_0 = preds_to_indices(best_kmeans_preds_0) # already implemented during group0 visualisation
log_scaled_all_data_kmeans_00 = log_scaled_all_data_kmeans_0.copy()[~log_scaled_all_data_kmeans_0.index.isin(best_kmeans_preds_mask_0)]
log_scaled_all_data_kmeans_00.reset_index(inplace=True, drop=True)
log_scaled_all_data_kmeans_00.index
Out[32]:
In [33]:
start_time = time.time()
# best_kmeans_columns_00, \
# best_kmeans_score_00, \
# best_kmeans_clusterer_00, \
# best_kmeans_preds_00 = choose_pair_columns_kmeans(all_columns, log_scaled_all_data_kmeans_00)
best_kmeans_columns_00 = ['frac_3s_atts', 'time_hints']
best_kmeans_clusterer_00, \
best_kmeans_score_00, \
best_kmeans_preds_00 = kmeans(log_scaled_all_data_kmeans_00[best_kmeans_columns_00])
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
print("\t>>> Best pair of cols:", best_kmeans_columns_00)
print("\t>>> Best score:", best_kmeans_score_00)
print("\t>>> Best clusterer:", best_kmeans_clusterer_00)
print("\t>>> Best preds:", best_kmeans_preds_00)
In [34]:
print(sum(best_kmeans_preds_00), len(best_kmeans_preds_00), len(best_kmeans_preds_00[best_kmeans_preds_00 == 0]))
In [35]:
best_kmeans_preds_mask_00 = preds_to_indices(best_kmeans_preds_00)
log_scaled_all_data_kmeans_000 = log_scaled_all_data_kmeans_00.copy()[~log_scaled_all_data_kmeans_00.index.isin(best_kmeans_preds_mask_00)]
log_scaled_all_data_kmeans_001 = log_scaled_all_data_kmeans_00.copy()[log_scaled_all_data_kmeans_00.index.isin(best_kmeans_preds_mask_00)]
plt.scatter(log_scaled_all_data_kmeans_000[best_kmeans_columns_00[0]], \
log_scaled_all_data_kmeans_000[best_kmeans_columns_00[1]], \
alpha=0.6, s=15, c='lightgreen')
plt.scatter(log_scaled_all_data_kmeans_001[best_kmeans_columns_00[0]], \
log_scaled_all_data_kmeans_001[best_kmeans_columns_00[1]], \
alpha=0.6, s=15, c='grey')
# plt.xlim([0.0, 0.6])
# plt.ylim([0.0, 0.4])
# plt.figtext(x=0.64, y=0.56, s='Group 01', ha='center', size=14, color='black')
# plt.figtext(x=0.20, y=0.69, s='Group 00', ha='center', size=14, color='darkgreen')
ax = plt.gca()
ax.set_xlabel(best_kmeans_columns_00[0], size=14)
ax.set_ylabel(best_kmeans_columns_00[1], size=14)
#plt.plot((0.13, 0.13), (0.001, 0.499), 'k--', c='blue')
plt.show()
So, there is a subgroup 001 of 1109 students that do not use many hints. What about the rest (000, 6186 students)?
In [36]:
log_scaled_all_data_kmeans_000 = log_scaled_all_data_kmeans_00.copy()[~log_scaled_all_data_kmeans_00.index.isin(best_kmeans_preds_mask_00)]
log_scaled_all_data_kmeans_000.reset_index(inplace=True, drop=True)
log_scaled_all_data_kmeans_000.index
Out[36]:
In [37]:
start_time = time.time()
# best_kmeans_columns_000, \
# best_kmeans_score_000, \
# best_kmeans_clusterer_000, \
# best_kmeans_preds_000 = choose_pair_columns_kmeans(all_columns, log_scaled_all_data_kmeans_000)
best_kmeans_columns_000 = ['num_sess', 'num_probs']
best_kmeans_clusterer_000, \
best_kmeans_score_000, \
best_kmeans_preds_000 = kmeans(log_scaled_all_data_kmeans_000[best_kmeans_columns_000])
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
print("\t>>> Best pair of cols:", best_kmeans_columns_000)
print("\t>>> Best score:", best_kmeans_score_000)
print("\t>>> Best clusterer:", best_kmeans_clusterer_000)
print("\t>>> Best preds:", best_kmeans_preds_000)
In [38]:
print(sum(best_kmeans_preds_000), len(best_kmeans_preds_000), len(best_kmeans_preds_000[best_kmeans_preds_000 == 0]))
In [39]:
best_kmeans_preds_mask_000 = preds_to_indices(best_kmeans_preds_000)
log_scaled_all_data_kmeans_0000 = log_scaled_all_data_kmeans_000.copy()[~log_scaled_all_data_kmeans_000.index.isin(best_kmeans_preds_mask_000)]
log_scaled_all_data_kmeans_0001 = log_scaled_all_data_kmeans_000.copy()[log_scaled_all_data_kmeans_000.index.isin(best_kmeans_preds_mask_000)]
plt.scatter(log_scaled_all_data_kmeans_0000[best_kmeans_columns_000[0]], \
log_scaled_all_data_kmeans_0000[best_kmeans_columns_000[1]], \
alpha=0.6, s=15, c='lightgreen')
plt.scatter(log_scaled_all_data_kmeans_0001[best_kmeans_columns_000[0]], \
log_scaled_all_data_kmeans_0001[best_kmeans_columns_000[1]], \
alpha=0.6, s=15, c='grey')
# plt.figtext(x=0.64, y=0.56, s='Group 01', ha='center', size=14, color='black')
# plt.figtext(x=0.20, y=0.69, s='Group 00', ha='center', size=14, color='darkgreen')
ax = plt.gca()
ax.set_xlabel(best_kmeans_columns_000[0], size=14)
ax.set_ylabel(best_kmeans_columns_000[1], size=14)
#plt.plot((0.13, 0.13), (0.001, 0.499), 'k--', c='blue')
plt.show()
Splitting group 0000 (students with large 'num_sess' and 'num_probs'):
In [40]:
log_scaled_all_data_kmeans_0000 = log_scaled_all_data_kmeans_000.copy()[~log_scaled_all_data_kmeans_000.index.isin(best_kmeans_preds_mask_000)]
log_scaled_all_data_kmeans_0000.reset_index(inplace=True, drop=True)
log_scaled_all_data_kmeans_0000.index
Out[40]:
In [41]:
start_time = time.time()
# best_kmeans_columns_0000, \
# best_kmeans_score_0000, \
# best_kmeans_clusterer_0000, \
# best_kmeans_preds_0000 = choose_pair_columns_kmeans(all_columns, log_scaled_all_data_kmeans_0000)
best_kmeans_columns_0000 = ['num_sess', 'num_probs']
best_kmeans_clusterer_0000, \
best_kmeans_score_0000, \
best_kmeans_preds_0000 = kmeans(log_scaled_all_data_kmeans_0000[best_kmeans_columns_0000])
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
print("\t>>> Best pair of cols:", best_kmeans_columns_0000)
print("\t>>> Best score:", best_kmeans_score_0000)
print("\t>>> Best clusterer:", best_kmeans_clusterer_0000)
print("\t>>> Best preds:", best_kmeans_preds_0000)
In [42]:
print(sum(best_kmeans_preds_0000), \
len(best_kmeans_preds_0000), \
len(best_kmeans_preds_0000[best_kmeans_preds_0000 == 0]))
In [43]:
best_kmeans_preds_mask_0000 = preds_to_indices(best_kmeans_preds_0000)
log_scaled_all_data_kmeans_00000 = log_scaled_all_data_kmeans_0000.copy()[~log_scaled_all_data_kmeans_0000.index.isin(best_kmeans_preds_mask_0000)]
log_scaled_all_data_kmeans_00001 = log_scaled_all_data_kmeans_0000.copy()[log_scaled_all_data_kmeans_0000.index.isin(best_kmeans_preds_mask_0000)]
plt.scatter(log_scaled_all_data_kmeans_00000[best_kmeans_columns_0000[0]], \
log_scaled_all_data_kmeans_00000[best_kmeans_columns_0000[1]], \
alpha=0.6, s=15, c='lightgreen')
plt.scatter(log_scaled_all_data_kmeans_00001[best_kmeans_columns_0000[0]], \
log_scaled_all_data_kmeans_00001[best_kmeans_columns_0000[1]], \
alpha=0.6, s=15, c='grey')
# plt.xlim([0.0, 0.6])
# plt.ylim([0.0, 0.4])
# plt.figtext(x=0.64, y=0.56, s='Group 01', ha='center', size=14, color='black')
# plt.figtext(x=0.20, y=0.69, s='Group 00', ha='center', size=14, color='darkgreen')
ax = plt.gca()
ax.set_xlabel(best_kmeans_columns_0000[0], size=14)
ax.set_ylabel(best_kmeans_columns_0000[1], size=14)
#plt.plot((0.13, 0.13), (0.001, 0.499), 'k--', c='blue')
plt.show()
As we see, these two groups represent students with "intermediate experience" (00000) and "largest experience" (00001).
During Step 1, I split the 8,980 ASSISTments students into 6 different groups:
1. large 'frac_1s_hints' ("gaming" behaviour);
2. smaller 'frac_1s_hints' but large 'frac_3s_atts' ("gaming" behaviour);
3. small 'time_hints' ("non-gaming" behaviour, small usage of hints);
4. small 'num_sess' and 'num_probs' ("non-gaming" behaviour, large usage of hints, small experience);
5. medium 'num_sess' and 'num_probs' ("non-gaming" behaviour, large usage of hints, medium experience);
6. large 'num_sess' and 'num_probs' ("non-gaming" behaviour, large usage of hints, large experience).
The final result of this step is the joint cluster index, which assigns a number from 1 to 6 to each student:
In [44]:
group1_index = np.array(log_scaled_all_data_kmeans_1.index)
len(group1_index)
Out[44]:
In [45]:
group2_index = np.array(log_scaled_all_data_kmeans_01['index'])
len(group2_index)
Out[45]:
In [46]:
group3_index = np.array(log_scaled_all_data_kmeans_001['index'])
len(group3_index)
Out[46]:
In [47]:
group4_index = np.array(log_scaled_all_data_kmeans_0001['index'])
len(group4_index)
Out[47]:
In [48]:
group5_index = np.array(log_scaled_all_data_kmeans_00000['index'])
len(group5_index)
Out[48]:
In [49]:
group6_index = np.array(log_scaled_all_data_kmeans_00001['index'])
len(group6_index)
Out[49]:
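Before building the joint index, a quick sanity check (a sketch): the six index arrays should be disjoint and together cover all students in stud_data:

all_idx = np.concatenate([group1_index, group2_index, group3_index,
                          group4_index, group5_index, group6_index])
assert len(all_idx) == len(np.unique(all_idx)) == len(stud_data)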
In [50]:
def create_joint_cluster_index():
'''
Saves the joint group index to cluster_index.csv for further analysis
'''
cluster_index_lst = []
for i in range(len(stud_data)):
if i in group1_index:
cluster_index_lst.append(1)
elif i in group2_index:
cluster_index_lst.append(2)
elif i in group3_index:
cluster_index_lst.append(3)
elif i in group4_index:
cluster_index_lst.append(4)
elif i in group5_index:
cluster_index_lst.append(5)
elif i in group6_index:
cluster_index_lst.append(6)
print(Counter(cluster_index_lst))
cluster_index = pd.Series(cluster_index_lst, dtype=int)
cluster_index.to_csv('cluster_index.csv')
return
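The membership tests above scan every group array for each student. An equivalent vectorized construction (a sketch; create_joint_cluster_index_fast is a hypothetical name, and it assumes the six groups partition all students, as checked above) assigns each group number to its index positions at once:

def create_joint_cluster_index_fast():
    # Vectorized variant of create_joint_cluster_index(): write each group's
    # number into the positions listed in its index array.
    cluster_index = pd.Series(0, index=range(len(stud_data)), dtype=int)
    group_indices = [group1_index, group2_index, group3_index,
                     group4_index, group5_index, group6_index]
    for group_number, group_index in enumerate(group_indices, start=1):
        cluster_index.iloc[group_index] = group_number
    print(Counter(cluster_index))
    cluster_index.to_csv('cluster_index.csv')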
In [51]:
create_joint_cluster_index()
In [52]:
! ls -lh cluster_index.csv
In [ ]: