In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import time
import gzip
import shutil
import seaborn as sns
from collections import Counter
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, AgglomerativeClustering
from sklearn.metrics import silhouette_score #, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import kneighbors_graph
Do some preprocessing: group the data by 'Anon Student Id' and extract per-student features for further analysis
In [2]:
def hdf_fixed_write_compress(df):
df.to_hdf('data1-step1.hdf','test',mode='w',complib='blosc')
return
def hdf_fixed_read_compress():
df = pd.read_hdf('data.hdf','test')
return df
In [3]:
with gzip.open('data1.hdf.gz', 'rb') as f_in, open('data.hdf', 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
!ls -lh data.hdf
data = hdf_fixed_read_compress()
data.head()
Out[3]:
Note to reviewers: this algorithm is quite slow (~45 minutes), so you may prefer to process only a subset of the data (e.g. the first 500,000 rows take only ~1 minute).
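The per-student loop below re-filters the full frame for every student, which is the main reason it is slow. As a rough alternative, a vectorized pandas groupby could compute several of the simple count features in one pass. A minimal sketch, assuming the same column names used in the function below ('Day', 'Session Id', 'Problem Name', 'Student Response Type'); the helper columns is_att / is_hint are introduced only for this example:

resp = data['Student Response Type']
counts = (data.assign(is_att=(resp == 0), is_hint=(resp == 1))
              .groupby('Anon Student Id')
              .agg(num_days=('Day', 'nunique'),          # distinct days in the system
                   num_sess=('Session Id', 'nunique'),   # distinct sessions opened
                   num_probs=('Problem Name', 'nunique'),# distinct problems entered
                   num_atts=('is_att', 'sum'),           # number of attempts
                   num_hints=('is_hint', 'sum')))        # number of hint requests

This only covers the count features; the fraction and duration features would still need extra aggregations.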
In [4]:
def prepare_stud_data_new(df):
start_time = time.time()
stud_list = df['Anon Student Id'].unique()
cols=['num_sess', \
'num_days', \
'num_probs', \
'num_atts', \
'num_hints', \
'frac_corr_atts', \
'frac_3s_atts', \
'frac_1s_hints', \
'time_atts', \
'time_hints', \
'max_probl_views', \
'max_atts']
numbers = []
#stud_data = pd.DataFrame(columns=cols)
stud_info_df = pd.DataFrame()
i = 0
for stud_name in stud_list:
stud_info_df = df[df['Anon Student Id'] == stud_name].copy()
# total number of days on which the student used the system
num_days = len(set(stud_info_df['Day']))
# total number of sessions opened
num_sessions = len(set(stud_info_df['Session Id']))
# total number of problems entered
num_problems = len(set(stud_info_df['Problem Name']))
# total number of attempts made by the student
num_attempts = stud_info_df[stud_info_df['Student Response Type'] == 0].shape[0]
# total number of hints requested by the student
num_hints = stud_info_df[stud_info_df['Student Response Type'] == 1].shape[0]
# fraction of short attempts (with time <= 3 sec)
if (num_attempts > 0):
frac_3s_atts = stud_info_df[(stud_info_df['Student Response Type'] == 0) & (stud_info_df['Duration (sec)'] <= 3.0)].shape[0] / num_attempts
else:
frac_3s_atts = 0
# fraction of short hints (with time <= 1 sec)
if (num_hints > 0):
frac_1s_hints = stud_info_df[(stud_info_df['Student Response Type'] == 1) & (stud_info_df['Duration (sec)'] <= 1.0)].shape[0] / num_hints
else:
frac_1s_hints = 0
# fraction of correct attempts
if (num_attempts > 0):
fraction_correct_attempts = stud_info_df[(stud_info_df['Student Response Type'] == 0) & (stud_info_df['Outcome'] == 0)].shape[0] / num_attempts
else:
fraction_correct_attempts = 0
# total time spent on attempts (in seconds)
total_time_attempts = stud_info_df[stud_info_df['Student Response Type'] == 0]['Duration (sec)'].sum()
# total time spent on hints (in seconds)
total_time_hints = stud_info_df[stud_info_df['Student Response Type'] == 1]['Duration (sec)'].sum()
# average of the per-problem maximum 'Problem View'
avg_max_problem_views = stud_info_df[['Problem Name', 'Problem View']].groupby(['Problem Name']).agg(np.max).mean()[0]
# average of the per-problem maximum number of attempts ('x')
avg_max_attempts = stud_info_df[['Problem Name', 'x']].groupby(['Problem Name']).agg(np.max).mean()[0]
stud_name = i # assign unique numerical ID to each student
if num_attempts != 0:
avg_time_att = total_time_attempts / num_attempts
else:
avg_time_att = 0
if num_hints != 0:
avg_time_hint = total_time_hints / num_hints
else:
avg_time_hint = 0
numbers.append([num_sessions, \
num_days, \
num_problems, \
num_attempts, \
num_hints, \
fraction_correct_attempts, \
frac_3s_atts, \
frac_1s_hints, \
total_time_attempts, \
total_time_hints, \
avg_max_problem_views, \
avg_max_attempts])
print("\r\t>>> Progress\t:{:.4%}".format((i + 1)/len(stud_list)), end='')
i += 1
stud_data = pd.DataFrame(data=numbers, columns=cols)
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
return stud_data
Instead of recomputing from scratch, read the precomputed stud_data:
In [5]:
#stud_data = prepare_stud_data_new(data.head(500000).copy())
#stud_data = prepare_stud_data_new(data.copy())
stud_data = pd.read_hdf('stud_data.hdf','test')
Making a backup of stud_data in HDF5 format:
In [6]:
#stud_data.to_hdf('stud_data.hdf','test',mode='w',complib='blosc')
In [7]:
stud_data.shape
Out[7]:
In [8]:
stud_data.describe()
Out[8]:
Our baseline cluster model should show 2 clusters: one with "gaming" and one with "non-gaming" behaviour.
In [9]:
def calculate_gaming_score():
i = 0
start_time = time.time()
best_score = -1 # minimal possible silhouette_score
best_atts_threshold = 0.001
best_hints_threshold = 0.001
n_bins = 50
for atts_threshold in np.linspace(0.001, 0.999, n_bins):
for hints_threshold in np.linspace(0.001, 0.999, n_bins):
#print(atts_threshold, hints_threshold)
gaming_preds = np.array((stud_data['frac_3s_atts'] > atts_threshold) | (stud_data['frac_1s_hints'] > hints_threshold), dtype=int)
gaming_score = silhouette_score(log_scaled_data, gaming_preds)
if best_score < gaming_score:
best_score = gaming_score
best_atts_threshold = atts_threshold
best_hints_threshold = hints_threshold
print("\r\t>>> Progress\t:{:.4%}".format((i + 1)/(n_bins**2)), end='')
i += 1
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
return best_score, best_atts_threshold, best_hints_threshold
#calculate_gaming_score()
print("The calculate_gaming_score() runs ~45 minutes, so I give the final score:\n \
(0.4473167950112576, 0.89716326530612245, 0.73422448979591837)")
#gaming_preds = np.array((stud_data['frac_3s_atts'] > 0.2) | (stud_data['frac_1s_hints'] > 0.2), dtype=int)
#gaming_score = silhouette_score(log_scaled_data, gaming_preds)
#print(gaming_score)
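For reference, the benchmark labels and their silhouette can be reproduced from the cached thresholds without rerunning the 45-minute grid search. A minimal sketch; it assumes log_scaled_data is the scaled feature DataFrame that calculate_gaming_score() also relies on (produced by the transform_data() function defined further below):

atts_thr, hints_thr = 0.89716326530612245, 0.73422448979591837
gaming_preds = np.array((stud_data['frac_3s_atts'] > atts_thr) |
                        (stud_data['frac_1s_hints'] > hints_thr), dtype=int)
print(silhouette_score(log_scaled_data, gaming_preds))  # expected ~0.447, as quoted above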
However, the benchmark model chooses only 14 students (out of 8,980) for a "gaming" cluster:
In [10]:
stud_data[(stud_data['frac_3s_atts'] > 0.89716326530612245) | (stud_data['frac_1s_hints'] > 0.73422448979591837)].shape[0]
Out[10]:
Now, write a new clustering algorithm:
In [11]:
# old name: process_data
def transform_data(selected_columns, data):
'''
Apply a log-transform to the selected columns that are not fractions (frac_*), then MinMaxScaler() to all selected columns
Parameters
==========
selected_columns : list
list of columns to leave in processed data
data : pandas.DataFrame
data to process (note that data should contain all selected_columns)
Returns
=======
log_scaled_data : pandas.DataFrame
log-transformed and scaled data selected by selected_columns
'''
data.reset_index(drop=True, inplace=True)
log_data = data[selected_columns].copy()
skewed = log_data.columns.tolist()
skewed = [item for item in skewed if not item.startswith('frac_')]
log_data[skewed] = log_data[skewed].apply(lambda x: np.log10(x + 1))
scaler = MinMaxScaler().fit(log_data)
log_scaled_data = scaler.transform(log_data)
log_scaled_data = pd.DataFrame(log_scaled_data, columns=log_data.columns)
return log_scaled_data
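A minimal usage sketch: transform two columns of stud_data and check that both end up scaled to [0, 1]:

example = transform_data(['frac_1s_hints', 'max_probl_views'], stud_data)
print(example.describe().loc[['min', 'max']])  # both columns should lie in [0, 1]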
In [12]:
def replace_group_numbers(best_preds):
'''
Replace group numbers in best_preds with sorting by group size
(so that the largest group is 0, the second largest is 1 etc.)
Parameters
==========
best_preds : numpy array
unsorted array of predictions
Returns
=======
best_preds_sorted : numpy array
sorted array of predictions
'''
pp = pd.DataFrame(best_preds, columns = ["old_group"])
dict_pp = {item[0]: i for i, item in enumerate(Counter(best_preds).most_common())}
pp['new_group'] = pp['old_group'].replace(dict_pp)
best_preds_sorted = np.array(pp['new_group'])
return best_preds_sorted
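A small worked example: the most frequent label becomes 0, the next most frequent becomes 1, and so on:

print(replace_group_numbers(np.array([2, 0, 0, 2, 2, 1])))
# label 2 occurs three times -> 0, label 0 twice -> 1, label 1 once -> 2
# expected output: [0 1 1 0 0 2]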
In [13]:
def kmeans(log_scaled_data):
'''
Apply the KMeans clustering algorithm with 2 <= n_clusters <= 5 to log_scaled_data
(transformed and scaled by transform_data() function)
Parameters
==========
log_scaled_data : pandas.DataFrame
data log-transformed and MinMaxScaler()-ed for KMeans clustering
Returns
=======
best_clusterer : sklearn Model
clustering algorithm with the largest Silhouette Coefficient
best_score : float
the largest value of the Silhouette Coefficient
best_preds_sorted : numpy.array
array with clustering predictions for log_scaled_data
(0 is the largest cluster, 1 is the second largest etc.)
'''
best_score = 0
for n_clusters in range(2,6):
clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
clusterer.fit(log_scaled_data)
preds = clusterer.predict(log_scaled_data)
# Calculate the mean silhouette coefficient for the number of clusters chosen
score = silhouette_score(log_scaled_data, preds)
if best_score < score:
best_clusterer = clusterer
# Predict the cluster for each data point
best_preds = best_clusterer.predict(log_scaled_data)
best_score = score
best_clusters = n_clusters
best_preds_sorted = replace_group_numbers(best_preds)
return best_clusterer, best_score, best_preds_sorted
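Because the column search below calls kmeans() for every pair of features, most of the runtime goes into silhouette_score over all ~9,000 students. One way to speed up the search (a sketch, trading exactness for speed via scikit-learn's sample_size argument; fast_silhouette is a hypothetical helper, not used below) is to score on a random subsample and only re-score the winning pair on the full data:

def fast_silhouette(X, labels, n=2000):
    # approximate silhouette coefficient computed on a random subsample of n points
    return silhouette_score(X, labels, sample_size=min(n, len(X)), random_state=0)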
Choose the pair of columns with the best score:
In [14]:
all_columns = ['num_sess', 'num_days', 'num_probs', 'num_atts', 'num_hints', 'frac_corr_atts', \
'frac_3s_atts', 'frac_1s_hints', 'time_atts', 'time_hints', 'max_probl_views', 'max_atts']
In [15]:
def choose_pair_columns_kmeans(all_columns, log_scaled_all_data):
'''
Selects pair of columns in data that produces clusters with the largest score.
In this function, only KMeans clustering algorithm is used
Parameters
==========
all_columns : list
list of columns to look for the pair with the largest score
log_scaled_all_data : pandas DataFrame
properly scaled DataFrame with all columns
Returns
=======
best_columns : list
pair of data columns with the largest score
best_score : float
the largest value of the score
best_clusterer : sklearn Model
clustering algorithm with the largest score
best_preds : numpy.array
array with clustering predictions for log_scaled_data
(0 is the largest cluster, 1 is the second largest etc.)
'''
best_score = 0
best_columns = []
j = 0
l = len(all_columns)
num_pairs = (l-1)*l/2
for column in all_columns:
selected_columns = [column]
columns_to_add = [a for a in all_columns if (a not in selected_columns)]
for column1 in columns_to_add:
if all_columns.index(column) < all_columns.index(column1):
selected_columns = [column, column1]
print("\r\t>>> Progress\t:{:.4%}".format((j+1)/num_pairs), end='')
j += 1
#log_scaled_data = transform_data(selected_columns, stud_data)
clusterer, score, preds = kmeans(log_scaled_all_data[selected_columns])
if score > best_score:
best_score = score
best_clusterer = clusterer
best_preds = preds
best_columns = selected_columns.copy()
return best_columns, best_score, best_clusterer, best_preds
In [16]:
start_time = time.time()
# consider skipping the commented-out search below because it takes some time (~8.5 minutes)
log_scaled_all_data = transform_data(all_columns, stud_data)
#best_columns, best_kmeans_score, best_kmeans_clusterer, best_kmeans_preds = choose_pair_columns_kmeans(all_columns, log_scaled_all_data)
# Instead, run KMeans once on the known best pair (~6 seconds)
best_columns = ['frac_1s_hints', 'max_probl_views']
best_kmeans_clusterer, best_kmeans_score, best_kmeans_preds = kmeans(log_scaled_all_data[best_columns])
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
print("\t>>> Best pair of cols:", best_columns)
print("\t>>> Best score:", best_kmeans_score)
print("\t>>> Best clusterer:", best_kmeans_clusterer)
print("\t>>> Best preds:", best_kmeans_preds)
In [17]:
def preds_to_indices(preds): # takes a 0/1 prediction array and returns the indices of the 1s
new_list = []
for i, val in enumerate(preds):
if val == 1:
new_list.append(i)
return np.array(new_list)
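For reference, the same mask can be produced in one line with NumPy. A sketch (preds_to_indices_np is a hypothetical name; np.flatnonzero returns the indices of the non-zero entries):

def preds_to_indices_np(preds):
    # one-line equivalent of preds_to_indices() for 0/1 predictions
    return np.flatnonzero(np.asarray(preds) == 1)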
Visualising the KMeans clusters:
In [18]:
log_scaled_all_data.describe()
Out[18]:
In [19]:
best_kmeans_preds_mask = preds_to_indices(best_kmeans_preds)
log_scaled_all_data_kmeans_0 = log_scaled_all_data.copy()[~log_scaled_all_data.index.isin(best_kmeans_preds_mask)]
log_scaled_all_data_kmeans_1 = log_scaled_all_data.copy()[log_scaled_all_data.index.isin(best_kmeans_preds_mask)]
plt.scatter(log_scaled_all_data_kmeans_0['frac_1s_hints'], \
log_scaled_all_data_kmeans_0['max_probl_views'], \
alpha=0.6, s=15, c='lightgreen')
plt.scatter(log_scaled_all_data_kmeans_1['frac_1s_hints'], \
log_scaled_all_data_kmeans_1['max_probl_views'], \
alpha=0.6, s=15, c='grey')
plt.xlim([0.0, 0.6])
plt.ylim([0.0, 0.4])
plt.figtext(x=0.64, y=0.56, s='Group 1', ha='center', size=14, color='black')
plt.figtext(x=0.20, y=0.69, s='Group 0', ha='center', size=14, color='darkgreen')
ax = plt.gca()
ax.set_xlabel('frac_1s_hints', size=14)
ax.set_ylabel('max_probl_views (log-transformed, scaled)', size=14)
plt.plot((0.14, 0.14), (0.001, 0.399), 'k--', c='blue')
plt.show()
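An equivalent, more compact way to colour the same scatter directly by cluster label (a sketch; it reuses log_scaled_all_data and best_kmeans_preds from the cells above and matplotlib's c=/cmap= colouring):

plt.scatter(log_scaled_all_data['frac_1s_hints'],
            log_scaled_all_data['max_probl_views'],
            c=best_kmeans_preds, cmap='Set2', alpha=0.6, s=15)
plt.xlabel('frac_1s_hints', size=14)
plt.ylabel('max_probl_views (log-transformed, scaled)', size=14)
plt.show()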
Then, consider adding one more column to further increase the score:
In [20]:
def cols_iterate_kmeans(selected_columns, best_score, best_clusterer, best_preds):
all_columns = ['num_sess', 'num_days', 'num_probs', 'num_atts', \
'num_hints', 'frac_corr_atts', 'frac_3s_atts', 'frac_1s_hints', \
'time_atts', 'time_hints', 'max_probl_views', 'max_atts']
columns_to_add = [a for a in all_columns if (a not in selected_columns)]
#print(columns_to_add)
for column in columns_to_add:
print("*"*40)
print("*** Trying to add column", column)
print("*"*40)
selected_columns.append(column)
log_scaled_data = transform_data(selected_columns, stud_data)
clusterer, score, preds = kmeans(log_scaled_data)
if score > best_score:
print("!!! Success !!!")
best_score = score
best_clusterer = clusterer
best_preds = preds
print("!!! New score is", best_score)
print("!!! New best clusterer is", best_clusterer)
print("!!! New best selected_columns are", selected_columns)
columns_to_add.remove(column)
else:
print("!!! Last score is equal or worse then our best one")
print("!!! According to Occam's razor, remove the column", column)
selected_columns.remove(column)
print("!!! Still the best selected columns are", selected_columns)
return selected_columns, best_score, best_clusterer, best_preds
In [21]:
# Just skip this step, it does not give new results:
# kmeans_clusterer = best_kmeans_clusterer
# kmeans_score = best_kmeans_score
# kmeans_preds = best_kmeans_preds
# selected_columns = best_columns # ['frac_1s_hints', 'max_probl_views']
# new_columns, new_kmeans_score, new_kmeans_clusterer, new_kmeans_preds = cols_iterate_kmeans(selected_columns, kmeans_score, kmeans_clusterer, kmeans_preds)
# if new_kmeans_score > kmeans_score:
# print("+++ SUCCESS")
# selected_columns = new_columns
# best_kmeans_score = new_kmeans_score
# best_kmeans_clusterer = new_kmeans_clusterer
# best_kmeans_preds = new_kmeans_preds
# else:
# print("--- GIVE UP")
As expected, the pair ['frac_1s_hints', 'max_probl_views'] still gives the best score.
Now, trying with different clusterers.
MeanShift:
In [22]:
def largest_cluster_fraction(preds):
'''
calculates the fraction of students that are in the largest group
Parameters
==========
preds : list
list of predictions
Returns
=======
fraction : float
largest fraction of students
best_i : integer
number of the largest group
'''
fraction = 0
ll = len(preds)
for i in np.unique(preds):
frac = len(preds[preds == i])/ll
if frac > fraction:
fraction = frac
best_i = i
return fraction, best_i
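# Vectorized alternative to largest_cluster_fraction() (a sketch; hypothetical helper, not used below):
# np.unique with return_counts=True yields every cluster label and its size in one call.
def largest_cluster_fraction_fast(preds):
    labels, counts = np.unique(preds, return_counts=True)
    return counts.max() / len(preds), labels[counts.argmax()]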
# TODO: rewrite this in the same style as the kmeans() procedure
def meanshift(log_scaled_data):
'''
Apply MeanShift clustering algorithm to log_scaled_data
(transformed and scaled by transform_data() function)
The number of clusters is selected via the estimate_bandwidth procedure
with quantiles in np.linspace(0.01, 0.99, 99)
Parameters
==========
log_scaled_data : pandas.DataFrame
data log-transformed and MinMaxScaler()-ed for MeanShift clustering
Returns
=======
best_clusterer : sklearn Model
clustering algorithm with the largest Silhouette Coefficient
best_score : float
the largest value of the Silhouette Coefficient
best_preds_sorted : numpy.array
array with clustering predictions for log_scaled_data
(0 is the largest cluster, 1 is the second largest etc.)
cluster_frac : float
fraction of students inside the largest group
'''
start_time = time.time()
best_score = 0
best_cluster_frac = 0
for alpha in np.linspace(0.01, 0.99, 99):
bandwidth = estimate_bandwidth(log_scaled_data, quantile=alpha, n_samples=None, random_state=0)
clusterer = MeanShift(bandwidth=bandwidth, bin_seeding=True)
preds = clusterer.fit_predict(log_scaled_data)
cluster_frac = largest_cluster_fraction(preds)[0]
# Calculate the mean silhouette coefficient for the number of clusters chosen
try:
score = silhouette_score(log_scaled_data, preds)
except ValueError:
score = 0
print(alpha, clusterer.cluster_centers_.shape[0], score, cluster_frac)
# require cluster_frac < 0.85, roughly the largest-cluster fraction obtained with KMeans for ['frac_1s_hints', 'max_probl_views']
if (best_score < score) and (cluster_frac < 0.85):
best_clusterer = clusterer
best_preds = preds
best_score = score
best_clusters = clusterer.cluster_centers_.shape[0]
best_cluster_frac = cluster_frac
print('*'*68)
print("Our best model has", best_clusters, "clusters and sihlouette is", best_score)
end_time = time.time()
print("Running time is {}s".format(end_time-start_time))
print('>'*68)
best_preds_sorted = replace_group_numbers(best_preds)
cluster_frac = best_cluster_frac
return best_clusterer, best_score, best_preds_sorted, cluster_frac
# Running MeanShift is too slow: it takes about 12 min for a single pair,
# and produces poor results (largest score ~0.55 for reasonable max fractions < 0.85)
# start_time = time.time()
# log_scaled_data = transform_data(best_columns, stud_data)
# best_meanshift_clusterer, best_meanshift_score, best_meanshift_preds, _ = meanshift(log_scaled_data)
# print(best_meanshift_clusterer, best_meanshift_score, best_meanshift_preds)
# end_time = time.time()
# print("Running time is {}s".format(end_time-start_time))
GaussianMixture:
In [23]:
def gaussmix(log_scaled_data): # GaussianMixture
start_time = time.time()
max_score = 0
for n_clusters in range(2,6):
clusterer = GaussianMixture(random_state=0, n_init=50, n_components=n_clusters).fit(log_scaled_data)
preds = clusterer.predict(log_scaled_data)
# Calculate the mean silhouette coefficient for the number of clusters chosen
score = silhouette_score(log_scaled_data, preds)
print("For our model with", clusterer.n_components, "clusters, the sihlouette score is", score)
if max_score < score:
best_clusterer = clusterer
# Predict the cluster for each data point
best_preds = best_clusterer.predict(log_scaled_data)
max_score = score
best_clusters = n_clusters
print('*'*68)
print("Our best model has", best_clusters, "clusters and sihlouette is", max_score)
end_time = time.time()
print("Running time is {}s".format(end_time-start_time))
print('>'*68)
best_preds_sorted = replace_group_numbers(best_preds)
return best_clusterer, max_score, best_preds_sorted
def run_clustering_gaussmix(log_scaled_data):
best_score = 0
print(">>> GaussianMixture:")
clusterer, score, preds = gaussmix(log_scaled_data)
if score > best_score:
best_clusterer = clusterer
best_score = score
best_preds = preds
print("Best clusterer is", best_clusterer)
print("Max score is", best_score)
print("Best preds is", best_preds)
return best_clusterer, best_score, best_preds
# ~0.6 min running time but very small score (~0.15)
# start_time = time.time()
# log_scaled_data = transform_data(best_columns, stud_data)
# gaussmix_best_clusterer, gaussmix_best_score, gaussmix_best_preds = run_clustering_gaussmix(log_scaled_data)
# print(gaussmix_best_clusterer, gaussmix_best_score, gaussmix_best_preds)
# end_time = time.time()
# print("Running time is {}s".format(end_time-start_time))
AgglomerativeClustering:
In [24]:
def agglom(log_scaled_data): # AgglomerativeClustering with 'ward' connectivity
start_time = time.time()
max_score = 0
for n_clusters in range(2,3): # use only 2 clusters
connectivity = kneighbors_graph(log_scaled_data, n_neighbors=100, include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
clusterer = AgglomerativeClustering(n_clusters=n_clusters, \
linkage='ward', \
connectivity=connectivity)
preds = clusterer.fit_predict(log_scaled_data)
# Calculate the mean silhouette coefficient for the number of clusters chosen
score = silhouette_score(log_scaled_data, preds)
print("For our model with", clusterer.n_clusters, "clusters, and the sihlouette score is", score)
if max_score < score:
best_clusterer = clusterer
# Predict the cluster for each data point
best_preds = preds
max_score = score
best_clusters = n_clusters
print('*'*68)
print("Our best model has", best_clusters, "clusters and sihlouette is", max_score)
end_time = time.time()
print("Running time is {}s".format(end_time-start_time))
print('>'*68)
best_preds_sorted = replace_group_numbers(best_preds)
return best_clusterer, max_score, best_preds_sorted
def run_clustering_agglom(log_scaled_data):
best_score = 0
print(">>> AgglomerativeClustering:")
clusterer, score, preds = agglom(log_scaled_data)
if score > best_score:
best_clusterer = clusterer
best_score = score
best_preds = preds
print("Best clusterer is", best_clusterer)
print("Max score is", best_score)
print("Best preds is", best_preds)
return best_clusterer, best_score, best_preds
# Gives results very similar to KMeans but takes about 4 times longer to run
start_time = time.time()
log_scaled_data = transform_data(best_columns, stud_data)
best_agglom_clusterer, best_agglom_score, best_agglom_preds = run_clustering_agglom(log_scaled_data)
print(best_agglom_clusterer, best_agglom_score, best_agglom_preds)
end_time = time.time()
print("Running time is {}s".format(end_time-start_time))
Visualising the AgglomerativeClustering clusters:
In [25]:
best_agglom_preds_mask = preds_to_indices(best_agglom_preds)
log_scaled_data_agglom_0 = log_scaled_data.copy()[~log_scaled_data.index.isin(best_agglom_preds_mask)]
log_scaled_data_agglom_1 = log_scaled_data.copy()[log_scaled_data.index.isin(best_agglom_preds_mask)]
plt.scatter(log_scaled_data_agglom_0['frac_1s_hints'], \
log_scaled_data_agglom_0['max_probl_views'], \
alpha=0.6, s=15, c='lightgreen')
plt.scatter(log_scaled_data_agglom_1['frac_1s_hints'], \
log_scaled_data_agglom_1['max_probl_views'], \
alpha=0.6, s=15, c='grey')
plt.xlim([0.0, 0.6])
plt.ylim([0.0, 0.4])
plt.figtext(x=0.64, y=0.56, s='Group 1', ha='center', size=14, color='black')
plt.figtext(x=0.20, y=0.69, s='Group 0', ha='center', size=14, color='darkgreen')
ax = plt.gca()
ax.set_xlabel('frac_1s_hints', size=14)
ax.set_ylabel('max_probl_views (log-transformed, scaled)', size=14)
plt.plot((0.145, 0.145), (0.001, 0.399), 'k--', c='blue')
plt.show()
I start from group 0, which contains 7,686 students:
In [26]:
best_kmeans_preds_mask = preds_to_indices(best_kmeans_preds)
log_scaled_all_data_kmeans_0 = log_scaled_all_data.copy()[~log_scaled_all_data.index.isin(best_kmeans_preds_mask)]
# In this particular split, keep drop=False to preserve the original index
# (this simplifies recovering the students in Step 2)
log_scaled_all_data_kmeans_0.reset_index(inplace=True, drop=False)
log_scaled_all_data_kmeans_0.index
Out[26]:
In [27]:
start_time = time.time()
# best_kmeans_columns_0, \
# best_kmeans_score_0, \
# best_kmeans_clusterer_0, \
# best_kmeans_preds_0 = choose_pair_columns_kmeans(all_columns, log_scaled_all_data_kmeans_0)
best_kmeans_columns_0 = ['frac_3s_atts', 'max_probl_views']
best_kmeans_clusterer_0, best_kmeans_score_0, best_kmeans_preds_0 = kmeans(log_scaled_all_data_kmeans_0[best_kmeans_columns_0])
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
print("\t>>> Best pair of cols:", best_kmeans_columns_0)
print("\t>>> Best score:", best_kmeans_score_0)
print("\t>>> Best clusterer:", best_kmeans_clusterer_0)
print("\t>>> Best preds:", best_kmeans_preds_0)
In [28]:
print(sum(best_kmeans_preds_0), len(best_kmeans_preds_0), len(best_kmeans_preds_0[best_kmeans_preds_0 == 0]))
In [29]:
log_scaled_all_data_kmeans_0.reset_index(inplace=True, drop=True)
Visualising the obtained clusters:
In [30]:
best_kmeans_preds_mask_0 = preds_to_indices(best_kmeans_preds_0)
log_scaled_all_data_kmeans_00 = log_scaled_all_data_kmeans_0.copy()[~log_scaled_all_data_kmeans_0.index.isin(best_kmeans_preds_mask_0)]
log_scaled_all_data_kmeans_01 = log_scaled_all_data_kmeans_0.copy()[log_scaled_all_data_kmeans_0.index.isin(best_kmeans_preds_mask_0)]
plt.scatter(log_scaled_all_data_kmeans_00[best_kmeans_columns_0[0]], \
log_scaled_all_data_kmeans_00[best_kmeans_columns_0[1]], \
alpha=0.6, s=15, c='lightgreen')
plt.scatter(log_scaled_all_data_kmeans_01[best_kmeans_columns_0[0]], \
log_scaled_all_data_kmeans_01[best_kmeans_columns_0[1]], \
alpha=0.6, s=15, c='grey')
plt.xlim([0.0, 0.6])
plt.ylim([0.0, 0.4])
plt.figtext(x=0.64, y=0.56, s='Group 01', ha='center', size=14, color='black')
plt.figtext(x=0.20, y=0.69, s='Group 00', ha='center', size=14, color='darkgreen')
ax = plt.gca()
ax.set_xlabel(best_kmeans_columns_0[0], size=14)
ax.set_ylabel(best_kmeans_columns_0[1], size=14)
plt.plot((0.13, 0.13), (0.001, 0.499), 'k--', c='blue')
plt.show()
As we see, group 01 contains more students with "gaming" behaviour, so I proceed with group 00:
In [31]:
len(best_kmeans_preds_0)
Out[31]:
In [32]:
#best_kmeans_preds_mask_0 = preds_to_indices(best_kmeans_preds_0) # already implemented during group0 visualisation
log_scaled_all_data_kmeans_00 = log_scaled_all_data_kmeans_0.copy()[~log_scaled_all_data_kmeans_0.index.isin(best_kmeans_preds_mask_0)]
log_scaled_all_data_kmeans_00.reset_index(inplace=True, drop=True)
log_scaled_all_data_kmeans_00.index
Out[32]:
In [33]:
start_time = time.time()
# best_kmeans_columns_00, \
# best_kmeans_score_00, \
# best_kmeans_clusterer_00, \
# best_kmeans_preds_00 = choose_pair_columns_kmeans(all_columns, log_scaled_all_data_kmeans_00)
best_kmeans_columns_00 = ['frac_3s_atts', 'time_hints']
best_kmeans_clusterer_00, \
best_kmeans_score_00, \
best_kmeans_preds_00 = kmeans(log_scaled_all_data_kmeans_00[best_kmeans_columns_00])
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
print("\t>>> Best pair of cols:", best_kmeans_columns_00)
print("\t>>> Best score:", best_kmeans_score_00)
print("\t>>> Best clusterer:", best_kmeans_clusterer_00)
print("\t>>> Best preds:", best_kmeans_preds_00)
In [34]:
print(sum(best_kmeans_preds_00), len(best_kmeans_preds_00), len(best_kmeans_preds_00[best_kmeans_preds_00 == 0]))
In [35]:
best_kmeans_preds_mask_00 = preds_to_indices(best_kmeans_preds_00)
log_scaled_all_data_kmeans_000 = log_scaled_all_data_kmeans_00.copy()[~log_scaled_all_data_kmeans_00.index.isin(best_kmeans_preds_mask_00)]
log_scaled_all_data_kmeans_001 = log_scaled_all_data_kmeans_00.copy()[log_scaled_all_data_kmeans_00.index.isin(best_kmeans_preds_mask_00)]
plt.scatter(log_scaled_all_data_kmeans_000[best_kmeans_columns_00[0]], \
log_scaled_all_data_kmeans_000[best_kmeans_columns_00[1]], \
alpha=0.6, s=15, c='lightgreen')
plt.scatter(log_scaled_all_data_kmeans_001[best_kmeans_columns_00[0]], \
log_scaled_all_data_kmeans_001[best_kmeans_columns_00[1]], \
alpha=0.6, s=15, c='grey')
# plt.xlim([0.0, 0.6])
# plt.ylim([0.0, 0.4])
# plt.figtext(x=0.64, y=0.56, s='Group 01', ha='center', size=14, color='black')
# plt.figtext(x=0.20, y=0.69, s='Group 00', ha='center', size=14, color='darkgreen')
ax = plt.gca()
ax.set_xlabel(best_kmeans_columns_00[0], size=14)
ax.set_ylabel(best_kmeans_columns_00[1], size=14)
#plt.plot((0.13, 0.13), (0.001, 0.499), 'k--', c='blue')
plt.show()
So, there is a subgroup 001 of 1109 students that do not use many hints. What about the rest (000, 6186 students)?
In [36]:
log_scaled_all_data_kmeans_000 = log_scaled_all_data_kmeans_00.copy()[~log_scaled_all_data_kmeans_00.index.isin(best_kmeans_preds_mask_00)]
log_scaled_all_data_kmeans_000.reset_index(inplace=True, drop=True)
log_scaled_all_data_kmeans_000.index
Out[36]:
In [37]:
start_time = time.time()
# best_kmeans_columns_000, \
# best_kmeans_score_000, \
# best_kmeans_clusterer_000, \
# best_kmeans_preds_000 = choose_pair_columns_kmeans(all_columns, log_scaled_all_data_kmeans_000)
best_kmeans_columns_000 = ['num_sess', 'num_probs']
best_kmeans_clusterer_000, \
best_kmeans_score_000, \
best_kmeans_preds_000 = kmeans(log_scaled_all_data_kmeans_000[best_kmeans_columns_000])
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
print("\t>>> Best pair of cols:", best_kmeans_columns_000)
print("\t>>> Best score:", best_kmeans_score_000)
print("\t>>> Best clusterer:", best_kmeans_clusterer_000)
print("\t>>> Best preds:", best_kmeans_preds_000)
In [38]:
print(sum(best_kmeans_preds_000), len(best_kmeans_preds_000), len(best_kmeans_preds_000[best_kmeans_preds_000 == 0]))
In [39]:
best_kmeans_preds_mask_000 = preds_to_indices(best_kmeans_preds_000)
log_scaled_all_data_kmeans_0000 = log_scaled_all_data_kmeans_000.copy()[~log_scaled_all_data_kmeans_000.index.isin(best_kmeans_preds_mask_000)]
log_scaled_all_data_kmeans_0001 = log_scaled_all_data_kmeans_000.copy()[log_scaled_all_data_kmeans_000.index.isin(best_kmeans_preds_mask_000)]
plt.scatter(log_scaled_all_data_kmeans_0000[best_kmeans_columns_000[0]], \
log_scaled_all_data_kmeans_0000[best_kmeans_columns_000[1]], \
alpha=0.6, s=15, c='lightgreen')
plt.scatter(log_scaled_all_data_kmeans_0001[best_kmeans_columns_000[0]], \
log_scaled_all_data_kmeans_0001[best_kmeans_columns_000[1]], \
alpha=0.6, s=15, c='grey')
# plt.figtext(x=0.64, y=0.56, s='Group 01', ha='center', size=14, color='black')
# plt.figtext(x=0.20, y=0.69, s='Group 00', ha='center', size=14, color='darkgreen')
ax = plt.gca()
ax.set_xlabel(best_kmeans_columns_000[0], size=14)
ax.set_ylabel(best_kmeans_columns_000[1], size=14)
#plt.plot((0.13, 0.13), (0.001, 0.499), 'k--', c='blue')
plt.show()
Splitting group 0000 (students with large 'num_sess' and 'num_probs'):
In [40]:
log_scaled_all_data_kmeans_0000 = log_scaled_all_data_kmeans_000.copy()[~log_scaled_all_data_kmeans_000.index.isin(best_kmeans_preds_mask_000)]
log_scaled_all_data_kmeans_0000.reset_index(inplace=True, drop=True)
log_scaled_all_data_kmeans_0000.index
Out[40]:
In [41]:
start_time = time.time()
# best_kmeans_columns_0000, \
# best_kmeans_score_0000, \
# best_kmeans_clusterer_0000, \
# best_kmeans_preds_0000 = choose_pair_columns_kmeans(all_columns, log_scaled_all_data_kmeans_0000)
best_kmeans_columns_0000 = ['num_sess', 'num_probs']
best_kmeans_clusterer_0000, \
best_kmeans_score_0000, \
best_kmeans_preds_0000 = kmeans(log_scaled_all_data_kmeans_0000[best_kmeans_columns_0000])
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
print("\t>>> Best pair of cols:", best_kmeans_columns_0000)
print("\t>>> Best score:", best_kmeans_score_0000)
print("\t>>> Best clusterer:", best_kmeans_clusterer_0000)
print("\t>>> Best preds:", best_kmeans_preds_0000)
In [42]:
print(sum(best_kmeans_preds_0000), \
len(best_kmeans_preds_0000), \
len(best_kmeans_preds_0000[best_kmeans_preds_0000 == 0]))
In [43]:
best_kmeans_preds_mask_0000 = preds_to_indices(best_kmeans_preds_0000)
log_scaled_all_data_kmeans_00000 = log_scaled_all_data_kmeans_0000.copy()[~log_scaled_all_data_kmeans_0000.index.isin(best_kmeans_preds_mask_0000)]
log_scaled_all_data_kmeans_00001 = log_scaled_all_data_kmeans_0000.copy()[log_scaled_all_data_kmeans_0000.index.isin(best_kmeans_preds_mask_0000)]
plt.scatter(log_scaled_all_data_kmeans_00000[best_kmeans_columns_0000[0]], \
log_scaled_all_data_kmeans_00000[best_kmeans_columns_0000[1]], \
alpha=0.6, s=15, c='lightgreen')
plt.scatter(log_scaled_all_data_kmeans_00001[best_kmeans_columns_0000[0]], \
log_scaled_all_data_kmeans_00001[best_kmeans_columns_0000[1]], \
alpha=0.6, s=15, c='grey')
# plt.xlim([0.0, 0.6])
# plt.ylim([0.0, 0.4])
# plt.figtext(x=0.64, y=0.56, s='Group 01', ha='center', size=14, color='black')
# plt.figtext(x=0.20, y=0.69, s='Group 00', ha='center', size=14, color='darkgreen')
ax = plt.gca()
ax.set_xlabel(best_kmeans_columns_0000[0], size=14)
ax.set_ylabel(best_kmeans_columns_0000[1], size=14)
#plt.plot((0.13, 0.13), (0.001, 0.499), 'k--', c='blue')
plt.show()
As we see, these two groups represent students with "intermediate experience" (00000) and "largest experience" (00001).
During Step 1, I split the 8,980 ASSISTments students into 6 different groups:
1. large 'frac_1s_hints' ("gaming" behaviour);
2. smaller 'frac_1s_hints' but large 'frac_3s_atts' ("gaming" behaviour);
3. small 'time_hints' ("non-gaming" behaviour, small usage of hints);
4. small 'num_sess' and 'num_probs' ("non-gaming" behaviour, large usage of hints, small experience);
5. medium 'num_sess' and 'num_probs' ("non-gaming" behaviour, large usage of hints, medium experience);
6. large 'num_sess' and 'num_probs' ("non-gaming" behaviour, large usage of hints, large experience).
The final result of this step is the joint cluster index, which assigns a number from 1 to 6 to each student:
In [44]:
group1_index = np.array(log_scaled_all_data_kmeans_1.index)
len(group1_index)
Out[44]:
In [45]:
group2_index = np.array(log_scaled_all_data_kmeans_01['index'])
len(group2_index)
Out[45]:
In [46]:
group3_index = np.array(log_scaled_all_data_kmeans_001['index'])
len(group3_index)
Out[46]:
In [47]:
group4_index = np.array(log_scaled_all_data_kmeans_0001['index'])
len(group4_index)
Out[47]:
In [48]:
group5_index = np.array(log_scaled_all_data_kmeans_00000['index'])
len(group5_index)
Out[48]:
In [49]:
group6_index = np.array(log_scaled_all_data_kmeans_00001['index'])
len(group6_index)
Out[49]:
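Before building the joint index, a quick sanity check (a sketch): the six index arrays should be disjoint and together cover all students in stud_data:

all_idx = np.concatenate([group1_index, group2_index, group3_index,
                          group4_index, group5_index, group6_index])
assert len(all_idx) == len(np.unique(all_idx)) == len(stud_data)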
In [50]:
def create_joint_cluster_index():
'''
Saves the joint group index to cluster_index.csv for further analysis
'''
cluster_index_lst = []
for i in range(len(stud_data)):
if i in group1_index:
cluster_index_lst.append(1)
elif i in group2_index:
cluster_index_lst.append(2)
elif i in group3_index:
cluster_index_lst.append(3)
elif i in group4_index:
cluster_index_lst.append(4)
elif i in group5_index:
cluster_index_lst.append(5)
elif i in group6_index:
cluster_index_lst.append(6)
print(Counter(cluster_index_lst))
cluster_index = pd.Series(cluster_index_lst, dtype=int)
cluster_index.to_csv('cluster_index.csv')
return
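The membership tests above scan every group array for each student. An equivalent vectorized construction (a sketch; create_joint_cluster_index_fast is a hypothetical name, and it assumes the six groups partition all students, as checked above) assigns each group number to its index positions at once:

def create_joint_cluster_index_fast():
    # Vectorized variant of create_joint_cluster_index(): write each group's
    # number into the positions listed in its index array.
    cluster_index = pd.Series(0, index=range(len(stud_data)), dtype=int)
    group_indices = [group1_index, group2_index, group3_index,
                     group4_index, group5_index, group6_index]
    for group_number, group_index in enumerate(group_indices, start=1):
        cluster_index.iloc[group_index] = group_number
    print(Counter(cluster_index))
    cluster_index.to_csv('cluster_index.csv')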
In [51]:
create_joint_cluster_index()
In [52]:
! ls -lh cluster_index.csv
In [ ]: