Calculating CWUBC

Jan 2016

Given JSON summaries of projects, calculate the Cumulative Weighted Unique Block Count (CWUBC) of users' projects

See Proposal, Page 4: http://benjixie.com/meng_proposal.pdf

REQUIRED: ai2summarizer.py (https://github.com/bxie/ai2_summarizer)

See also: Trajectory.ipynb

CWUBC Steps:

  1. parse directory and load JSON summaries from directory
  2. calculate T_all matrix (see proposal; a toy sketch of the unweighted computation follows the outline below)
  3. calculate block weighting

POPPS Steps:

  1. Determine blocks corresponding to each programming skill
  2. Calculate POPPS matrix

Clustering:

  1. Separate users based on CWUBC
  2. Plot POPPS of each group
  3. Run K-Means clustering on CWUBC trajectory matrix
  4. Plot POPPS of each cluster

CSP Principles Trajectory:

  1. Isolate CSP Blocks
  2. Trajectory of CSP Blocks (freq, not binary)

Other:

  • Analyze slope of CWUBC
  • PCA
  • Categorize Clusters (elbow)
  • Prior Knowledge: Cluster based only on first n projects
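
A minimal sketch of the (unweighted) CWUBC computation on toy data, mirroring get_binary_matrix/get_trajectory below: each row of the frequency matrix is one project and each column one block type; binarize the cumulative sum down the rows, then count how many block types have appeared by each project.

import numpy as np

# toy frequency matrix: 3 projects x 4 block types (hypothetical numbers, not project data)
P = np.array([[2, 0, 0, 1],
              [0, 3, 0, 0],
              [1, 0, 0, 0]])
P_binary = np.cumsum(P, axis=0) > 0   # has block type j appeared by project i?
cwubc = np.sum(P_binary, axis=1)      # array([2, 3, 3]): cumulative unique block count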

In [1]:
import os
import os.path
import re
import json
import csv
import pickle
import numpy as np
import pandas as pd

#plotting
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.ticker as mtick

#making plots look pretty
%matplotlib inline
matplotlib.style.use('ggplot')
pd.options.display.mpl_style = 'default'

from collections import Counter
from sklearn.cluster import KMeans

# ROOT_DIR = "/Users/bxie/Documents/ai2_users_random_small/"
ROOT_DIR = "/Users/bxie/Documents/ai2_users_random/"
NB_DIR = "/Users/bxie/programming/ai2_user_trajectory/data/"
REGEX_ROOT_IGNORE = 'python|\.|README'
REGEX_SUMMARY = 'summary\.json'
SUMMARY_SUFFIX = "_summary.json"
USER_ID_RAND = "000317"
PROJ_ID_RAND = "4673362639978496"
THRESHOLD = 20

In [2]:
"""
code to get blocks
"""

"""
return dictionary of {user id: num projects}
for users with more than min_num_projects projects
"""
def get_users(min_num_projects):
    #regex to ignore non-project files
    ignore = 'python|\.|README'
    regexp=re.compile(ignore) #use: regexp.search(fname)

    fnames = filter(lambda x: regexp.search(x) is None, os.listdir(ROOT_DIR))
    super_users = {}

    for fname in fnames:
        num_projects = len(get_all_projects(fname))
        if num_projects > min_num_projects:
            super_users[fname] = num_projects

    return super_users

"""
Given user_id (user directory name), return list of all project ids
"""
def get_all_projects(user_id):
    unfiltered_project_names = os.listdir("{}{}".format(ROOT_DIR, user_id))
    project_names = filter(lambda x: x.isdigit(), unfiltered_project_names)
    return project_names

"""
given user id and project id, return project summary (as dictionary)
"""
def get_summary(user_id, project_id):
    summary_dir = "{}{}/{}{}".format(ROOT_DIR, user_id, project_id, SUMMARY_SUFFIX)
    with open(summary_dir) as data_file:
        data = json.load(data_file)
    return data

"""
Given a project summary (dict), return dicts of counts of active blocks and orphan blocks
"""
def get_blocks(summary):
    screen_names = filter(lambda x: x.find("*")<0, summary.keys())
    blocks_count = Counter({})
    orphan_count = Counter({})
    for sname in screen_names:
        #if has blocks
        if has_blocks(summary, sname):
            blocks_count += Counter(summary[sname]['Blocks']['Active Blocks']['Types'])
        if has_blocks(summary, sname, check_active=False):
            orphan_count += Counter(summary[sname]['Blocks']['Orphan Blocks']['Types'])
    return dict(blocks_count), dict(orphan_count)

"""
Given blocks dict, save to CSV
"""
def save_blocks_to_csv(blocks_dict, new_fname_path):
    writer = csv.writer(open("{}.csv".format(new_fname_path), 'wb'))
    for key, value in blocks_dict.items():
        writer.writerow([key, value])
    return True


""""""""""""""" Helper Functions """""""""""""""

"""
Given project summary(dict) and screen name(str) 
and designation of active (default) or orphan blocks,
return boolean to determine if screen has those blocks
"""
def has_blocks(summary, screen_name, check_active=True):
    block_name = 'Active Blocks' if check_active else 'Orphan Blocks'
    return type(summary[screen_name]['Blocks']) == dict and type(summary[screen_name]['Blocks'][block_name]) == dict
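
# Illustrative usage (commented out; requires the summary files under ROOT_DIR):
# summary = get_summary(USER_ID_RAND, PROJ_ID_RAND)
# active, orphan = get_blocks(summary)
# save_blocks_to_csv(active, NB_DIR + "sample_active")   # hypothetical output filename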

In [3]:
""" Get all types of blocks """

"""
get counts of all blocks across projects (up to the first upper_bound projects per user) by users with more than threshold projects
"""
def get_all_blocks(threshold=0, upper_bound=THRESHOLD, have_upper_bound=True):
    counter_active = Counter({})
    counter_orphan = Counter({})    
    
    for user_id in get_users(threshold):
        project_ids = get_all_projects(user_id)
        if have_upper_bound:
            project_ids = project_ids[:upper_bound] # only select first n projects as defined by upper_bound

        for project_id in project_ids:
#             print "{}, {}".format(user_id, project_id)
            active, orphan = get_blocks(get_summary(user_id, project_id))
            counter_active += Counter(active)
            counter_orphan += Counter(orphan)
    return dict(counter_active), dict(counter_orphan)

"""
get per-project counts of blocks by users with more than threshold projects
return tuple of dictionaries (active, orphan blocks)
key: block type, value: number of projects containing that block type
"""
def get_blocks_project_count(threshold=0):
    counter_active = Counter({})
    counter_orphan = Counter({})    
    
    for user_id in get_users(threshold):
        project_ids = get_all_projects(user_id)
        if threshold > 0:
            project_ids = project_ids[:threshold] #analyze first n projects only
        for project_id in project_ids:
#             print "{}, {}".format(user_id, project_id)
            active, orphan = get_blocks(get_summary(user_id, project_id))
            active = {blk: 1 for blk in active} #count each block type at most once per project
            orphan = {blk: 1 for blk in orphan}
            counter_active += Counter(active)
            counter_orphan += Counter(orphan)
    return dict(counter_active), dict(counter_orphan)

"""
get list of all block types
"""
def get_all_block_types(active_blocks, orphan_blocks):
    return list(set(active_blocks.keys() + orphan_blocks.keys()))

"""
load pickled block types
"""
def load_block_types(fname):
    block_types = open(fname, 'rb')
    output = pickle.load(block_types)
    block_types.close()
    return output

"""
save pickled block types
"""
def save_block_types(block_types_list, fname):
    block_types = open(fname, 'wb')
    pickle.dump(block_types_list, block_types)
    block_types.close()
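
# Illustrative round trip for the pickle helpers (hypothetical filename):
# save_block_types(['controls_if', 'logic_compare'], NB_DIR + 'example_types.pkl')
# example_types = load_block_types(NB_DIR + 'example_types.pkl')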

In [4]:
"""
Calculating trajectory matrix (CWUBC)
"""

# order of this is important
block_types_fname = 'jeff_types.pkl'
BLOCK_TYPES = load_block_types(NB_DIR+block_types_fname)

def get_all_trajectories(threshold=THRESHOLD):
    user_ids = get_users(threshold)
    user_traj_vectors = [] #list of user trajectory vectors
    for uid in user_ids:
        V_u = get_trajectory(uid)
        user_traj_vectors.append(V_u)
    return np.vstack(user_traj_vectors)

# given a user_id, return trajectory as vector of # of blocks used at each project
# BXX TODO: Add weighting
def get_trajectory(user_id, threshold=THRESHOLD):
    P_b = get_binary_matrix(user_id, threshold)
    V_u = np.sum(P_b, axis=1)
    return V_u

"""
given user id, get CUMULATIVE binary matrix of projects x block types
"""
def get_binary_matrix(user_id, threshold=THRESHOLD):
    P_u = get_freq_matrix(user_id, threshold)
    # print P_u[:, BLOCK_TYPES.index('color_make_color')]
    P_c = np.cumsum(P_u, axis = 0)
    # print P_c[:, BLOCK_TYPES.index('color_make_color')]
    return P_c>0

"""
given user id, get non-cumulative binary matrix of projects x block types
"""
def get_binary_matrix_non_cum(user_id, threshold=THRESHOLD):
    P_u = get_freq_matrix(user_id, threshold)
    # print P_u[:, BLOCK_TYPES.index('color_make_color')]
    P_u[P_u>0]=1 #binary matrix
    # print P_c[:, BLOCK_TYPES.index('color_make_color')]
    return P_u
    

# given user_id, return matrix of frequency of blocks of each project 
# output: n x d matrix where n=# of projects (THRESHOLD), d = # of block types
def get_freq_matrix(user_id, threshold=THRESHOLD):
    output = np.zeros((threshold, len(BLOCK_TYPES)))
    project_ids = get_all_projects(user_id)[:threshold] # getting first n projects from user
    for i in range(threshold):
        pid = project_ids[i]
        summary = get_summary(user_id, pid)
        blocks = get_blocks(summary)[0]
        for blk, count in blocks.items():
            output[i, BLOCK_TYPES.index(blk)] = count
    return output

#normalize traj_matrix by max (if by_max) or by sum
def normalize_trajectory(traj_matrix, by_max=True):
    if by_max:
        user_norm = traj_matrix[:,-1] #final UBC for each user
    else:
        user_norm = traj_matrix.sum(axis=1)
    output = traj_matrix.astype(float) / user_norm[:, np.newaxis]
    return np.nan_to_num(output) #NaN from divide by zero error

def difference_trajectory(traj_matrix):
    return np.diff(traj_matrix, axis=1)
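
# Minimal sketch of the two transforms on a toy cumulative trajectory matrix
# (2 users x 4 projects; hypothetical numbers, not project data):
_toy_traj = np.array([[2., 5., 5., 8.],
                      [1., 1., 4., 4.]])
_toy_norm = normalize_trajectory(_toy_traj)        # each row divided by its final value
_toy_delta = difference_trajectory(_toy_traj)      # new block types introduced per project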

In [5]:
"""
Calculating POPPS
To get block type, from browser console: bs = Blocklies['5066549580791808_Screen1']; bs.selected.type
"""


#mapping programming skill to block types
POPPS_MAP = {
    'cond': [
        'controls_if', #conditional
        'controls_choose'
       ], 
    'list': [
        'lists_create_with', #list
        'lists_create_with', 
        'lists_add_items', 
        'lists_is_in', 
        'lists_length', 
        'lists_is_empty', 
        'lists_pick_random_item', 
        'lists_position_in', 
        'lists_select_item', 
        'lists_insert_item', 
        'lists_replace_item', 
        'lists_remove_item', 
        'lists_append_list', 
        'lists_copy', 
        'lists_is_list', 
        'lists_to_csv_row', 
        'lists_to_csv_table', 
        'lists_from_csv_row', 
        'lists_from_csv_table', 
        'lists_lookup_in_pairs'
       ], 
    'loop': [
        'controls_forEach', #loop
        'controls_forRange',
        'controls_while'
       ], 
    'logic': [
        'logic_negate', #operator
        'logic_or', 
        'logic_boolean', 
        'logic_false', 
        'logic_operation', 
        'logic_compare'
       ], 
    'var': [
        'lexical_variable_set', 
        'lexical_variable_get', 
        'local_declaration_expression', 
        'local_declaration_statement'], #variable
    'proc': [
        'procedures_defnoreturn', #procedure
        'procedures_callreturn', 
        'procedures_defreturn', 
        'procedures_callnoreturn'
       ],
    'proc_def': [
        'procedures_defnoreturn', #procedure
        'procedures_defreturn', 
       ],
    }

# flat list of all CC blocks (formerly known as POPPS blocks)
POPPS_ALL_BLOCKS = []


"""
given binary matrix to show POPPS, 
return vector for average proportion of users who have not learned skill by project i
"""
def get_average_survival(popps_matrix):
    return np.average(popps_matrix, axis=0)


"""
given specific programming skill (string) from POPPS_MAP.keys(),
optional list of user ids (select_users),
and optional threshold for min number of projects users must have
return binary matrix to show POPPS (row: user; column j is 1 if the user has NOT yet used the skill by project j, 0 afterwards)
"""
def get_popps_all_users(prog_skill, select_users=[], threshold=THRESHOLD, block_types=BLOCK_TYPES):
    user_ids = get_users(threshold) if len(select_users)==0 else select_users
    user_popps_vectors = [] #list of user trajectory vectors
    for uid in user_ids:
        P_b = get_specific_popps_binary(uid, prog_skill, threshold, block_types)
        user_popps_vectors.append(P_b)
    return np.vstack(user_popps_vectors)

"""
given user id (string),
specific programming skill (string) from POPPS_MAP.keys(),
optional threshold for number of projects to analyze,
and optional list of block types (block_types),
return 1 x threshold binary matrix for the given skill that is 1 if the user has NOT yet used the skill by project i, 0 afterwards
"""
def get_specific_popps_binary(user_id, prog_skill, threshold=THRESHOLD, block_types= BLOCK_TYPES):
    if prog_skill not in POPPS_MAP:
        raise Exception("{} not in POPPS_MAP. Select from {}".format(prog_skill, POPPS_MAP.keys()))
    popps_binary = np.zeros([1, threshold])
    popps_binary[:] = 1
    P_b = get_binary_matrix(user_id, threshold)
    block_inds = get_block_indices(POPPS_MAP[prog_skill], block_types)
    found_proj_ind = np.argwhere(P_b[:,block_inds]==True) #locations in P_b that show block in project
    if len(found_proj_ind):
        #get first project that contains a block pertaining to prog_skill
        first_proj_ind = np.min(np.argwhere(P_b[:,block_inds]==True)[:,0])
        popps_binary[0, first_proj_ind:] = 0
    return popps_binary
    
"""
given user id (int as string), 
optional threshold for number of projects to analyze,
and optional list of block types (block_types),
return binary matrix for all POPPS (rows: skills) that is 1 if the user has NOT yet used the skill by project i, 0 afterwards
"""
def get_all_popps_binary(user_id, threshold=THRESHOLD, block_types= BLOCK_TYPES):
    num_popps = len(POPPS_MAP)
    popps_binary = np.zeros([num_popps, threshold])
    popps_binary[:,:] = 1
    for i in range(num_popps):
        prog_skill = POPPS_MAP.keys()[i]
        if prog_skill not in POPPS_MAP:
            raise Exception("{} not in POPPS_MAP. Select from {}".format(prog_skill, POPPS_MAP.keys()))
        P_b = get_binary_matrix(user_id, threshold)
        block_inds = get_block_indices(POPPS_MAP[prog_skill], block_types)
        found_proj_ind = np.argwhere(P_b[:,block_inds]==True) #locations in P_b that show block in project
        if len(found_proj_ind):
            #get first project that contains a block pertaining to prog_skill
            first_proj_ind = np.min(np.argwhere(P_b[:,block_inds]==True)[:,0]) 
            popps_binary[i, first_proj_ind:] = 0
    return popps_binary
    
    
""" HELPER FUNCTIONS """

"""
Given list of block types to identify (selected_blocks)
and optional blocks types matrix (block_types) 
return list of indices in matrix for given types
"""
def get_block_indices(selected_blocks, block_types = BLOCK_TYPES):
    indices = []
    for blk_type in selected_blocks:
        indices.append(block_types.index(unicode(blk_type)))
    return list(set(indices))

for key in POPPS_MAP.keys():
    POPPS_ALL_BLOCKS += POPPS_MAP[key]

POPPS_ALL_BLOCKS_INDS = get_block_indices(POPPS_ALL_BLOCKS)
OTHER_BLOCKS_INDS = list(set(range(0,len(BLOCK_TYPES))) - set(POPPS_ALL_BLOCKS_INDS))

#ensure disjoint sets
len(BLOCK_TYPES) == len(POPPS_ALL_BLOCKS_INDS) + len(OTHER_BLOCKS_INDS)
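
# Illustrative (hypothetical) example of the survival-style encoding: if a user first
# uses a 'loop' block in their 4th project (index 3), get_specific_popps_binary(uid, 'loop')
# returns [[1, 1, 1, 0, 0, ..., 0]] -- 1 until the skill first appears, then 0 onwards.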


Out[5]:
True

In [7]:
"""
isolating CC blocks
"""

"""
given user_id,
return 1D array (vector) of cumulative trajectory of # of block types in each project
"""
def get_cc_trajectory(user_id, filter_blocks=True, block_inds=POPPS_ALL_BLOCKS_INDS):
    mat_f = get_binary_matrix(user_id)
    if filter_blocks:
        mat_f = mat_f[: , block_inds] #select only relevant blocks
    mat_f = np.sum(mat_f, axis=1)
    return mat_f

def get_all_cc_trajectories(block_inds=POPPS_ALL_BLOCKS_INDS, threshold=THRESHOLD):
    user_ids = get_users(threshold)
    user_traj_vectors = [] #list of user trajectory vectors
    for uid in user_ids:
        V_u = get_cc_trajectory(uid, block_inds=block_inds)
        user_traj_vectors.append(V_u)
    return np.vstack(user_traj_vectors)

"""
UNUSED
"""

"""
CC block count in each project
"""
def get_cc_trajectory_repeats(user_id, block_inds=POPPS_ALL_BLOCKS_INDS, threshold=THRESHOLD):
    mat_f = get_freq_matrix(user_id)
    mat_f[mat_f>0] = 1 #binary matrix
    mat_f = mat_f[: , block_inds] #select only relevant blocks
    mat_f = np.cumsum(mat_f, axis=0)
    mat_f = np.sum(mat_f, axis=1)
    return mat_f
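
# Example usage (commented out; requires project data under ROOT_DIR):
# cc_traj = get_cc_trajectory(USER_ID_RAND)                                    # CC blocks only
# other_traj = get_cc_trajectory(USER_ID_RAND, block_inds=OTHER_BLOCKS_INDS)   # non-CC blocks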

Sophistication of Projects

The learning rate of users decreases with time, i.e., breadth of learning decreases. Is there a corresponding emphasis on depth of learning, or does learning stagnate?


In [9]:
"""
given a user id,
count_types: True to count distinct block types, False to count block frequency
optional list of block indices to select (block_inds, used when filter_blocks is True)
return the vector of per-project counts (not cumulative)
"""
def get_counts_by_project(user_id, count_types=False, filter_blocks=False, block_inds=POPPS_ALL_BLOCKS_INDS):
    if count_types:
        mat_f = get_binary_matrix_non_cum(user_id)
    else:
        mat_f = get_freq_matrix(user_id)
    if filter_blocks:
        mat_f = mat_f[: , block_inds] #select only relevant blocks
    return np.sum(mat_f, axis = 1)
    
def get_all_avg_counts_by_project(threshold=THRESHOLD, count_types=False, filter_blocks=False, block_inds=POPPS_ALL_BLOCKS_INDS):
    uids = get_users(threshold)
    count_vectors = []
    for user_id in uids:
        count_vec = get_counts_by_project(user_id, count_types=count_types, filter_blocks=filter_blocks, block_inds=block_inds)
        count_vectors.append(count_vec)
    return np.expand_dims(np.average(count_vectors, axis=0), axis=1)
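
# Example usage (commented out; requires project data under ROOT_DIR):
# avg_block_freq = get_all_avg_counts_by_project(count_types=False)  # avg # of blocks per project
# avg_block_types = get_all_avg_counts_by_project(count_types=True)  # avg # of distinct block types per project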

In [11]:
# sandbox
counts = get_all_avg_counts_by_project(filter_blocks=True, block_inds=OTHER_BLOCKS_INDS)
counts_cc = get_all_avg_counts_by_project(filter_blocks=True)

# print counts_cc
# print counts

plt.plot(counts)
plt.plot(counts_cc)

# plot_trajectory(counts_list, title="Average Cumulative Sum of Block Types", 
#                     ylabel="Cum. Number of Block Types", 
#                     legend = ['Comp. Concepts Blocks', 'Non-CC Blocks'], legend_loc=2, percent=False)


Out[11]:
[<matplotlib.lines.Line2D at 0x100535d10>]

In [12]:
print counts_cc
print counts_cc.shape

x = np.expand_dims(counts_cc, axis=1)

def plot_counts_by_project(counts_list, title="Frequency of Blocks by Project", 
                    ylabel="Number of Blocks", 
                    legend = ['Comp. Concepts Blocks', 'Non-CC Blocks'], legend_loc=2, percent=False):

#     fig = plt.figure(figsize=(11.3,7))
    fig = plt.figure(figsize=(11.3,5))    
    ax = fig.add_subplot(1,1,1)
    plt.xlabel('Project Number', fontsize=20, color='black')
    plt.ylabel(ylabel, fontsize=20, color='black')
    plt.title(title, fontsize=24, color='black')
    my_legend = legend

    for i in range(len(counts_list)):
        t_mat = counts_list[i]
        if percent:
            t_mat = t_mat * 100 #for percent
        if i==1:
            plt.plot(t_mat, linewidth=5, linestyle='dashed')
        else:
            plt.plot(t_mat, linewidth=5)

    plt.legend(my_legend, loc=legend_loc, fontsize=16)
    if percent:
        fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
        yticks = mtick.FormatStrFormatter(fmt)
        ax.yaxis.set_major_formatter(yticks)
        
    return fig, ax

counts_list = [counts_cc, counts]

plot_counts_by_project(counts_list)


[[ 26.26950355]
 [ 41.29787234]
 [ 39.16312057]
 [ 27.28368794]
 [ 23.9787234 ]
 [ 25.56028369]
 [ 41.03546099]
 [ 27.08510638]
 [ 48.17021277]
 [ 26.41134752]
 [ 30.60283688]
 [ 30.07092199]
 [ 33.68085106]
 [ 31.05673759]
 [ 54.03546099]
 [ 30.        ]
 [ 40.31205674]
 [ 34.61702128]
 [ 49.9787234 ]
 [ 23.75177305]]
(20, 1)
Out[12]:
(<matplotlib.figure.Figure at 0x10f6bde10>,
 <matplotlib.axes._subplots.AxesSubplot at 0x10f6bdfd0>)

In [111]:
np.array([0.0]) + counts_cc


Out[111]:
array([[ 2.90070922],
       [ 3.64539007],
       [ 3.07092199],
       [ 3.61702128],
       [ 3.12056738],
       [ 3.05673759],
       [ 3.5106383 ],
       [ 3.21985816],
       [ 3.26241135],
       [ 3.25531915],
       [ 3.54609929],
       [ 3.24113475],
       [ 3.30496454],
       [ 3.17021277],
       [ 3.84397163],
       [ 2.86524823],
       [ 3.67375887],
       [ 3.18439716],
       [ 2.94326241],
       [ 2.91489362]])

In [11]:
"""
get all block types
shouldn't need to run if pickled block types have already been saved (block_types_fname)
"""

# active, orphan = get_all_blocks(0)
# all_types = get_all_block_types(active, orphan)

# flat list of all CC blocks (formerly known as POPPS blocks)
all_cc_blocks = []

for key in POPPS_MAP.keys():
    all_cc_blocks += POPPS_MAP[key]

cc_block_inds = get_block_indices(all_cc_blocks)

len(BLOCK_TYPES)


# save block types
# save_blocks_to_csv(active, NB_DIR + "jeff_0_active")
# save_blocks_to_csv(orphan, NB_DIR + "jeff_0_orphan")
# save_block_types(all_types, NB_DIR + "jeff_types.pkl")
# print 'all saved!'


Out[11]:
1013

In [18]:
#get trajectories
T_all = get_all_trajectories()

#NB: associating blocks and block types now less trivial
T_all_cc = get_all_cc_trajectories(block_inds=POPPS_ALL_BLOCKS_INDS)
T_all_not = get_all_cc_trajectories(block_inds=OTHER_BLOCKS_INDS)
# save_block_types(T_all, 'traj.pkl')
print 'done'
# np.shape(np.average(T_all, axis=0))


done

Block Frequency Analysis


In [25]:
#blocks counts
#THIS TAKES TIME
active_total, orphan_total = get_all_blocks(threshold=20) # total # of blocks used in all projects
active_proj, orphan_proj = get_blocks_project_count(threshold=20) # number of projects containing each specific block type

In [31]:
len(active_total)


Out[31]:
650

In [12]:
c = Counter(active_proj)
# filter(lambda k: k in all_cc_blocks, active.keys())
cc_dict = {k: active_proj[k] for k in all_cc_blocks}

cc_count = Counter(cc_dict)
# print cc_count.most_common(10)
# print
cc_count_sorted = cc_count.most_common()

cc_count_sorted

btypes = []
counts = []

for key, count in cc_count_sorted:
    btypes.append(key)
    counts.append(count)

counts_reversed = counts[::-1]
btypes_reversed = btypes[::-1]
    
for key, count in cc_count_sorted:
    print "{}\t{}".format(key, float(count))    
#     print "{}\t{}".format(key, float(count)/sum(counts)*100)    

print len(counts)


lexical_variable_get	1617.0
controls_if	1205.0
lexical_variable_set	1096.0
logic_boolean	708.0
lists_create_with	585.0
procedures_defnoreturn	564.0
procedures_callnoreturn	540.0
logic_false	433.0
logic_compare	355.0
lists_select_item	319.0
lists_add_items	208.0
logic_operation	185.0
logic_negate	129.0
lists_length	129.0
procedures_defreturn	128.0
controls_forEach	118.0
procedures_callreturn	111.0
controls_forRange	109.0
logic_or	98.0
local_declaration_statement	82.0
lists_pick_random_item	63.0
controls_while	63.0
controls_choose	61.0
lists_is_in	52.0
lists_replace_item	44.0
lists_is_empty	39.0
lists_remove_item	30.0
local_declaration_expression	27.0
lists_is_list	22.0
lists_position_in	22.0
lists_insert_item	15.0
lists_from_csv_row	14.0
lists_lookup_in_pairs	10.0
lists_from_csv_table	8.0
lists_copy	7.0
lists_to_csv_row	6.0
lists_to_csv_table	6.0
lists_append_list	6.0
38

In [19]:
index = np.arange(len(counts))

yticks = ["{}. {}".format(len(btypes_reversed)-i, btypes_reversed[i]) for i in range(len(btypes_reversed))]
print yticks

#Histogram of CC block frequency
fig, ax = plt.subplots()
rects1 = ax.barh(np.arange(0, len(counts))-0.4, counts_reversed)
# rects1 = ax.bar(np.arange(0, len(counts))+0.5, counts, width=1)
fig.set_size_inches(4.5,10)
# plt.xticks(index, ['']+btypes, rotation='vertical', fontsize=16)

plt.yticks(index, btypes_reversed, fontsize=14, color='black')
# plt.yticks(index, yticks, fontsize=14, color='black') #adds number to ytick

ax.set_ylabel('Block Type', fontsize=14, color='black')
ax.set_xlabel('Number of Projects', fontsize=14, color='black')
plt.tick_params(axis='x', which='major', labelsize=11, color='black')
# plt.tick_params(axis='both', which='minor', labelsize=8)

plt.title("Frequency of Computational Concept Blocks              . \n", fontsize=18, color='black')
plt.autoscale()
plt.show()


['38. lists_append_list', '37. lists_to_csv_table', '36. lists_to_csv_row', '35. lists_copy', '34. lists_from_csv_table', '33. lists_lookup_in_pairs', '32. lists_from_csv_row', '31. lists_insert_item', '30. lists_position_in', '29. lists_is_list', '28. local_declaration_expression', '27. lists_remove_item', '26. lists_is_empty', '25. lists_replace_item', '24. lists_is_in', '23. controls_choose', '22. controls_while', '21. lists_pick_random_item', '20. local_declaration_statement', '19. logic_or', '18. controls_forRange', '17. procedures_callreturn', '16. controls_forEach', '15. procedures_defreturn', '14. lists_length', '13. logic_negate', '12. logic_operation', '11. lists_add_items', '10. lists_select_item', '9. logic_compare', '8. logic_false', '7. procedures_callnoreturn', '6. procedures_defnoreturn', '5. lists_create_with', '4. logic_boolean', '3. lexical_variable_set', '2. controls_if', '1. lexical_variable_get']

In [57]:
counts_reversed


Out[57]:
[12,
 17,
 20,
 21,
 25,
 27,
 38,
 48,
 59,
 64,
 73,
 83,
 107,
 116,
 126,
 128,
 182,
 207,
 224,
 266,
 304,
 306,
 329,
 335,
 385,
 411,
 418,
 545,
 770,
 921,
 1105,
 1333,
 1379,
 1426,
 1604,
 2388,
 2647,
 3397]

In [13]:
"""
plotting
""" 

linestyles = ['solid', 'dashed', 'dotted', 'dashdot']

"""
Given list of trajectory matrices (list of ndarray)
and text describing grouping methods
return plot of average trajectory of each matrix in list
"""
def plot_cwubc_avg(traj_matrix_list, add_zero=True, grouped_by_text=""):
    plt.figure(figsize=(11.3,7))
    plt.xlabel('Project Number', fontsize=20, color='black')
    plt.ylabel('Cum. Number of Block Types', fontsize=20, color='black')

    plt.title("Average Cumulative Sum of Block Types \n for Users Clustered By {}".format(grouped_by_text), fontsize=24)

    my_legend = []
    
    for i in range(len(traj_matrix_list)):
        t_mat = traj_matrix_list[i]
        num_users = np.shape(t_mat)[0]
        if add_zero:
            t_mat = np.insert(t_mat, 0, 0, axis=1) #0 added in beginning for plot
        T_avg = np.average(t_mat, axis=0) #avg of each column/at each project
        plt.plot(T_avg, linewidth=5, linestyle = linestyles[i % len(linestyles)])
        my_legend.append("Cluster {} ({} users)".format(i, num_users))

    plt.legend(my_legend, loc=2, fontsize=16)

"""
Given trajectory matrix (ndarray),
plot trajectory of all users (rows) separately
"""
def plot_cwubc_all_users(traj_matrix, add_zero=True):
    plt.figure(figsize=(12,8))
    num_users = np.shape(traj_matrix)[0]
    if add_zero:
        T_all_plot = np.insert(traj_matrix, 0, 0, axis=1) #0 added in beginning for plot
    else:
        T_all_plot = traj_matrix
    # plt.plot(T_all_mean, linestyle='dotted', linewidth=5)
    plt.xlabel('Project Number')
    plt.ylabel('Number of Unique Blocks')

    plt.title("Cummulative Number of Blocks Used by AI User")
    for i in range(T_all_plot.shape[0]):
        plt.plot(T_all_plot[i,:])
    #TODO: figure out what to return

""" HELPER FUNCTIONS """

"""
Given number of groups, return list of human-readable legend labels
for a POPPS plot that splits 100% into num_groups groups
ex: num_groups = 4 => ['<25%','25-50%','50-75%','>75%']
"""
def create_legend(num_groups):
    vals = range(0,101,100/num_groups)
    output = []
    output.append("<{}%".format(vals[1]))
    for i in range(1,len(vals)-2):
        output.append("{}-{}%".format(vals[i], vals[i+1]))
    output.append(">{}%".format(vals[-2]))
    return output

# given 1D numpy array, return same array w/ 0.0
# added as first term
def add_zero(vector):
    return np.insert(vector, 0, 0.0)

In [15]:
"""
Grouping Users by final CWUBC (AKA dumb clustering)
"""

"""
given CWUBC trajectory matrix (traj_matrix, ndarray)
and number of desired groups (n, as int),
split traj_matrix according to final CWUBC.
return list of n matrices representing traj_matrix split n ways and
list of lists of indices in matrix that correspond to each split
"""
def split_by_end_cwubc(traj_matrix, n):
    end_cwubc = traj_matrix[:,-1]
    thresholds = np.percentile(end_cwubc, np.linspace(0, 100, n+1)) #percentile cutoffs for splitting n ways

    output = []
    indices = []
    for i in range(len(thresholds)-1):
        #make the top bin inclusive so users at the maximum final CWUBC are not dropped
        upper = end_cwubc <= thresholds[i+1] if i == len(thresholds)-2 else end_cwubc < thresholds[i+1]
        inds = np.argwhere(np.all([end_cwubc >= thresholds[i], upper], axis=0)).flatten()
        indices.append(inds)
        output.append(traj_matrix[inds])

    return output, indices

In [16]:
### DEPRECATED ###
#Splitting users by CWUBC and comparing POPPS

"""
given list of lists of indices (list of int),
list of skills (strings corresponding to POPPS_MAP.keys()),
plot POPPS survival curve
"""
def plot_popps(grouped_inds, grouped_by_text="<something>", skills=POPPS_MAP.keys()):
    for skill in skills:
        user_ids = []
        popps = []
        all_user_ids = np.array(get_users(THRESHOLD).keys())
        legend = []
        for i in range(len(grouped_inds)):
            indices = grouped_inds[i]
            temp_uids = all_user_ids[indices]
            user_ids.append(temp_uids)
            p = get_popps_all_users(skill, temp_uids)
            p_avg = get_average_survival(p)
            p_avg = np.insert(p_avg, 0, 1.0)
            popps.append(p_avg)
            legend.append("{} ({} users)".format(i, len(temp_uids)))
        plt.figure(figsize=(12,8))
        plt.title("Surirval Curve of Users Grouped by {}: {}".format(grouped_by_text.title(), skill.title()))
        plt.xlabel("Project")
        plt.ylabel("Proportion of Users Who Have Never Used {}".format(skill.title()))
        for p_avg in popps:
            plt.plot(p_avg, linewidth=5)
        plt.legend(legend)
    
    
# n = 3
# mats, inds = split_by_end_cwubc(T_all, n)

# for skill in POPPS_MAP.keys():
#     user_ids = []
#     popps = []
#     all_user_ids = np.array(get_users(THRESHOLD).keys())
#     for i in range(len(inds)):
#         indices = inds[i]
#         temp_uids = all_user_ids[indices]
#         user_ids.append(temp_uids)
#         p = get_popps_all_users(skill, temp_uids)
#         p_avg = get_average_survival(p)
#         p_avg = np.insert(p_avg, 0, 1.0)
#         popps.append(p_avg)
#     plt.figure(figsize=(12,8))
#     plt.title("Surirval Curve of Users Grouped by CWUBC: {}".format(skill.title()))
#     plt.xlabel("Project")
#     plt.ylabel("Proportion of Users Who Have Never Used Concept")
#     for p_avg in popps:
#         plt.plot(p_avg, linewidth=5)
#     plt.legend(create_legend(n))
    
# plot_cwubc_avg(mats)
# plot_cwubc_all_users(mats[3])

Trajectory Comparison: All Blocks vs Comp. Concepts

TODO:

  • function to plot these (input: trajectory list, title, ylabel, legend)
  • see if learning rate follows exponential decay (see SO answer; a curve-fitting sketch follows this list)
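
One possible check for the exponential-decay hypothesis (a sketch, not part of the original analysis): fit a decaying exponential to the average number of newly introduced block types per project, where avg_rate is assumed to be something like np.average(difference_trajectory(T_all), axis=0).

import numpy as np
from scipy.optimize import curve_fit

def exp_decay(x, a, b, c):
    return a * np.exp(-b * x) + c

# avg_rate = np.average(difference_trajectory(T_all), axis=0)
# x = np.arange(len(avg_rate))
# params, cov = curve_fit(exp_decay, x, avg_rate, p0=(avg_rate[0], 0.5, 0.0))
# a, b, c = params  # b > 0 indicates decay; compare residuals against a linear fit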

In [22]:
# plot_trajectory(traj_mat_list, title="", ylabel="", percent=False, legend_loc=2)
def plot_trajectory(traj_mat_list, title="Average Cumulative Sum of Block Types", 
                    ylabel="Cum. Number of\nBlock Types", 
                    legend = ['Comp. Concepts Blocks', 'Non-CC Blocks'], legend_loc=2, percent=False):

#     fig = plt.figure(figsize=(11.3,7))
    fig = plt.figure(figsize=(11.3,5))
    ax = fig.add_subplot(1,1,1)
    plt.xlabel('Project Number', fontsize=20, color='black')
    plt.ylabel(ylabel, fontsize=20, color='black')
    plt.title(title, fontsize=24, color='black')
    my_legend = list(legend) #copy so the default legend list is not mutated by the appends below

    for i in range(len(traj_mat_list)):
        t_mat = traj_mat_list[i]
        if percent:
            t_mat = t_mat * 100 #for percent
        num_users = np.shape(t_mat)[0]
        t_mat = np.insert(t_mat, 0, 0, axis=1) #0 added in beginning for plot
        T_avg = np.average(t_mat, axis=0) #avg of each column/at each project
        if i==1:
            plt.plot(T_avg, linewidth=5, linestyle='dashed')
        else:
            plt.plot(T_avg, linewidth=5)
        
        my_legend.append("{} ({} users)".format(i, num_users))

    plt.legend(my_legend, loc=legend_loc, fontsize=16)
    if percent:
        fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
        yticks = mtick.FormatStrFormatter(fmt)
        ax.yaxis.set_major_formatter(yticks)
        
    return fig, ax

In [23]:
"""
Plotting UBC of CC blocks vs non-CC blocks
"""
mats, inds = split_by_end_cwubc(T_all, 1)
mats_not, inds_not = split_by_end_cwubc(T_all_not, 1)
mats_cc, inds_cc = split_by_end_cwubc(T_all_cc, 1)

both_mats = [mats_cc[0], mats_not[0]]

#Avg Block Count
plot_trajectory(both_mats)


#Normalized Learning Rate
mats_delta = [
    normalize_trajectory(difference_trajectory(mats_cc[0]), by_max=False),
    normalize_trajectory(difference_trajectory(mats_not[0]), by_max=False)
    ]

plot_trajectory(mats_delta, title="Normalized Average Learning Rate", 
                ylabel="% of Block Types\nIntroduced", 
                percent=True, legend_loc=1)


#Normalized Block Count
mats_norm = [normalize_trajectory(mats_cc[0]), normalize_trajectory(mats_not[0])]
# both_mats_norm = [normalize_trajectory(mats_cc[0])]

plot_trajectory(mats_norm, title="Normalized Avg. Cum. Sum of Block Types", 
                ylabel="% of Cum. Number\nof Block Types", percent=True, legend_loc=2)


Out[23]:
(<matplotlib.figure.Figure at 0x113ce2d50>,
 <matplotlib.axes._subplots.AxesSubplot at 0x113b0d110>)

In [15]:
"""
Clustering
"""

"""
given trajectory matrix (n x d where d is num of projects)
and optional number of clusters,
return list of trajectory matrices for each cluster of users
"""
def k_means(traj_matrix, num_clusters = 3):
    estimator = KMeans(n_clusters=num_clusters)
    estimator.fit(traj_matrix)
    predict = estimator.predict(traj_matrix)
    cluster_ind = [] #list of lists of ind in given cluster
    T_cluster = []
    for i in range(num_clusters):
        cluster_ind.append(np.argwhere(predict==i).flatten())
        T_cluster.append(traj_matrix[cluster_ind[i]])
        print "{} has {} users".format(i, len(cluster_ind[i]))
    return T_cluster, cluster_ind
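
# Sketch of an elbow check for choosing the number of clusters (uses KMeans.inertia_
# from scikit-learn; commented out since it refits k-means several times):
# inertias = []
# for k in range(1, 10):
#     inertias.append(KMeans(n_clusters=k).fit(T_all).inertia_)
# plt.plot(range(1, 10), inertias)  # look for the "elbow" where the curve flattens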

In [18]:
T_cluster = k_means(T_all, 3)[0]
print
delta = np.diff(T_all, axis=1)
T_diff_ind = k_means(delta, 3)[1]
T_diff = []
T_diff_not = []
T_diff_cc = []
for i in range(3):
    T_diff.append(T_all[T_diff_ind[i]])
    T_diff_not.append(T_all_not[T_diff_ind[i]])
    T_diff_cc.append(T_all_cc[T_diff_ind[i]])


0 has 33 users
1 has 42 users
2 has 66 users

0 has 41 users
1 has 92 users
2 has 8 users

In [27]:
T_diff[2]
# T_diff_ind[2]


Out[27]:
array([[  2,  50,  51,  63,  71,  79,  83,  91,  98, 101, 104, 104, 104,
        107, 107, 107, 107, 108, 108, 112],
       [ 11,  63,  64,  71,  71,  71,  71,  73,  74,  77,  90,  96,  96,
         96,  98,  98,  99,  99, 102, 110],
       [  9,  65,  78,  78,  82,  84,  87,  91,  93,  99,  99, 101, 105,
        105, 108, 110, 118, 118, 118, 124],
       [  0,  54,  70,  80,  80,  90, 106, 120, 122, 122, 128, 128, 128,
        132, 134, 134, 135, 135, 136, 137],
       [  4,  58,  59,  59,  59,  59,  59,  59,  59,  59,  60,  60,  60,
         60,  60,  60,  69,  83,  88,  90],
       [  0,  38,  43,  45,  48,  50,  54,  61,  69,  76,  76,  76,  83,
         85,  87,  88,  90,  90,  96,  96],
       [  3,  98,  98, 105, 108, 110, 110, 113, 123, 124, 124, 130, 130,
        131, 140, 149, 149, 149, 149, 149],
       [ 10,  68,  68,  68,  68,  73,  75,  84,  84,  89,  89,  90,  91,
         92,  92,  97,  97,  97,  97,  97]])

In [24]:
x = get_users(20)

In [26]:
x


Out[26]:
{'000317': 21,
 '000411': 34,
 '000482': 40,
 '000629': 25,
 '000800': 25,
 '001181': 40,
 '001343': 28,
 '001442': 34,
 '001554': 21,
 '001651': 49,
 '001655': 21,
 '001683': 39,
 '001850': 24,
 '001881': 22,
 '001953': 29,
 '001980': 24,
 '001990': 28,
 '002038': 21,
 '002048': 32,
 '002060': 27,
 '002080': 45,
 '002136': 22,
 '002168': 25,
 '002188': 63,
 '002380': 22,
 '002459': 176,
 '002638': 30,
 '002687': 25,
 '002733': 54,
 '002755': 65,
 '002780': 31,
 '002805': 22,
 '002876': 21,
 '002897': 21,
 '002993': 24,
 '003009': 33,
 '003012': 22,
 '003042': 40,
 '003066': 21,
 '003074': 23,
 '003144': 24,
 '003149': 28,
 '003244': 32,
 '003256': 31,
 '003324': 30,
 '003382': 36,
 '003469': 30,
 '003508': 41,
 '003542': 29,
 '003694': 21,
 '003928': 30,
 '003941': 23,
 '004052': 227,
 '004164': 21,
 '004179': 41,
 '004203': 21,
 '004233': 199,
 '004250': 28,
 '004264': 29,
 '004399': 36,
 '004575': 21,
 '004630': 26,
 '004664': 24,
 '004743': 59,
 '004755': 28,
 '004767': 25,
 '004784': 34,
 '004970': 29,
 '005067': 55,
 '005101': 58,
 '005119': 28,
 '005179': 29,
 '005193': 22,
 '005340': 25,
 '005471': 22,
 '005513': 21,
 '005670': 34,
 '005671': 23,
 '005843': 25,
 '005964': 27,
 '005990': 26,
 '006148': 88,
 '006220': 35,
 '006244': 102,
 '006247': 32,
 '006262': 27,
 '006376': 70,
 '006504': 33,
 '006549': 24,
 '006629': 21,
 '006681': 64,
 '006794': 35,
 '006909': 25,
 '006998': 27,
 '007037': 22,
 '007057': 61,
 '007084': 28,
 '007157': 22,
 '007293': 21,
 '007433': 32,
 '007600': 108,
 '007692': 29,
 '007696': 209,
 '007710': 66,
 '007895': 22,
 '007909': 21,
 '008005': 58,
 '008095': 79,
 '008140': 72,
 '008223': 40,
 '008257': 31,
 '008437': 25,
 '008467': 27,
 '008468': 25,
 '008503': 41,
 '008527': 22,
 '008544': 30,
 '008576': 38,
 '008644': 28,
 '008755': 47,
 '008801': 30,
 '008807': 26,
 '008815': 35,
 '008895': 65,
 '008906': 29,
 '008980': 25,
 '008995': 39,
 '009066': 27,
 '009102': 44,
 '009147': 40,
 '009204': 36,
 '009277': 25,
 '009356': 22,
 '009373': 41,
 '009432': 32,
 '009448': 21,
 '009551': 34,
 '009584': 21,
 '009602': 36,
 '009893': 25,
 '009952': 61}

In [19]:
mats, inds = split_by_end_cwubc(T_all, 3)
# plot_cwubc_avg(mats, 'end block count')
# plot_cwubc_avg(T_cluster, 'kmeans clustering')
plot_cwubc_avg(T_diff, grouped_by_text='Learning Rate')

traj_mat_list_not = T_diff_not
traj_mat_list_cc = T_diff_cc
for i in range(len(traj_mat_list_not)):
    traj_both = [normalize_trajectory(traj_mat_list_cc[i]), normalize_trajectory(traj_mat_list_not[i])]
    plot_trajectory(traj_both, title="Avg Distinct Block Count (Normalized), Cluster {}".format(i), 
                    ylabel="% of Cum. Number of Unique Blocks", percent=True, legend_loc=4)


Grouping based on slope of trajectory (K-Means on first differences)

Smallest cluster (~10 users) has a sharp spike in UBC from the 1st to the 2nd project, but the sample is too small to determine whether this is meaningful.


In [198]:
# grouped_inds = k_means(T_all, 3)[1]
g_text = "KMeans diff"
delta = np.diff(T_all, axis=1)
grouped_inds = k_means(delta, 3)[1]
T_diff = []
for i in range(3):
    T_diff.append(T_all[grouped_inds[i]])
plot_cwubc_avg(T_diff, grouped_by_text=g_text)
# plot_popps(grouped_inds, grouped_by_text=g_text)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-198-0f7b7d668107> in <module>()
      2 g_text = "KMeans diff"
      3 delta = np.diff(T_all, axis=1)
----> 4 grouped_inds = k_means(delta, 3)[1]
      5 T_diff = []
      6 for i in range(3):

NameError: name 'k_means' is not defined

Attempting Fixed Effect Models

x: UBC, y: # of projects using procedures (or # of procedure blocks)


In [131]:
"""
getting ind where project uses procedures
"""

"""
given user_id (string) and prog_skill (string that is in POPPS_MAP.keys()),
return vector of length threshold that is 1 if prog_skill is used in that project, 0 else
"""
# def get_specific_popps_location(user_id, prog_skill, threshold=THRESHOLD, block_types= BLOCK_TYPES):
#     if prog_skill not in POPPS_MAP:
#         raise Exception("{} not in POPPS_MAP. Select from {}".format(prog_skill, POPPS_MAP.keys()))
#     popps_count = np.zeros(threshold)

#     P_f = get_freq_matrix(user_id, threshold)
#     block_inds = get_block_indices(POPPS_MAP[prog_skill], block_types)
#     found_proj_ind = np.unique(np.argwhere(P_f[:,block_inds]>0)[:,0]) #project inds in P_f that have prog skill in project
#     if len(found_proj_ind):
#         popps_count[found_proj_ind] = 1
#     return popps_count

"""
given user_id (string) and prog_skill (string that is in POPPS_MAP.keys()),
optional binary (boolean) that returns binary/boolean vector instead of true counts if true,
return vector of length threshold that shows number of blocks related to prog_skill used in project
"""
def get_specific_popps_counts(user_id, prog_skill, binary=False, threshold=THRESHOLD, block_types= BLOCK_TYPES):
    if prog_skill not in POPPS_MAP:
        raise Exception("{} not in POPPS_MAP. Select from {}".format(prog_skill, POPPS_MAP.keys()))
    popps_count = np.zeros(threshold)

    P_f = get_freq_matrix(user_id, threshold)
    block_inds = get_block_indices(POPPS_MAP[prog_skill], block_types)
    blk_count = P_f[:,block_inds][:,0] #NB: counts only the first block type mapped to prog_skill in each project
#     print blk_count
    if binary:
        return blk_count>0
    return blk_count
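
# A possible variant (sketch) that counts blocks of *all* types mapped to prog_skill,
# rather than only the first mapped type as above:
def get_specific_popps_counts_total(user_id, prog_skill, threshold=THRESHOLD, block_types=BLOCK_TYPES):
    P_f = get_freq_matrix(user_id, threshold)
    block_inds = get_block_indices(POPPS_MAP[prog_skill], block_types)
    return np.sum(P_f[:, block_inds], axis=1)  # per-project total across all mapped block types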

In [130]:
uid = '000317'
proj_inds = get_specific_popps_counts(uid, 'proc')
# num_proj = np.nonzero(proj_inds>0)[0].shape[0]
# num_proj
np.sum(proj_inds)


[ 0.  2.  3.  0.  0.  0.  0.  0.  0.  0.  0.  0.  4.  0.  0.  0.  0.  0.
  0.  0.]
Out[130]:
9.0

In [150]:
#getting UBC and # of projects w/ procedure for each user
#end UBC
ubc = T_all[:,-1]
# print ubc

is_binary = True
data = np.zeros([len(ubc), 2])
data[:,0] = ubc

# print user_ids

user_ids = get_users(20)
for i in range(len(user_ids)):
    uid = user_ids.keys()[i]
    proj_inds = get_specific_popps_counts(uid, 'proc', binary=is_binary)
    if is_binary:
        count = np.nonzero(proj_inds>0)[0].shape[0]
    else:
        count = np.sum(proj_inds)
    data[i,1] = count

In [151]:
x = data[:,0]
y = data[:,1]
fit = np.polyfit(x,y,1)
p = np.poly1d(fit)


xp = np.linspace(0,200,1000)

_ = plt.plot(x, y, '.', xp, p(xp), '-')
# plt.scatter(x, y)
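
# A possible extension (sketch): slope, intercept, r and p-value in one call
# from scipy.stats import linregress
# slope, intercept, r_value, p_value, std_err = linregress(x, y)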



In [152]:
# calculating correlation
np.corrcoef(x,y)[0,1]


Out[152]:
0.55629524021334065

In [27]:
pf = get_freq_matrix('000317', 20)
block_inds = get_block_indices(POPPS_MAP['proc'], BLOCK_TYPES)
found_proj_ind = np.argwhere(pf[:,block_inds]>0) #locations in P_b that show block in project
found_proj_ind


Out[27]:
array([[ 1,  0],
       [ 1,  3],
       [ 2,  0],
       [ 2,  3],
       [12,  0],
       [12,  3]])

In [37]:
len(np.unique(found_proj_ind[:,0]))


Out[37]:
3

In [43]:
x=np.zeros(10)
x[[2,4,6]]=2
x


Out[43]:
array([ 0.,  0.,  2.,  0.,  2.,  0.,  2.,  0.,  0.,  0.])

In [30]:
pf[:,block_inds]


Out[30]:
array([[ 0.,  0.,  0.,  0.],
       [ 2.,  0.,  0.,  4.],
       [ 3.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 4.,  0.,  0.,  4.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])

Computational Concepts Trajectory


In [ ]: