In [1]:
import csv
import urllib2
from urllib import urlencode
import os, os.path
from bs4 import BeautifulSoup
from dateutil.parser import parse
import re
import numpy as np
import sys
# Machine-specific locations used by this notebook.  Files opened by a
# relative name after this point resolve against DATA_DIR, and modules
# living in PYSEQUTILS_DIR become importable.
# NOTE(review): both paths are absolute and tied to one workstation --
# adjust them before running elsewhere.
DATA_DIR = '/home/will/Dropbox/BMES375/Summer13/'
PYSEQUTILS_DIR = '/home/will/PySeqUtils/'

os.chdir(DATA_DIR)
sys.path.append(PYSEQUTILS_DIR)
In [2]:
from PlottingTools import make_heatmap_df
In [3]:
# Build a lookup from each student's BannerWebID to the Cody player id.
# The player id is taken from the last path segment of the 'Cody Url'
# column: everything before the first '-' in that segment.
url_maps = {}
with open('cody_urls.csv') as handle:
    for record in csv.DictReader(handle):
        last_segment = record['Cody Url'].rsplit('/', 1)[-1]
        url_maps[record['BannerWebID']] = last_segment.split('-')[0]
In [4]:
# Read the list of assigned problems as (problem id, due date) pairs.
# The 'DueDate' column is converted with dateutil's parse(); the problem
# id is kept as the raw string from the CSV.
cody_problems = []
with open('cody_problems.csv') as handle:
    cody_problems.extend(
        (entry['CodyProblem'], parse(entry['DueDate']))
        for entry in csv.DictReader(handle)
    )
In [5]:
def get_web_data(user_num, problem_num):
    """Download the Cody solutions search page for one player and problem.

    Parameters
    ----------
    user_num : value convertible with str()
        Cody player id, placed in the ``player_id:`` search term.
    problem_num : value convertible with str()
        Cody problem id, placed in the ``problem_id:`` search term.

    Returns
    -------
    The raw body of the HTTP response, as returned by ``read()``.

    Raises
    ------
    Whatever ``urllib2.urlopen`` raises on network or HTTP failure.
    """
    from contextlib import closing

    # Let urlencode do the escaping (':' -> '%3A', ' ' -> '+') instead of
    # splicing pre-escaped fragments together by hand; for plain ids the
    # resulting query string is identical, and unusual characters in an id
    # are now escaped rather than corrupting the URL.
    search = 'player_id:' + str(user_num) + ' problem_id:' + str(problem_num)
    query = urlencode({'term': search})
    url = 'http://www.mathworks.com/matlabcentral/cody/solutions'

    # urllib2 responses are not context managers, so wrap in closing() to
    # release the connection as soon as the body has been read.
    with closing(urllib2.urlopen(url + '?' + query)) as response:
        return response.read()
def get_sols(soup):
    """Return every <div class="solution-metric"> element found in `soup`.

    `soup` is any object exposing a BeautifulSoup-style ``find_all``; the
    result is whatever that call returns.
    """
    metric_filter = {'class': 'solution-metric'}
    return soup.find_all('div', attrs=metric_filter)
def get_solution_size(solution):
    """Convert one solution element's <span> text into a numeric size.

    Returns ``np.inf`` when the text is exactly 'Incorrect'; otherwise the
    text is converted with ``int()`` (which raises ValueError for
    non-numeric text).
    """
    label = solution.span.text
    return np.inf if label == 'Incorrect' else int(label)
def get_grade(user, problem):
    """Yield the size of every solution `user` submitted for `problem`.

    The solutions page is fetched with get_web_data(), parsed with
    BeautifulSoup, and each element returned by get_sols() is converted
    with get_solution_size() (so incorrect submissions come out as
    ``np.inf``).

    If the page lists no solutions at all, a single ``None`` is yielded so
    the caller still gets one placeholder entry for the user/problem pair.

    Note: the previous version used ``for ... else`` without a ``break``,
    so the ``else`` branch always ran and an extra ``None`` was yielded
    after the real sizes as well.  That added a spurious entry to every
    user/problem pair that did have submissions.
    """
    soup = BeautifulSoup(get_web_data(user, problem))
    found_any = False
    for sol in get_sols(soup):
        found_any = True
        yield get_solution_size(sol)
    if not found_any:
        # Placeholder only when there were genuinely no submissions.
        yield None
In [6]:
from itertools import product
from concurrent.futures import ProcessPoolExecutor
from itertools import imap
def linker(tup):
    """Collect result rows for one (student, problem) combination.

    `tup` has the shape ``((bw_id, user_num), (problem, due_date))``.  The
    due date is unpacked but not used here.  Returns a list of
    ``(bw_id, 'P-' + problem, size)`` tuples, one per value yielded by
    get_grade(user_num, problem).
    """
    (bw_id, user_num), (problem, _due_date) = tup
    return [(bw_id, 'P-' + problem, size)
            for size in get_grade(user_num, problem)]
# Fan the (student, problem) combinations out over a pool of 10 worker
# processes and flatten the per-combination row lists into tmp_data.
# ex.map() yields results in input order, so tmp_data follows the order
# produced by product().
pairs = product(url_maps.items(), cody_problems)
with ProcessPoolExecutor(max_workers=10) as pool:
    tmp_data = []
    for rows in pool.map(linker, pairs):
        tmp_data.extend(rows)
In [7]:
import pandas as pd
# One row per scraped entry: student id, problem label, solution size.
grade_df = pd.DataFrame(
    tmp_data,
    columns=['BW_id', 'Prob', 'Size'],
)
In [8]:
# Summarise 'Size' per (student, problem): the minimum size ('amin') and
# the number of entries ('len').  Groups with a missing aggregate are
# dropped before display.
attempt_stats = pd.pivot_table(
    grade_df,
    values='Size',
    rows=['BW_id', 'Prob'],
    aggfunc=[np.min, len],
)
pdata = attempt_stats.dropna()
pdata.head()
Out[8]:
In [18]:
# Student x problem table of the best (minimum) solution size, drawn as a
# heatmap with problems along the first axis of the transposed table.
# NOTE(review): `plt` is not imported in the visible cells -- presumably
# supplied by a pylab-enabled session; confirm before a fresh run.
flat_stats = pdata.reset_index()
best_scores = pd.pivot_table(
    flat_stats,
    values='amin',
    rows='BW_id',
    cols='Prob',
)
fig = make_heatmap_df(best_scores.T, figsize=(10, 10))
score_bar = plt.colorbar()
score_bar.set_label('Best Solution Nodesize')
In [27]:
# Student x problem table of entry counts, plotted on a log10 colour
# scale; the colourbar ticks are placed at log10 of the listed counts and
# labelled with the plain counts so the scale reads in tries.
# NOTE(review): `plt` is not imported in the visible cells -- presumably
# supplied by a pylab-enabled session; confirm before a fresh run.
flat_counts = pdata.reset_index()
num_tries = pd.pivot_table(
    flat_counts,
    values='len',
    rows='BW_id',
    cols='Prob',
    aggfunc=np.sum,
)
log_tries = num_tries.applymap(np.log10)
fig = make_heatmap_df(log_tries.T, figsize=(10, 10))

tick_values = [5, 10, 15, 20, 30, 40, 50, 60, 100]
tries_bar = plt.colorbar()
tries_bar.set_label('#Tries')
tries_bar.set_ticks([np.log10(count) for count in tick_values])
tries_bar.set_ticklabels([str(count) for count in tick_values])
In [ ]: