In [1]:
import os, sys, functools, json
from collections import namedtuple
import numpy as np
from numpy.random import random_integers
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from pandas import DataFrame
pd.set_option('display.max_colwidth',100)
pd.set_option('display.float_format', '{:.3g}'.format)
import scipy
import scipy.optimize
import sklearn
lib_path = os.path.abspath('../hammer')
sys.path.append(lib_path)
import hammer
In [2]:
# Load the scraped poll records. The context manager guarantees the file
# handle is closed even if JSON decoding fails.
with open("../scrape/items.json", "r") as items_file:
    j = json.load(items_file)
In [28]:
# Peek at four randomly chosen records.
# random_integers(low, high) includes `high`, so passing len(j) could produce
# an out-of-range index; np.random.randint excludes its upper bound.
[j[i] for i in np.random.randint(0, len(j), 4)]
Out[28]:
In [29]:
def lang_gen(j):
    """Yield every entry of each record's "langs" list, in record order.

    Values are yielded once per occurrence, so repeats are not removed.
    """
    for record in j:
        for language in record["langs"]:
            yield language
# Distinct language names appearing anywhere in the records.
language_set = {language for language in lang_gen(j)}
def questions_gen(j):
    """Yield the "slug" value of each record, in record order."""
    for record in j:
        yield record["slug"]
# Distinct question slugs appearing in the records.
question_set = {slug for slug in questions_gen(j)}
# These two languages are excluded from the analysis (in-place removal).
language_set -= {"REBOL", "Agda"}
# We use languages and questions as array dimensions below, so standardize an ordering
languages, questions = sorted(language_set), sorted(question_set)
In [30]:
# One poll outcome: the question slug, the language list and the vote list.
PollResult = namedtuple("PollResult", "question langs votes")


def result_from_json(el, question_set, language_set):
    """Convert one JSON record into a PollResult, or None if it is filtered out.

    A record is kept only when its "slug" is in `question_set` and the first
    two entries of its "langs" list are both in `language_set`. The checks run
    in that order and stop at the first failure.
    """
    if el["slug"] not in question_set:
        return None
    langs = el["langs"]
    if langs[0] not in language_set or langs[1] not in language_set:
        return None
    return PollResult(question=el["slug"], langs=el["langs"], votes=el["votes"])
def create_resdict(languages, questions, j):
    """Build one square vote matrix per question.

    Returns a dict mapping each question to a
    (len(languages), len(languages)) float array. For every accepted record
    with languages (a, b) and votes (v0, v1), v0 is stored at [a, b] and v1 at
    [b, a], using positions taken from the order of `languages`. Records
    rejected by result_from_json are skipped. A later record for the same
    question and language pair overwrites the earlier values.
    """
    size = len(languages)
    position_of = dict((lang, idx) for idx, lang in enumerate(languages))
    res_dict = dict((question, np.zeros([size, size])) for question in questions)
    allowed_questions = set(questions)
    allowed_languages = set(languages)
    for record in j:
        poll = result_from_json(record, allowed_questions, allowed_languages)
        if not poll:
            continue
        first = position_of[poll.langs[0]]
        second = position_of[poll.langs[1]]
        matrix = res_dict[poll.question]
        matrix[first, second] = poll.votes[0]
        matrix[second, first] = poll.votes[1]
    return res_dict
In [31]:
# Per-question vote matrices, indexed in the order of `languages`.
res_dict = create_resdict(languages=languages, questions=questions, j=j)
In [38]:
def expected_wins(dr):
    """Expected win fraction under the ELO formula for a rating gap `dr`.

    Evaluates 1 / (10 ** (-dr / 400) + 1); a gap of 0 gives 0.5. Works
    elementwise on numpy arrays.
    """
    decay = np.power(10, -dr / 400.0)
    return 1.0 / (decay + 1.0)
def win_ratios(elo_scores):
    """Matrix of expected win fractions for every pairing of ratings.

    Entry (i, j) is expected_wins(S[i] - S[j]), so it exceeds 0.5 exactly
    when S[i] > S[j].
    """
    # With the default meshgrid indexing, `across` varies along columns
    # (S[j]) and `down` varies along rows (S[i]).
    across, down = np.meshgrid(elo_scores, elo_scores)
    rating_gap = down - across
    return expected_wins(rating_gap)
def elo_scores(results):
    """Fit one ELO-style rating per row of a square `results` matrix.

    `results + results.T` is used as the number of matches per pairing
    (presumably results[i, j] counts wins of i over j -- confirm against the
    code that builds the matrix). A least-squares fit, started from all-zero
    ratings, drives each row's summed difference between `results` and the
    expected wins towards zero. The fitted ratings are then shifted so that
    their average, weighted by each entry's total number of matches, is zero.
    """
    matches = results + results.T

    def elo_error(candidate):
        # Per-row residual: recorded wins minus wins expected at `candidate`.
        expected = matches * win_ratios(candidate)
        return np.sum(results - expected, axis=1)

    start = np.zeros([results.shape[0]])
    fitted = scipy.optimize.leastsq(elo_error, start)[0]
    games_played = matches.sum(axis=0)
    return fitted - np.average(fitted, weights=games_played)
In [45]:
def create_scores(languages, questions, res_dict):
    """Fit ratings for every question and collect them in one table.

    Returns a DataFrame with one row per language and one column per
    question; column k holds elo_scores(res_dict[questions[k]]).
    """
    table = np.zeros((len(languages), len(questions)))
    for column, question in enumerate(questions):
        table[:, column] = elo_scores(res_dict[question])
    return DataFrame(table, index=languages, columns=questions)
In [46]:
scores = create_scores(languages, questions, res_dict)
# Give nicely formatted names to the columns of `scores`
# DataFrame.from_csv(path) is documented as equivalent to
# read_csv(path, index_col=0, parse_dates=True), and DataFrame.sort() with no
# columns sorted by index; both old methods have been removed from pandas.
# NOTE(review): the rename below is positional -- it assumes the sorted index
# of questions2.csv lines up with the sorted `questions` list; confirm.
dfq = pd.read_csv('questions2.csv', index_col=0, parse_dates=True).sort_index()
scores.columns = dfq.question_text
# print(...) with a single argument behaves the same on Python 2 and 3.
print("Created scores with shape {} for {} languages and {} questions".format(
    scores.shape, scores.shape[0], scores.shape[1]))
[scores.index,
 scores.columns]
Out[46]:
`scores` has shape `(len(languages), len(questions))`, so PCA selects its axes in question space.
In [65]:
# `import sklearn` on its own does not load the `decomposition` subpackage,
# so import it explicitly rather than relying on another module having done so.
import sklearn.decomposition

# Fit a 4-component PCA on the languages x questions score table.
pca = sklearn.decomposition.PCA(n_components=4)
langpcafit = pca.fit(scores)
num_components = langpcafit.components_.shape[0]
component_labels = ["C" + str(i) for i in range(0, num_components)]
# Languages projected onto the components (rows: languages).
langpcc = DataFrame(pca.transform(scores),
                    index=languages,
                    columns=component_labels)
# Component weights per question (rows: question text).
langpcs = DataFrame(langpcafit.components_.transpose(),
                    index=dfq.question_text,
                    columns=component_labels)
In [66]:
# Running total of the variance explained by the first k components.
cevr = np.cumsum(langpcafit.explained_variance_ratio_)
plt.plot(cevr, 'ro')
plt.xlabel('Component')
plt.ylabel('Cumulative explained variance')
plt.ylim(0, 1)
plt.grid()
cevr
Out[66]:
In [67]:
def scatter_simple(X,Y,**kwargs):
    """Open a new 7x7 figure and scatter-plot Y against X.

    Points are drawn with the '.' marker in colour 'y'; any extra keyword
    arguments are forwarded to plt.scatter. Passing `marker` or `color` in
    kwargs raises TypeError because they are already given explicitly.
    Returns None; the figure is left as pyplot's current figure.
    """
    plt.figure(figsize=(7,7))
    plt.scatter(X, Y, marker='.', color='y', **kwargs)
    plt.grid()
    # plt.gray() changes pyplot's colormap setting, which is global state
    # and not limited to this figure.
    plt.gray()
def annotate_scatter(labels,X,Y):
    """Write each label, centred, on its (x, y) point of the current axes.

    Labels are size 12 and sit in a rounded, mostly transparent yellow box
    with zero offset from the point.
    """
    for text, x_pos, y_pos in zip(labels, X, Y):
        box = dict(boxstyle='round,pad=0', fc='yellow', ec='yellow', alpha=.2)
        plt.annotate(text,
                     xy=(x_pos, y_pos),
                     xytext=(0, 0),
                     textcoords='offset points',
                     ha='center',
                     va='center',
                     bbox=box,
                     size=12)
def scatter_components(df, CX="C0", CY="C1", **kwargs):
    """Labelled scatter plot of two columns of `df`, one point per row.

    Each point is annotated with its index label. Both axes use the same
    symmetric range, +/- the largest absolute value found anywhere in `df`
    (all columns, not only CX and CY). Extra keyword arguments go to
    scatter_simple.
    """
    xs, ys = df[CX], df[CY]
    scatter_simple(xs, ys, **kwargs)
    annotate_scatter(df.index, xs, ys)
    plt.title('Languages in component space')
    plt.xlabel(CX + ' (ELO points)')
    plt.ylabel(CY + ' (ELO points)')
    limit = df.abs().max().max()
    axes = plt.gca()
    axes.set_xlim([-limit, limit])
    axes.set_ylim([-limit, limit])
In [95]:
# First pair of components, then the second pair.
scatter_components(langpcc, CX="C0", CY="C1")
scatter_components(langpcc, CX="C2", CY="C3")
In [97]:
def headtail(df, n):
    """Return the first `n` and last `n` rows of `df` as a single frame.

    When the frame has at most 2*n rows the head and tail would overlap and
    rows would be repeated; in that case a copy of the whole frame is
    returned instead, so each row appears exactly once.
    """
    if n >= 0 and 2 * n >= len(df):
        return df.copy()
    return pd.concat([df.head(n), df.tail(n)])
def describe_component(langpcs, cpt):
    """Return column `cpt` of `langpcs` as a one-column frame, largest first."""
    frame = DataFrame(langpcs[cpt])
    # DataFrame.sort was removed from pandas; prefer sort_values when this
    # pandas provides it and fall back to the old spelling otherwise.
    if hasattr(DataFrame, "sort_values"):
        return frame.sort_values(cpt, ascending=False)
    return frame.sort(cpt, ascending=False)
# Five highest- and five lowest-weighted questions for component C0.
headtail(describe_component(langpcs, cpt="C0"), n=5)
Out[97]:
In [101]:
def cpt_summary(langpcs, cpts):
    """For each component, list the index labels of `langpcs` by descending weight.

    Returns a DataFrame with one column per entry of `cpts`; row k of a
    column holds the label with the k-th largest value in that component.
    """
    # DataFrame.sort was removed from pandas; prefer sort_values when available.
    has_sort_values = hasattr(DataFrame, "sort_values")
    summary = DataFrame()
    for cpt in cpts:
        column = DataFrame(langpcs[cpt])
        if has_sort_values:
            ranked = column.sort_values(cpt, ascending=False)
        else:
            ranked = column.sort(cpt, ascending=False)
        summary[cpt] = list(ranked.index)
    return summary
# Top and bottom three questions of every component, side by side.
headtail(cpt_summary(langpcs, cpts=component_labels), n=3)
Out[101]:
In [104]:
def component_influences_summarize(df, FCpt, FLang, extraCols=None):
    """Rank rows by the magnitude of (component weight * language score).

    Parameters
    ----------
    df : DataFrame containing the columns `FCpt` and `FLang`. Two helper
        columns are added to it IN PLACE: FCpt + "*(" + FLang + ")" holding
        the product, and "Abs" + FLang + FCpt holding its absolute value.
        Pass a copy if the caller's frame must stay untouched.
    FCpt : name of the component-weight column.
    FLang : name of the language-score column.
    extraCols : optional iterable of further column names to append to the
        result (defaults to none).

    Returns
    -------
    DataFrame with columns [FCpt, product, FLang] + extraCols, sorted by the
    absolute product in descending order.
    """
    # Avoid a shared mutable default and accept any iterable of names.
    extra = [] if extraCols is None else list(extraCols)
    FLangCpt = FCpt + "*(" + FLang + ")"
    AbsFLangCpt = "Abs" + FLang + FCpt
    df[FLangCpt] = df[FCpt] * df[FLang]
    df[AbsFLangCpt] = df[FLangCpt].abs()
    # DataFrame.sort was removed from pandas; prefer sort_values when available.
    if hasattr(pd.DataFrame, "sort_values"):
        dfs = df.sort_values([AbsFLangCpt], ascending=False)
    else:
        dfs = df.sort([AbsFLangCpt], ascending=False)
    return dfs[[FCpt, FLangCpt, FLang] + extra]
In [105]:
def component_influences(langpcs, scores, FCpt, FLang):
    """Show which questions drive language `FLang` along component `FCpt`.

    Works on a copy of `langpcs`, adds the language's per-question scores as
    a column named `FLang`, and delegates to component_influences_summarize.

    Result:
        index   - question
        columns - component weight, weight * language score, language score
        order   - abs(weight * language score), descending
    """
    weights = langpcs.copy()
    weights[FLang] = scores.T[FLang]
    return component_influences_summarize(weights, FCpt, FLang)
# Ten questions contributing most to Python's position along C0.
component_influences(langpcs, scores, FCpt="C0", FLang="Python").head(10)
Out[105]:
In [108]:
def component_influences_compare(langpcs, scores, FCpt, lang1, lang2):
    """Show which questions separate two languages along component `FCpt`.

    Works on a copy of `langpcs`. Adds a column named lang2 + '-' + lang1
    holding the per-question score difference (lang2 minus lang1), plus both
    languages' own scores, then delegates to component_influences_summarize
    with the two language columns appended as extras.
    """
    difference_label = lang2 + '-' + lang1
    both = [lang1, lang2]
    weights = langpcs.copy()
    by_question = scores.T
    weights[difference_label] = by_question[lang2] - by_question[lang1]
    weights[both] = by_question[both]
    return component_influences_summarize(weights, FCpt, difference_label, both)
# Six questions that most separate Python from C++ along C0.
component_influences_compare(
    langpcs, scores, FCpt="C0", lang1="C++", lang2="Python"
).head(6)
Out[108]:
In [109]:
def comparebycpt(langpcs, scores, component, languages):
    """Put a component's weights next to the scores of selected languages.

    Returns a DataFrame whose first column is `component` from `langpcs`
    and whose remaining columns are the per-question scores of `languages`,
    sorted by the component weight in descending order.
    """
    combined = pd.concat([DataFrame(langpcs[component]),
                          scores.transpose()[languages]],
                         axis=1)
    # DataFrame.sort was removed from pandas; prefer sort_values when available.
    if hasattr(DataFrame, "sort_values"):
        return combined.sort_values(component, ascending=False)
    return combined.sort(component, ascending=False)
# Extremes of C0 with the scores of three languages alongside.
headtail(
    comparebycpt(langpcs, scores, component="C0",
                 languages=["C++", "Python", "F#"]),
    n=3)
Out[109]:
In [110]:
import re
def questions_with(regex):
    """Return the question texts in the global `dfq` that `regex` matches.

    Matching uses re.search, so the pattern may match anywhere in the text.
    """
    pattern = re.compile(regex)
    return [text for text in dfq.question_text if pattern.search(text)]
def highlights(df, languages=None):
    """Placeholder with no implementation: ignores its arguments, returns None."""
    return None
# Questions whose text mentions "readable", with the ten best-scoring
# languages on them.
qs = questions_with("readable")
# DataFrame.sort was removed from pandas; prefer sort_values when available.
(scores[qs].sort_values(qs, ascending=False)
 if hasattr(pd.DataFrame, "sort_values")
 else scores[qs].sort(qs, ascending=False)).head(10)
Out[110]:
In [113]:
def bigdiffs(dfq, scores, language1, language2):
    """Rank questions by the importance-weighted score gap between two languages.

    `dfq` is re-indexed by its "question_text" column and joined with the
    two languages' per-question scores. For each question,
    impdelta = (language1 score - language2 score) * importance.

    Returns a DataFrame indexed by question text with columns
    [language1, language2, "impdelta", "importance"], sorted by impdelta in
    descending order.
    """
    df3 = pd.concat([dfq.reset_index().set_index("question_text"),
                     scores.transpose()[[language1, language2]]],
                    axis=1)
    df3["delta"] = df3[language1] - df3[language2]
    df3["impdelta"] = df3["delta"] * df3.importance
    # DataFrame.sort was removed from pandas; prefer sort_values when available.
    if hasattr(pd.DataFrame, "sort_values"):
        df3 = df3.sort_values("impdelta", ascending=False)
    else:
        df3 = df3.sort("impdelta", ascending=False)
    return df3[[language1, language2, "impdelta", "importance"]]
# Importance-weighted score gaps, Python minus C++.
bd = bigdiffs(dfq, scores, language1="Python", language2="C++")
# Four largest gaps in each direction, without the importance column.
headtail(bd.drop("importance", axis=1), n=4)
Out[113]:
In [114]:
from pandas.tools.plotting import _get_standard_colors
def parcor2(frame, class_column, cols=None, ax=None, color=None,
            use_columns=False, yticks=None, colormap=None,
            **kwds):
    """Parallel-coordinates style plot with the coordinate axis running vertically.

    Every row of `frame` is drawn as one line: the row's values go on the
    x axis and the position of each column goes on the y axis. Rows are
    coloured by their value in `class_column`.

    Parameters
    ----------
    frame: DataFrame
    class_column: str
        Column name containing class names
    cols: list, optional
        A list of column names to use; by default every column except
        `class_column`
    ax: matplotlib.axis, optional
        matplotlib axis object; defaults to the current axes
    color: list or tuple, optional
        Colors to use for the different classes
    use_columns: bool, optional
        If true, the column labels themselves are used as y positions
    yticks: list or tuple, optional
        A list of values to use as y positions, one per plotted column
    colormap: str or matplotlib colormap, default None
        Colormap to use for line colors.
    kwds: keywords
        Options to pass to matplotlib plotting method

    Returns
    -------
    ax: matplotlib axis object

    Raises
    ------
    ValueError
        If `use_columns` is True and the column labels are not numeric, or
        if `yticks` is non-numeric or its length differs from the number of
        plotted columns.

    Notes
    -----
    NOTE(review): relies on `_get_standard_colors` (underscore-prefixed,
    imported at module level from pandas.tools.plotting) and on
    `pd.core.common.pprint_thing`; both locations look version-specific,
    so confirm they exist in the installed pandas.
    NOTE(review): the default y positions use `xrange`, which exists only on
    Python 2.
    """
    import matplotlib.pyplot as plt
    n = len(frame)
    classes = frame[class_column].drop_duplicates()
    class_col = frame[class_column]
    if cols is None:
        df = frame.drop(class_column, axis=1)
    else:
        df = frame[cols]
    used_legends = set([])
    ncols = len(df.columns)
    # determine the y position of each plotted column
    if use_columns is True:
        if not np.all(np.isreal(list(df.columns))):
            raise ValueError('Columns must be numeric to be used as xticks')
        y = df.columns
    elif yticks is not None:
        if not np.all(np.isreal(yticks)):
            raise ValueError('yticks specified must be numeric')
        elif len(yticks) != ncols:
            raise ValueError('Length of yticks must match number of columns')
        y = yticks
    else:
        y = xrange(ncols)
    if ax is None:
        ax = plt.gca()
    # one colour per distinct class value
    color_values = _get_standard_colors(num_colors=len(classes),
                                        colormap=colormap, color_type='random',
                                        color=color)
    colors = dict(zip(classes, color_values))
    for i in range(n):
        x = df.iloc[i].values
        kls = class_col.iat[i]
        label = pd.core.common.pprint_thing(kls)
        # attach a legend label only to the first line of each class
        if label not in used_legends:
            used_legends.add(label)
            ax.plot(x, y, color=colors[kls], label=label, **kwds)
        else:
            ax.plot(x, y, color=colors[kls], **kwds)
    ax.grid()
    ax.set_yticks(y)
    ax.set_yticklabels(df.columns)
    # y[0] / y[-1] raise IndexError when there are no columns to plot
    ax.set_ylim(y[0], y[-1])
    ax.legend(loc='upper right')
    return ax
In [116]:
# Tall figure: one y position per question.
plt.figure(figsize=(6, 40))
# Importance-weighted scores of both languages plus their weighted gap.
bd2 = DataFrame()
bd2["impdelta"] = bd["impdelta"]
bd2["Python"] = bd["importance"] * bd["Python"]
bd2["C++"] = bd["importance"] * bd["C++"]
# After transposing, each of the three series becomes one row (one line).
parcor2(bd2[["Python", "C++", "impdelta"]].T.reset_index(), "index")
# Make the x range symmetric about zero.
xlim = plt.xlim()
plt.xlim(-np.abs(xlim).max(), np.abs(xlim).max())
Out[116]:
In [117]:
# Languages whose C0 coordinate exceeds their C1 coordinate by more than 500.
of_interest = (langpcc.C0 - langpcc.C1) > 500
st = scores.transpose()
imp = DataFrame(dfq.importance)
# Weight every question's scores by its importance. The product is taken on
# raw arrays, so rows are matched by position rather than by label.
# NOTE(review): assumes dfq rows are in the same order as the columns of
# `scores` -- confirm.
df = DataFrame(st.values * imp.values,
               index=st.index,
               columns=st.columns)
average_scores = DataFrame(df.mean(), columns=["mean"])
# DataFrame.sort was removed from pandas; prefer sort_values when available.
(average_scores[of_interest].sort_values(["mean"], ascending=False)
 if hasattr(DataFrame, "sort_values")
 else average_scores[of_interest].sort(["mean"], ascending=False))
Out[117]:
In [ ]: