In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Apply matplotlib's "bmh" style sheet to the figures drawn below.
mpl.style.use('bmh')
# Directory that load_trial() and read_data() read their files from.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this directory exists.
path = "C:/Users/mramire8/Google Drive/AAL-Experiments/aal_python/sr-oracle-test/results-calibrated/zscores/score-analysis"
# File name (inside `path`) of the trial loaded by the cells below.
trial = "trial0-student5k.txt"
def load_trial(filename):
    """Parse ``filename`` from the module-level ``path`` directory with ``np.loadtxt``.

    The first line of the file is skipped (``skiprows=1``); the parsed array
    is returned unchanged.
    """
    location = "/".join((path, filename))
    return np.loadtxt(location, skiprows=1)
def read_data(filename):
    """Read ``filename`` from the module-level ``path`` directory as text.

    Returns a 2-tuple: a numpy array built from every line after the first,
    each stripped and split on tabs, and the raw first line (not stripped).
    """
    with open("/".join((path, filename))) as handle:
        lines = handle.readlines()
    first_line, remaining = lines[0], lines[1:]
    fields = [line.strip().split("\t") for line in remaining]
    return np.array(fields), first_line
# `t`: the trial file's rows after the first line, as tab-split strings; `headers`: the raw first line.
t,headers = read_data(trial)
# `t0`: the same file parsed by np.loadtxt (first line skipped).
t0 = load_trial(trial)
In [3]:
# Size summary of the loaded trial: row count, distinct values in column 1, and the header line.
print("%s %s" % ("Number of sentences", len(t)))
print("%s %s" % ("Number of Documents", len(np.unique(t[:, 1]))))
print("%s %s" % ("Headers", headers))
In [4]:
def get_doc(data, docid):
    """Return the rows of ``data`` whose column 1 equals ``docid``.

    Parameters
    ----------
    data : 2-D array or sequence of rows
        Table whose column 1 is compared against ``docid``.
    docid : scalar
        Value to look for in column 1.

    Returns
    -------
    numpy.ndarray
        A new array holding the matching rows in their original order.  When
        no row matches, the result has zero rows but keeps the column count,
        so column slicing on it (e.g. ``result[:, 3]``) still works.
    """
    rows = np.asarray(data)
    if rows.ndim < 2:
        # Not a regular 2-D table (e.g. an empty list): keep the row-by-row scan.
        return np.array([row for row in data if row[1] == docid])
    # Select with a boolean mask over column 1 (all rows, column index 1).
    return rows[rows[:, 1] == docid]
# 0-based column index read as the "rank" column (RANK in the column list noted in a later cell).
RANK=3
# 0-based column index read as the calibrated rank column (CALRANK in that same column list).
CRANK=5
In [5]:
from scipy.stats import pearsonr
def get_pearson_doc(data):
    """Map every distinct value of column 1 to a Pearson correlation coefficient.

    For each distinct value, the matching rows are fetched with ``get_doc`` and
    the correlation between their ``RANK`` and ``CRANK`` columns is stored
    (first element of ``pearsonr``'s result).
    """
    def _rank_correlation(doc_id):
        # Correlation between the two rank columns within one document.
        rows = get_doc(data, doc_id)
        return pearsonr(rows[:, RANK], rows[:, CRANK])[0]

    return {doc_id: _rank_correlation(doc_id) for doc_id in np.unique(data[:, 1])}
In [6]:
# (docid, correlation) pairs ordered from highest to lowest correlation; show the first ten.
corr = sorted(
    get_pearson_doc(t0).items(),
    key=lambda item: item[1],
    reverse=True,
)
print("\n".join(
    "docid:{}\tcorrel:{}".format(doc_id, score) for doc_id, score in corr[:10]
))
In [7]:
from collections import Counter
# Histogram of the per-document correlations: each distinct value is weighted by how often it occurs.
corr_dist = Counter(pair[1] for pair in corr)
plt.hist(
    list(corr_dist.keys()),
    weights=list(corr_dist.values()),
    bins=np.arange(.8, 1.001, .01),
)
plt.title(
    "Correlation Distribution Ranks - (mean: {0:.3f})".format(
        np.mean([pair[1] for pair in corr])
    )
)
plt.xlabel("Pearson Correlation")
plt.ylabel("Frequency")
Out[7]:
In [35]:
# Column layout of each data row (0-based):
#SENTID DOCID SCORE RANK CALSCORE CALRANK ORAPRED 1LABEL Py0 STUDENTLABEL CorrectlyLabeled? StudentCorrect? RankDiff
# Check whether the sentences picked by the student are correct.
SCORRECT = 11  # correctly labeled by the student
OCORRECT = 10  # correctly labeled by the oracle
# Keep only the rows whose CRANK column is 0.
snippets = t0[t0[:, CRANK] == 0]
print("Correctly labeled snippets by student: %s (%s)" % (
    len(snippets[snippets[:, SCORRECT] == 1]),
    snippets[:, SCORRECT].sum() / len(snippets),
))
# Split the student-correct snippets by the value in column 9 (STUDENTLABEL in the layout above).
print("Snippets by student: Correct 0:%s, correct-1:%s" % (
    sum(1 for row in snippets if row[9] == 0 and row[SCORRECT]),
    sum(1 for row in snippets if row[9] == 1 and row[SCORRECT]),
))
print("Correctly labeled snippets by oracle: %s" % (
    snippets[:, OCORRECT].sum() / len(snippets),
))
In [54]:
# Heading for the snippet breakdown printed by the cells below.
print("Snippets")
from itertools import product
def get_selected(snippets, label, student, oracle):
    """Return, as a list, the rows of ``snippets`` that match all three criteria.

    A row is kept when its column 7 equals ``label``, its ``SCORRECT`` column
    equals ``student`` and its ``OCORRECT`` column equals ``oracle``.
    """
    def _matches(row):
        # Conditions are checked left to right and stop at the first mismatch.
        return row[7] == label and row[SCORRECT] == student and row[OCORRECT] == oracle

    return [row for row in snippets if _matches(row)]
def get_options_selected(snippets, labels, student, oracle, total=250.):
    """Print and collect the snippets matching every (label, student, oracle) combination.

    The combinations are enumerated with ``itertools.product(labels, student, oracle)``.
    For each one the matching rows come from ``get_selected``, and one line is
    printed with the combination, the number of matches and that number
    divided by ``total``.  A final line prints the overall number of matches.

    Parameters
    ----------
    snippets : iterable of rows
        Passed through to ``get_selected``.
    labels, student, oracle : iterables
        Candidate values for the three ``get_selected`` criteria.
    total : number, optional
        Denominator of the printed fraction.  Defaults to ``250.``, the value
        that was previously hard-coded, so existing calls print the same output.

    Returns
    -------
    list
        One entry per combination, in ``product`` order; each entry is the
        list returned by ``get_selected`` for that combination.
    """
    ans = []
    for opt in product(labels, student, oracle):
        selected = get_selected(snippets, *opt)
        count = len(selected)
        # Joined with a single space: the same text the two chained Python 2
        # print statements (the first ending in a comma) produced on one line.
        print("%s %s" % (
            "label=%s\tstudent=%s\toracle=%s" % opt,
            "\t%s\t%s" % (count, count / float(total)),
        ))
        ans.append(selected)
    print("%s %s" % ("Total", sum(len(group) for group in ans)))
    return ans
In [48]:
# Oracle correct; student either correct or wrong; both label values.
labels = [0, 1]
student = [True, False]
oracle = [True]
print("Snippets where the oracle is correct")
get_options_selected(snippets, labels=labels, student=student, oracle=oracle)
In [49]:
# Student correct; oracle either correct or wrong; both label values.
labels = [0, 1]
student = [True]
oracle = [True, False]
print("Snippets where the student is correct")
get_options_selected(snippets, labels=labels, student=student, oracle=oracle)
In [56]:
# Snippets that both the student and the oracle got wrong, for both label values.
# `labels` is set here explicitly so the cell does not depend on a value left
# behind by an earlier cell.
labels = [0, 1]
student = [False]
oracle = [False]
print("Snippets where both the student and the oracle are wrong")
wrong = get_options_selected(snippets, labels, student, oracle)
# One group per (label, student, oracle) combination.
print(len(wrong))
In [61]:
# Column-1 values (document ids) of the rows in the first two groups of `wrong`, first group first.
wrong_docs = list(row[1] for row in wrong[0])
wrong_docs += [row[1] for row in wrong[1]]
In [69]:
# All rows of the first document listed in `wrong_docs`.
w0 = get_doc(t0, wrong_docs[0])
print("%s %s" % ("Document wrong", wrong_docs[0]))
In [73]:
# Rows of `w0` whose CRANK column is 1; show the first one and its OCORRECT column.
scc_rank = w0[w0[:, CRANK] == 1]
print("%s %s" % ("Sencond ranked in w0", scc_rank[0]))
print("%s %s" % ("correct?", scc_rank[0, OCORRECT]))
In [94]:
# For every document in `wrong_docs`, record the OCORRECT value of its first row with CRANK == 1.
seg_option = []
for d in wrong_docs:
    sents = get_doc(t0, d)
    crank_one_rows = sents[sents[:, CRANK] == 1]
    seg_option.append([d, crank_one_rows[0][OCORRECT]])
# Sum of the recorded OCORRECT values (second column of the collected pairs).
print("%s %s" % ("Numb. of correct sencond choise", np.array(seg_option)[:, 1].sum()))
In [82]:
# Number of rows in `snippets` (the rows of t0 whose CRANK column is 0).
len(snippets)
Out[82]:
In [85]:
# Column-1 values (document ids) of the snippets whose OCORRECT and SCORRECT columns are both 0.
# Both conditions are evaluated on the full `snippets` array and combined with `&`,
# so the mask has one entry per row of the array it indexes.  (Filtering on
# OCORRECT first and reusing that shorter mask on `snippets` would select the
# wrong rows or fail.)
bad = snippets[(snippets[:, OCORRECT] == False) & (snippets[:, SCORRECT] == False)][:, 1]
In [86]:
# Do `bad` and `wrong_docs` hold the same document ids?  Both are sorted first
# because they are built in different orders; the result is a single bool,
# which is False when the lengths differ.
np.array_equal(np.sort(bad), np.sort(wrong_docs))
Out[86]:
In [88]:
# Number of entries in `bad`, for comparison with len(wrong_docs).
len(bad)
Out[88]:
In [89]:
# Number of entries in `wrong_docs`, for comparison with len(bad).
len(wrong_docs)
Out[89]:
In [91]:
# Side-by-side view of the two sorted id lists (pairs stop at the shorter list).
print(list(zip(sorted(bad), sorted(wrong_docs))))
In [92]:
# Sorted contents of `bad`.
print(sorted(bad))
In [93]:
# Sorted contents of `wrong_docs`.
print(sorted(wrong_docs))
In [ ]: