In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.style.use('bmh')

path = "C:/Users/mramire8/Google Drive/AAL-Experiments/aal_python/sr-oracle-test/results-calibrated/zscores/score-analysis"
trial = "trial0-student5k.txt"

def load_trial(filename):
    data = np.loadtxt(path+"/"+filename, skiprows=1)
    return data

def read_data(filename):
    f = open(path+"/"+filename)
    with f:
        lines = f.readlines() 
    return np.array([l.strip().split("\t") for l in lines[1:]]), lines[0]

t,headers = read_data(trial)
t0 = load_trial(trial)

In [3]:
print "Number of sentences", len(t)
print "Number of Documents", len(np.unique(t[:,1]))
print "Headers", headers


Number of sentences 3259
Number of Documents 250
Headers SENTID	DOCID	SCORE	RANK	CALSCORE	CALRANK	ORAPRED	1LABEL	Py0	STUDENTLABEL	CorrectlyLabeled?	StudentCorrect?	RankDiff


In [4]:
def get_doc(data, docid):
    return np.array([d for d in data if d[1] == docid])
#     return data[data[1]==docid,:]
RANK=3
CRANK=5

In [5]:
from scipy.stats import pearsonr
def get_pearson_doc(data):
    result = dict()
    for did in np.unique(data[:,1]):
        doc =get_doc(data,did)
        result[did]=pearsonr(doc[:,RANK], doc[:,CRANK])[0]
    return result

In [6]:
corr = sorted(get_pearson_doc(t0).items(),key=lambda x: x[1], reverse=True)

print "\n".join(["docid:{}\tcorrel:{}".format(d,p) for d,p in corr[:10]])


docid:8198.0	correl:1.0
docid:8203.0	correl:1.0
docid:4108.0	correl:1.0
docid:6744.0	correl:1.0
docid:35.0	correl:1.0
docid:11300.0	correl:1.0
docid:8355.0	correl:1.0
docid:3629.0	correl:1.0
docid:2610.0	correl:1.0
docid:7469.0	correl:1.0
C:\Python27\lib\site-packages\scipy\stats\stats.py:2436: RuntimeWarning: invalid value encountered in double_scalars
  r = r_num / r_den
C:\Python27\lib\site-packages\scipy\stats\stats.py:4184: RuntimeWarning: invalid value encountered in less
  x = np.where(x < 1.0, x, 1.0)  # if x > 1 then return 1.0

In [7]:
from collections import Counter
corr_dist = Counter([c[1] for c in corr])
plt.hist(corr_dist.keys(),weights=corr_dist.values(), bins=np.arange(.8,1.001,.01))
plt.title("Correlation Distribution Ranks - (mean: {0:.3f})".format(np.mean([c[1] for c in corr])))
plt.xlabel("Pearson Correlation")
plt.ylabel("Frequency")


Out[7]:
<matplotlib.text.Text at 0xb4381d0>

In [35]:
#SENTID	DOCID	SCORE	RANK	CALSCORE	CALRANK	ORAPRED	1LABEL	Py0	STUDENTLABEL	CorrectlyLabeled?	StudentCorrect?	RankDiff
# Chek if the sentences picked by the students are correct?
SCORRECT = 11 ## correctly labeled by the student
OCORRECT = 10 # correctly labeled by the oracle
snippets = t0[t0[:,CRANK] == 0]
print "Correctly labeled snippets by student: %s (%s)" % (len(snippets[snippets[:,SCORRECT]==1]),snippets[:,SCORRECT].sum()/len(snippets))
print "Snippets by student: Correct 0:%s, correct-1:%s" % (len([s for s in snippets if s[9]==0 and s[SCORRECT]]),len([s for s in snippets if s[9]==1 and s[SCORRECT]]))
print "Correctly labeled snippets by oracle: %s" % (snippets[:,OCORRECT].sum()/len(snippets))


Correctly labeled snippets by student: 189 (0.756)
Snippets by student: Correct 0:99, correct-1:90
Correctly labeled snippets by oracle: 0.78

In [54]:
print "Snippets" 
from itertools import product

def get_selected(snippets, label, student, oracle):
    return [s for s in snippets if s[7]==label and s[SCORRECT] == student and s[OCORRECT] == oracle]

def get_options_selected(snippets,labels, student, oracle):
    options = product(labels, student, oracle)
    tot = 0
    ans= []
    for opt in options:
        s = get_selected(snippets, *opt)
        print "label=%s\tstudent=%s\toracle=%s" % opt,
        print "\t%s\t%s" % (len(s), len(s)/250.)
        tot += len(s)
        ans.append(s)
    print "Total", tot
    return ans


Snippets

In [48]:
labels=[0,1]
student=[True, False]
oracle=[True]

print "Snippets where the oracle is correct"
get_options_selected(snippets, labels,student, oracle)


Snippets where the oracle is correct
label=0	student=True	oracle=True 	95	0.38
label=0	student=False	oracle=True 	6	0.024
label=1	student=True	oracle=True 	83	0.332
label=1	student=False	oracle=True 	11	0.044
Total 195

In [49]:
labels=[0,1]
student=[True]
oracle=[True, False]

print "Snippets where the student is correct"
get_options_selected(snippets, labels,student, oracle)


Snippets where the student is correct
label=0	student=True	oracle=True 	95	0.38
label=0	student=True	oracle=False 	4	0.016
label=1	student=True	oracle=True 	83	0.332
label=1	student=True	oracle=False 	7	0.028
Total 189

In [56]:
# wrong = get_selected(snippets, labels, slabels=[0,1]
student=[False]
oracle=[False]

print "Snippets where the oracle is correct"
wrong = get_options_selected(snippets, labels,student, oracle)
print len(wrong)


Snippets where the oracle is correct
label=0	student=False	oracle=False 	18	0.072
label=1	student=False	oracle=False 	26	0.104
Total 44
2

In [61]:
wrong_docs= [s[1] for s in wrong[0]]
wrong_docs.extend([s[1] for s in wrong[1]])

In [69]:
w0=get_doc(t0, wrong_docs[0])

print "Document wrong", wrong_docs[0]


Document wrong 11235.0

In [73]:
scc_rank = w0[w0[:,CRANK]==1]
print "Sencond ranked in w0", scc_rank[0]
print "correct?", scc_rank[0][OCORRECT]


 Sencond ranked in w0 [  1.00000000e+00   1.12350000e+04   7.67367050e-01   0.00000000e+00
   1.67110300e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
   7.67367050e-01   0.00000000e+00   1.00000000e+00   1.00000000e+00
   1.00000000e+00]
correct? 1.0

In [94]:
seg_option = []
for d in wrong_docs:
    sents = get_doc(t0,d)
    seg_option.append([d, sents[sents[:,CRANK] ==1][0][OCORRECT]])

print "Numb. of correct sencond choise", np.array(seg_option)[:,1].sum()


Numb. of correct sencond choise 25.0

In [82]:
len(snippets)


Out[82]:
250

In [85]:
bad = snippets[snippets[snippets[:,OCORRECT]==False][:,SCORRECT]==False][:,1]

In [86]:
bad==wrong_docs


Out[86]:
array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False], dtype=bool)

In [88]:
len(bad)


Out[88]:
44

In [89]:
len(wrong_docs)


Out[89]:
44

In [91]:
print zip(sorted(bad), sorted(wrong_docs))


[(77.0, 30.0), (790.0, 77.0), (828.0, 284.0), (1263.0, 447.0), (1388.0, 452.0), (1768.0, 604.0), (1844.0, 605.0), (1857.0, 732.0), (1868.0, 790.0), (2193.0, 828.0), (2464.0, 1376.0), (2617.0, 1768.0), (2633.0, 1844.0), (3191.0, 1913.0), (3508.0, 3304.0), (3652.0, 3629.0), (3812.0, 3681.0), (3907.0, 3698.0), (4251.0, 4015.0), (4910.0, 4408.0), (5304.0, 4426.0), (5421.0, 4698.0), (6753.0, 4734.0), (7285.0, 5147.0), (7857.0, 5159.0), (7879.0, 5232.0), (8009.0, 5256.0), (8190.0, 5320.0), (8965.0, 5483.0), (8971.0, 6232.0), (9826.0, 6377.0), (9865.0, 6744.0), (9977.0, 6797.0), (10319.0, 6805.0), (10339.0, 9697.0), (10545.0, 10442.0), (11235.0, 10545.0), (11345.0, 11235.0), (11670.0, 11321.0), (11770.0, 11770.0), (12063.0, 12002.0), (12113.0, 12099.0), (12164.0, 12137.0), (12389.0, 12389.0)]

In [92]:
print sorted(bad)


[77.0, 790.0, 828.0, 1263.0, 1388.0, 1768.0, 1844.0, 1857.0, 1868.0, 2193.0, 2464.0, 2617.0, 2633.0, 3191.0, 3508.0, 3652.0, 3812.0, 3907.0, 4251.0, 4910.0, 5304.0, 5421.0, 6753.0, 7285.0, 7857.0, 7879.0, 8009.0, 8190.0, 8965.0, 8971.0, 9826.0, 9865.0, 9977.0, 10319.0, 10339.0, 10545.0, 11235.0, 11345.0, 11670.0, 11770.0, 12063.0, 12113.0, 12164.0, 12389.0]

In [93]:
print sorted(wrong_docs)


[30.0, 77.0, 284.0, 447.0, 452.0, 604.0, 605.0, 732.0, 790.0, 828.0, 1376.0, 1768.0, 1844.0, 1913.0, 3304.0, 3629.0, 3681.0, 3698.0, 4015.0, 4408.0, 4426.0, 4698.0, 4734.0, 5147.0, 5159.0, 5232.0, 5256.0, 5320.0, 5483.0, 6232.0, 6377.0, 6744.0, 6797.0, 6805.0, 9697.0, 10442.0, 10545.0, 11235.0, 11321.0, 11770.0, 12002.0, 12099.0, 12137.0, 12389.0]

In [ ]: