In [1]:
import re
import json
import importlib
from collections import OrderedDict, Counter
from datetime import datetime
from math import ceil

import numpy as np
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import plot, iplot
import cufflinks as cf

from nltk.metrics.agreement import AnnotationTask

import JobsMapResultsFilesToContainerObjs as JRS
importlib.reload(JRS)
cf.go_offline()
Three agreement coefficients are used to measure inter-worker agreement: they quantify how consistently workers respond to the same images when those images appear in different albums.
alpha()
Krippendorff 1980
pi()
Scott 1955; here, multi-pi. Equivalent to K from Siegel and Castellan (1988).
s()
Bennett, Albert and Goldstein 1954
Observations: every worker has worked on at least two albums.
(maxAlbums,minAlbums,avgAlbums) = (80, 2, 8.220264317180616)
In [36]:
# Per-worker summary stats derived from masterList rows
# (row[0] = worker id, row[2] = that worker's recorded response value).
workerStats = {}
wrkrCnts = {}
for row in masterList:
    wrkrCnts.setdefault(row[0], []).append(row[2])
for wrkr, votes in wrkrCnts.items():
    n_shares = sum(votes)
    n_attempts = len(votes)
    workerStats[wrkr] = {
        'shares': n_shares,
        'total_attempts': n_attempts,
        'proportion': n_shares * 100 / n_attempts,
    }
In [38]:
# Per-worker stats table, sorted by share proportion (ascending).
df = pd.DataFrame.from_dict(workerStats).transpose()
# Assignment instead of inplace=True: keeps the cell idempotent on re-run
# and avoids the pandas inplace anti-pattern.
df = df.sort_values(by='proportion', ascending=True)
In [46]:
df['proportion'] = df['proportion'].apply(lambda x : round(int(x)/10)*10)
In [16]:
print(len(workerStats))
# Workers with more than 10 recorded attempts — i.e. (presumably, at 10
# images per album) workers who worked on more than one album.
workersMulAlbms = [w for w, stats in workerStats.items() if stats['total_attempts'] > 10]
print(len(workersMulAlbms))
In [33]:
# Estimated albums per worker (ceil of attempts / 10 images per album),
# excluding the extreme outlier worker A2E9NUZZ4S2VJ9.
numAlbums = {}
for worker, stats in workerStats.items():
    if worker != 'A2E9NUZZ4S2VJ9':
        numAlbums[worker] = ceil(stats['total_attempts'] / 10)
attempts = list(numAlbums.values())
maxAlbums = max(attempts)
minAlbums = min(attempts)
avgAlbums = sum(attempts) / len(attempts)
maxAlbums, minAlbums, avgAlbums
Out[33]:
In [32]:
# Report any worker with an implausibly high album count (>= 100).
for worker, album_count in numAlbums.items():
    if album_count >= 100:
        print(worker)
In [31]:
numAlbums['A2E9NUZZ4S2VJ9'],workerStats['A2E9NUZZ4S2VJ9']['total_attempts']
Out[31]:
In [20]:
def createResultDict(jobRangeStart, jobRangeEnd, workerData=False):
    """Parse MTurk .results files for photo-album jobs into an ordered dict.

    Reads ../results/photo_album_<i>.results for each job i in
    [jobRangeStart, jobRangeEnd] (inclusive). Each file is tab-separated
    with a header row; double quotes are stripped from every line.

    Args:
        jobRangeStart: first album/job number (inclusive).
        jobRangeEnd: last album/job number (inclusive).
        workerData: if True keep only 'workerid' columns, otherwise keep
            only 'Answer' columns.

    Returns:
        OrderedDict mapping "photo_album_<i>" -> OrderedDict of
        column name -> list of that column's values (one per data row).
        Header/value strings keep trailing newlines, so callers match keys
        by substring (e.g. `'Answer' in key`).
    """
    masterDict = OrderedDict()
    # Fix: the original reused `i` for both the album loop and the row loop,
    # shadowing the outer index; use distinct names.
    for jobNum in range(jobRangeStart, jobRangeEnd + 1):
        inFLTitle = "photo_album_" + str(jobNum)
        inFL = "../results/" + inFLTitle + ".results"
        with open(inFL, "r") as inp:
            # Strip stray double quotes; newlines are deliberately kept.
            lines = [line.replace('"', '') for line in inp]
        header = lines[0].split("\t")
        rows = [line.split("\t") for line in lines[1:]]
        # Column-major view: header field -> list of values down the rows.
        # zip pairs each value with its header (short rows are truncated
        # instead of raising IndexError as the old index loops did).
        resultDict = OrderedDict()
        for row in rows:
            for colName, value in zip(header, row):
                resultDict.setdefault(colName, []).append(value)
        # Keep only the columns of interest for this mode.
        pattern = "workerid" if workerData else "Answer"
        keysOfInterest = [k for k in resultDict if re.search(pattern, k)]
        newDict = OrderedDict()
        for key in keysOfInterest:
            newDict[key] = resultDict[key]
        masterDict[inFLTitle] = newDict
        print(keysOfInterest)
    return masterDict
In [26]:
# Cross-album agreement: for every image that appears in more than one album,
# measure whether workers responded consistently across its appearances.
results = JRS.createResultDict(1,100)
imgAlbumDict = JRS.genImgAlbumDictFromMap("../data/imageGID_job_map_expt2_corrected.csv")
res = JRS.createResultDict(1,100,workerData=True)
# masterList rows: (worker_id, gid, "<gid>_<album>", vote) where vote is
# 1 = share, 0 = no-share, '*' = no response captured.
masterList = []
for album in res.keys(): # loop 1
    responses = res[album]
    workers = responses['workerid']
    for response in responses.keys(): # loop 2
        # Answer columns look like "Answer.<gid>"; skips e.g. 'workerid'.
        if 'Answer' in response and response.split(".")[1].isdigit():
            shrNoShr = []
            gid = response.split(".")[1]
            for shrNShr in responses[response]: # loop 3.1
                if len(shrNShr.split("|")) != 2: # no response captured
                    shrNoShr.append("*")
                elif shrNShr.split("|")[1] == 'share':
                    shrNoShr.append(1)
                else:
                    shrNoShr.append(0)
            # Responses are row-aligned with the workerid column.
            for i in range(len(workers)): # loop 3.2
                masterList.append((workers[i],gid,gid + "_" + album,shrNoShr[i]))
# Images that occur in more than one album.
img_mul_albms = [gid for gid in imgAlbumDict.keys() if len(imgAlbumDict[gid]) > 1]
alphas = []
alphas_new = []
pis = []
kappas = []
for img in img_mul_albms:
    # worker -> {gid_album: vote}, restricted to this image.
    some_dict = {}
    for tup in masterList:
        if tup[1] == img:
            if tup[0] in some_dict.keys():
                some_dict[tup[0]].update({tup[2]:tup[3]})
            else:
                some_dict[tup[0]] = {tup[2]:tup[3]}
    # Keep only workers who saw this image in at least two albums.
    some_dict_final = {worker : some_dict[worker] for worker in some_dict.keys() if len(some_dict[worker].values()) > 1}
    # (coder, item, label) triples, the format AnnotationTask expects.
    reliability_matrix = []
    for worker in some_dict_final.keys():
        dct = some_dict_final[worker]
        for gid_albm in dct.keys():
            reliability_matrix.append((worker, gid_albm, dct[gid_albm]))
    # NOTE(review): t is constructed even when the guard below skips it.
    t = AnnotationTask(data=reliability_matrix)
    if len(some_dict_final) > 1:
        # NOTE(review): KAS is never imported anywhere in this notebook, so
        # this line raises NameError on a fresh run — presumably a
        # Krippendorff-alpha helper module; confirm and add the import.
        alphas.append(KAS.krippendorff_alpha(list(some_dict_final.values()), KAS.nominal_metric, missing_items='*'))
        try:
            alphas_new.append(float("{0:.3f}".format(t.alpha())))
            pis.append(float("{0:.3f}".format(t.pi())))
            kappas.append(float("{0:.3f}".format(t.kappa())))
        except ZeroDivisionError as e:
            # Degenerate matrix (e.g. unanimous labels): record perfect agreement.
            print("Caught")
            pis.append(1)
            kappas.append(1)
            alphas_new.append(1)
In [27]:
sum(alphas)/len(alphas), sum(pis)/len(pis), sum(kappas)/len(kappas), sum(alphas_new)/len(alphas_new)
Out[27]:
In [30]:
np.mean(alphas), np.std(alphas), np.mean(pis), np.std(pis), np.mean(kappas), np.std(kappas)
Out[30]:
In [80]:
# Group responses by album-qualified image id:
# "<gid>_<album>" -> [(worker, gid, vote), ...]
album_wise_dict = {}
for worker, gid, gid_album, vote in masterList:
    album_wise_dict.setdefault(gid_album, []).append((worker, gid, vote))
In [93]:
# Agreement coefficients computed per album-qualified image entry.
alphas = []
pis = []
kappas = []
for reliability_matrix in album_wise_dict.values():
    task = AnnotationTask(data=reliability_matrix)
    try:
        alphas.append(task.alpha())
        pis.append(task.pi())
        kappas.append(task.kappa())
    except ZeroDivisionError:
        # Degenerate matrix (e.g. unanimous labels) — skip it.
        print("Caught")
In [94]:
np.mean(alphas), np.std(alphas), np.mean(pis), np.std(pis), np.mean(kappas), np.std(kappas)
Out[94]:
In [307]:
# Position-bias analysis: does an image's position within an album affect
# how often it is shared?
res = JRS.createResultDict(1, 100, workerData=True)
# masterList rows: (position within album starting at 1, vote 0/1/'*').
masterList = []
for album in res.keys():  # loop 1: one results file per album
    responses = res[album]
    workers = responses['workerid']
    pos = 1  # position of the current image within this album
    for response in responses.keys():  # loop 2: one column per image
        if 'Answer' in response and response.split(".")[1].isdigit():
            shrNoShr = []
            for shrNShr in responses[response]:  # loop 3.1: one row per worker
                if len(shrNShr.split("|")) != 2:  # no response captured
                    shrNoShr.append("*")
                elif shrNShr.split("|")[1] == 'share':
                    shrNoShr.append(1)
                else:
                    shrNoShr.append(0)
            for i in range(len(workers)):  # loop 3.2
                masterList.append((pos, shrNoShr[i]))
            pos += 1
# Drop rows where no response was captured.
masterList = [row for row in masterList if row[1] != '*']
# Tally shares and appearances per position.
# Assumes at most 20 positions per album (fixed-size arrays) — TODO confirm.
pos_shares = [0] * 20
positions = [0] * 20
for position, shared in masterList:
    pos_shares[position - 1] += shared
    positions[position - 1] += 1
# Percentage of all appearances at each position.
# Hoist the loop-invariant total (the original re-summed it every iteration)
# and avoid reusing the name `pos` for a different meaning.
total_appearances = sum(positions)
positions_norm = [cnt * 100 / total_appearances for cnt in positions]
# Share count scaled to the appearance percentage at each position.
pos_shr_norm = [pos_shares[i] * positions_norm[i] / positions[i] for i in range(len(positions))]
In [308]:
# Figure: grey bars = how often an image appeared at each position,
# blue line = how often it was shared there.
layout = go.Layout(
    showlegend=False,
    legend=dict(x=0.5, y=0, font=dict(size=15)),
    xaxis=dict(title='Position of the image', ticklen=5, zeroline=True),
    yaxis=dict(ticklen=5),
)
position_axis = list(range(1, 21))
trace1 = go.Bar(
    name="No. of times image appeared in x position",
    x=position_axis,
    y=positions_norm,
    opacity=0.5,
    marker=dict(color='grey'),
)
trace2 = go.Scatter(
    name="No. of times image shared in x position",
    x=position_axis,
    y=pos_shr_norm,
    opacity=1,
    marker=dict(color='blue'),
    mode='lines',
)
data = [trace1, trace2]
fig = dict(data=data, layout=layout)
iplot(fig, filename="Expt2 Training data distributions")
In [2]:
import DeriveFinalResultSet as drs, htmltag as HT
def genHTMLTableFiles(shrCntsObj):
    """Build share-proportion and total-count tables from a share-counts object.

    Returns a tuple: (proportion DataFrame, its HTML rendering,
    totals DataFrame).
    """
    proportions = drs.getShrProp(shrCntsObj)
    totals = drs.genTotCnts(shrCntsObj)
    prop_df = pd.DataFrame(proportions, index=['Share Proportion']).transpose()
    tot_df = pd.DataFrame(totals, index=['Total']).transpose()
    return prop_df, prop_df.to_html(bold_rows=False), tot_df
In [268]:
ftr = 'AGE'
In [98]:
def get_plots_per_ftr(ftr, ftr_txt):
    """Build a plotly figure contrasting appearance rate vs. share rate for
    each value of an image feature (e.g. AGE, SPECIES, VIEW_POINT).

    Args:
        ftr: feature column name understood by drs.ovrallShrCntsByFtr.
        ftr_txt: human-readable x-axis title.

    Returns:
        dict(data=..., layout=...) ready to pass to iplot().
    """
    d = drs.ovrallShrCntsByFtr(drs.gidAidMapFl, drs.aidFeatureMapFl, ftr, drs.imgJobMap, 1, 100)
    # (Removed unused HT.h3 heading the original built and discarded.)
    df1, tb1, df_tot = genHTMLTableFiles(d)
    dct = df_tot.to_dict()['Total']
    # Split the totals dict into per-feature-value 'total' and 'share' buckets.
    # NOTE(review): totals fold both 'UNIDENTIFIED' and 'unknown' into
    # 'unknown', but shares only fold 'UNIDENTIFIED' — possibly an
    # unintended asymmetry; confirm before changing.
    totals_dct = {}
    shares_dct = {}
    for key in dct.keys():
        if 'total' in key:
            if 'UNIDENTIFIED' in key or 'unknown' in key:
                totals_dct['unknown'] = dct[key]
            else:
                totals_dct[key[0]] = dct[key]
        elif 'share' in key:
            if 'UNIDENTIFIED' in key:
                shares_dct['unknown'] = dct[key]
            else:
                shares_dct[key[0]] = dct[key]
    df_total = pd.DataFrame(totals_dct, index=['total']).transpose()
    df_share = pd.DataFrame(shares_dct, index=['shared']).transpose()
    df_total.reset_index(inplace=True)
    df_share.reset_index(inplace=True)
    df = pd.merge(df_total, df_share)
    # Percentage of appearances per feature value, and share rate scaled to it.
    df['total_proportion'] = df.total * 100 / sum(list(df.total))
    df['share_proportion'] = df.shared * df.total_proportion / df.total
    df = df.sort_values(by=['total_proportion'], ascending=False)
    layout = go.Layout(
        showlegend=False,
        legend=dict(x=0.5, y=1, font=dict(size=15)),
        xaxis=dict(title=ftr_txt, ticklen=5, zeroline=True, tickangle=45),
        yaxis=dict(ticklen=5),
        # Fix: plotly's barmode value is 'group'; 'grouped' is invalid.
        barmode='group',
    )
    trace1 = go.Bar(
        x=list(range(1, len(df["index"]) + 1)),
        name="No. of times image with 'X' appeared",
        y=list(df.total_proportion),
        opacity=0.5,
        marker=dict(color='grey'),
    )
    trace2 = go.Scatter(
        name="No. of times image with 'X' shared",
        x=list(range(1, len(df["index"]) + 1)),
        y=list(df.share_proportion),
        opacity=1,
        marker=dict(color='blue'),
    )
    data = [trace1, trace2]
    return dict(data=data, layout=layout)
In [294]:
# Appearance vs. share rate broken down by the animal's viewpoint.
fig = get_plots_per_ftr('VIEW_POINT', 'View point of the animal')
iplot(fig,filename="Expt2 Training data distributions")
In [295]:
# Appearance vs. share rate broken down by the animal's age class.
fig = get_plots_per_ftr('AGE', 'Age of the animal')
iplot(fig,filename="Expt2 Training data distributions")
In [296]:
# Appearance vs. share rate broken down by species.
fig = get_plots_per_ftr('SPECIES', 'Species of the animal')
iplot(fig,filename="Expt2 Training data distributions")
In [99]:
# Appearance vs. share rate broken down by contributing photographer.
fig = get_plots_per_ftr('CONTRIBUTOR', 'ID\'s of the contributor who took images')
iplot(fig,filename="Expt2 Training data distributions")
In [305]:
In [87]:
# Rebuild masterList as per-image (gid, vote) pairs for the tallies below.
results = JRS.createResultDict(1, 100)
imgAlbumDict = JRS.genImgAlbumDictFromMap("../data/imageGID_job_map_expt2_corrected.csv")
res = JRS.createResultDict(1, 100, workerData=True)
masterList = []
for album, responses in res.items():  # loop 1
    workers = responses['workerid']
    for column, answers in responses.items():  # loop 2
        # Only image-answer columns ("Answer.<gid>"); guard clause skips the rest.
        if 'Answer' not in column or not column.split(".")[1].isdigit():
            continue
        gid = column.split(".")[1]
        shrNoShr = []
        for raw in answers:  # loop 3.1
            parts = raw.split("|")
            if len(parts) != 2:
                shrNoShr.append("*")  # no response captured
            else:
                shrNoShr.append(1 if parts[1] == 'share' else 0)
        for i in range(len(workers)):  # loop 3.2
            masterList.append((gid, shrNoShr[i]))
# Drop rows where no response was captured.
masterList = [row for row in masterList if row[1] != '*']
In [88]:
# Per-image tallies: total responses per gid and share count per gid.
tots = {}
shares = {}
for gid, vote in masterList:
    if gid not in tots:
        tots[gid] = 0
        shares[gid] = 0
    tots[gid] += 1
    shares[gid] += vote
In [89]:
# One row per gid with its total and share counts (joined on 'index' = gid).
df_tots = pd.DataFrame(tots, index=['totals']).transpose().reset_index()
df_shares = pd.DataFrame(shares, index=['shares']).transpose().reset_index()
df = df_tots.merge(df_shares)
In [90]:
# Load per-image EXIF metadata and derive day/hour features from the timestamp.
inpExifFl = "../data/GZC_exifs_beauty_full.json"
with open(inpExifFl,"r") as inpJsonFl:
    exifJsonObj = json.load(inpJsonFl)
exifDf = pd.DataFrame(exifJsonObj).transpose()
exifDf['date'] = exifDf['date'].apply(lambda x : datetime.strptime(x,'%Y-%m-%d %H:%M:%S'))
exifDf['day'] = exifDf.date.apply(lambda x : x.day)
exifDf['hour'] = exifDf.date.apply(lambda x : x.hour)
# Use the explicit axis keyword (positional axis is deprecated in pandas)
# and plain assignment instead of inplace=True for idempotent re-runs.
exifDf = exifDf.drop(['size', 'date'], axis=1)
exifDf = exifDf.reset_index()
In [91]:
# Attach the EXIF features to the per-image share counts (join on 'index').
df = df.merge(exifDf)
df.head()
Out[91]:
In [92]:
# Aggregate appearance and share counts by hour of day.
grouped_by_hour = df.groupby(['hour'])
tots_hour = grouped_by_hour['totals'].sum()
shares_hour = grouped_by_hour['shares'].sum()
In [27]:
sum(df.totals)
Out[27]:
In [97]:
# Share rate vs. hour-of-day figure.
# NOTE(review): this cell ran as In [97] but appears before the cells that
# create df['total_proportion'] / df['share_proportion'] (In [93]-[95] below),
# so it fails on a fresh top-to-bottom run. Reorder the cells to fix.
layout = go.Layout(
    showlegend=False,
    legend=dict(x=0.5, y=1, font=dict(size=15)),
    xaxis=dict(
        title='Time of the day when the picture was clicked',
        ticklen=5,
        zeroline=True,
    ),
    yaxis=dict(ticklen=5),
    # Fix: plotly's barmode value is 'group'; 'grouped' is invalid.
    barmode='group',
)
trace1 = go.Bar(
    x=list(df["index"]),
    name="No. of times image with 'X' appeared",
    y=list(df.total_proportion),
    opacity=0.5,
    marker=dict(color='grey'),
)
trace2 = go.Scatter(
    name="No. of times image with 'X' shared",
    x=list(df["index"]),
    y=list(df.share_proportion),
    opacity=1,
    marker=dict(color='blue'),
)
data = [trace1, trace2]
fig = dict(data=data, layout=layout)
iplot(fig, filename="Expt2 Training data distributions")
In [93]:
# Series -> two-column frames keyed by hour. The dict round-trip drops the
# 'hour' index name, so the hour column ends up named 'index' — which is
# exactly what the merge in the next cell joins on; don't "simplify" this.
tots_hour = pd.DataFrame(tots_hour.to_dict(), index=['totals']).transpose().reset_index()
shares_hour = pd.DataFrame(shares_hour.to_dict(), index=['shares']).transpose().reset_index()
In [94]:
df = pd.merge(tots_hour, shares_hour)
In [95]:
# Convert raw hourly counts to percentages of all appearances, then scale
# the share counts by that appearance percentage.
total_count = sum(df.totals)
df['total_proportion'] = df.totals * 100 / total_count
df['share_proportion'] = df.shares * df.total_proportion / df.totals
In [81]:
df.sort_values(by=['total_proportion'], ascending=False, inplace=True)
In [96]:
df
Out[96]:
In [ ]: