In [1]:
import JobsMapResultsFilesToContainerObjs as JRS
from nltk.metrics.agreement import AnnotationTask
from collections import OrderedDict, Counter
import re, json
import importlib
importlib.reload(JRS)
import pandas as pd
import numpy as np  # used for the mean/std summaries below
import plotly.plotly as py
import cufflinks as cf
cf.go_offline()
from math import ceil
from plotly.offline import plot, iplot
import plotly.graph_objs as go
from datetime import datetime


Inter-annotator agreement scores for workers

Three coefficients are used to measure inter-worker agreement. They quantify the degree to which workers make the same share/no-share decision for an image when that image appears in different albums. The three coefficients (as named in nltk.metrics.agreement) are listed below; a toy example follows the list.

alpha()
Krippendorff 1980

pi()
Scott 1955; here, multi-pi. Equivalent to K from Siegel and Castellan (1988).

kappa()
Cohen 1960; here, averaged over coder pairs (this is the third coefficient actually computed in the cells below).
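
As a quick reference, here is a minimal toy example of the nltk AnnotationTask API used below. The data is a list of (coder, item, label) triples, and the coefficients the cells below call are alpha(), pi() and kappa():

toy = AnnotationTask(data=[
    ('c1', 'i1', 1), ('c1', 'i2', 0), ('c1', 'i3', 1),
    ('c2', 'i1', 1), ('c2', 'i2', 1), ('c2', 'i3', 1),
])
toy.alpha(), toy.pi(), toy.kappa()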

Observation: every worker has worked on at least two albums.

(maxAlbums, minAlbums, avgAlbums) = (80, 2, 8.220264317180616)


In [36]:
workerStats = {}
wrkrCnts = {}
for row in masterList:
    # collect each worker's 0/1 share responses (row[0] = workerid, row[2] = share flag)
    wrkrCnts[row[0]] = wrkrCnts.get(row[0], []) + [row[2]]

for wrkr in wrkrCnts.keys():
    workerStats[wrkr] = {
        'shares': sum(wrkrCnts[wrkr]),
        'total_attempts': len(wrkrCnts[wrkr]),
        'proportion': sum(wrkrCnts[wrkr]) * 100 / len(wrkrCnts[wrkr]),
    }

In [38]:
df = pd.DataFrame.from_dict(workerStats).transpose()

df.sort_values(by='proportion',inplace=True,ascending=True)

In [46]:
df['proportion'] = df['proportion'].apply(lambda x : round(int(x)/10)*10)  # bin proportions to the nearest 10 (e.g. 87 -> 90)

In [16]:
print(len(workerStats.keys()))

workersMulAlbms = [worker for worker in workerStats.keys() if workerStats[worker]['total_attempts'] > 10]
print(len(workersMulAlbms))

## Every worker worked on more than 1 album


228
228

In [33]:
# exclude the single outlier worker (inspected in the next two cells)
numAlbums = {worker : ceil(workerStats[worker]['total_attempts']/10) for worker in workerStats.keys() if worker != 'A2E9NUZZ4S2VJ9'}
attempts = list(numAlbums.values())
maxAlbums = max(attempts)
minAlbums = min(attempts)
avgAlbums = sum(attempts)/len(attempts)

maxAlbums,minAlbums,avgAlbums


Out[33]:
(80, 2, 8.220264317180616)

In [32]:
for worker in numAlbums.keys():
    if numAlbums[worker] >= 100:
        print(worker)


A2E9NUZZ4S2VJ9

In [31]:
numAlbums['A2E9NUZZ4S2VJ9'],workerStats['A2E9NUZZ4S2VJ9']['total_attempts']


Out[31]:
(120, 1198)

In [20]:
def createResultDict(jobRangeStart,jobRangeEnd,workerData=False):
    masterDict = OrderedDict()
    keysOfInterest = []
    for i in range(jobRangeStart,jobRangeEnd+1):
        inFLTitle = "photo_album_" + str(i)
        inFL = "../results/photo_album_" + str(i) + ".results"
        with open(inFL,"r") as inp:
            inFLList = [line.replace('"','') for line in inp]

        header = inFLList[0].split("\t")
        resultList = [line.split("\t") for line in inFLList[1:]]

        # pivot rows into a column-oriented dict: header name -> list of values, one per assignment
        resultDict = OrderedDict()
        for r in range(len(resultList)):       # r indexes result rows (assignments)
            for c in range(len(header)):       # c indexes columns
                resultDict[header[c]] = resultDict.get(header[c], []) + [resultList[r][c]]
                
        if workerData:
            keysOfInterest = list(filter(lambda x: re.search("workerid",x),resultDict.keys()))
        else:
            keysOfInterest = list(filter(lambda x: re.search("Answer",x),resultDict.keys()))
        newDict = OrderedDict()
        for key in keysOfInterest:
            newDict[key] = resultDict[key]

        masterDict[inFLTitle] = newDict
    print(keysOfInterest)    # note: only reflects the last album processed
    return masterDict
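
For orientation, a hedged sketch of the structure this returns: an OrderedDict mapping each album title to an OrderedDict of {column name: list of per-assignment values}; the exact column names come from the .results headers.

res = createResultDict(1, 2, workerData=True)
list(res.keys())                      # ['photo_album_1', 'photo_album_2']
res['photo_album_1']['workerid'][:3]  # worker IDs of the first three assignments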

Creating a reliability matrix to understand how well share decisions agree for images that appear in different albums

In other words: to what degree does a worker's share decision for image x in album a agree with the decision for the same image in album b? (A secondary question.)
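
Concretely, each row of the reliability matrix built below is a (worker, gid_album, 0/1 share) triple, so the same image appearing in two albums contributes two items per worker. A sketch with hypothetical IDs:

reliability_matrix = [
    ('WORKER_A', '1234_photo_album_3', 1),  # shared image 1234 when it appeared in album 3
    ('WORKER_A', '1234_photo_album_7', 0),  # ... but not when it appeared in album 7
    ('WORKER_B', '1234_photo_album_3', 1),
    ('WORKER_B', '1234_photo_album_7', 1),
]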


In [26]:
results = JRS.createResultDict(1,100)
imgAlbumDict = JRS.genImgAlbumDictFromMap("../data/imageGID_job_map_expt2_corrected.csv")

res = JRS.createResultDict(1,100,workerData=True)
masterList = []

for album in res.keys(): # loop 1
    responses = res[album]
    workers = responses['workerid']
    for response in responses.keys(): # loop 2
        if 'Answer' in response and response.split(".")[1].isdigit():
            shrNoShr = []
            gid = response.split(".")[1]
            for shrNShr in responses[response]: # loop 3.1
                if len(shrNShr.split("|")) != 2: # no response captured
                    shrNoShr.append("*")
                elif shrNShr.split("|")[1] == 'share':
                    shrNoShr.append(1)
                else:
                    shrNoShr.append(0)
            
            for i in range(len(workers)): # loop 3.2
                masterList.append((workers[i],gid,gid + "_" + album,shrNoShr[i]))
                
img_mul_albms = [gid for gid in imgAlbumDict.keys() if len(imgAlbumDict[gid]) > 1]
alphas = []
alphas_new = []
pis = []
kappas = []
for img in img_mul_albms:
    some_dict = {}
    for tup in masterList:
        if tup[1] == img:
            if tup[0] in some_dict.keys():
                some_dict[tup[0]].update({tup[2]:tup[3]})
            else:
                some_dict[tup[0]] = {tup[2]:tup[3]}

    some_dict_final = {worker : some_dict[worker] for worker in some_dict.keys() if len(some_dict[worker].values()) > 1}
    
    reliability_matrix = []

    for worker in some_dict_final.keys():
        dct = some_dict_final[worker]
        for gid_albm in dct.keys():
            reliability_matrix.append((worker, gid_albm, dct[gid_albm]))

    if len(some_dict_final) > 1:
        t = AnnotationTask(data=reliability_matrix)
        # KAS (a Krippendorff's-alpha helper that tolerates '*' missing responses) is not imported in this excerpt
        alphas.append(KAS.krippendorff_alpha(list(some_dict_final.values()), KAS.nominal_metric, missing_items='*'))
        try:
            alphas_new.append(float("{0:.3f}".format(t.alpha())))
            pis.append(float("{0:.3f}".format(t.pi())))
            kappas.append(float("{0:.3f}".format(t.kappa())))
        except ZeroDivisionError as e:
            # every label for this image is identical, so chance-expected agreement is 1 and the
            # coefficients divide by zero; count it as perfect agreement
            print("Caught")
            pis.append(1)
            kappas.append(1)
            alphas_new.append(1)


Caught ("Caught" printed 55 times in total)

In [27]:
sum(alphas)/len(alphas), sum(pis)/len(pis), sum(kappas)/len(kappas), sum(alphas_new)/len(alphas_new)


Out[27]:
(0.26037227066051688,
 0.24063694267515925,
 0.4411641791044776,
 0.3316305732484076)

In [30]:
np.mean(alphas), np.std(alphas), np.mean(pis), np.std(pis), np.mean(kappas), np.std(kappas)


Out[30]:
(0.26037227066051688,
 0.51648063586027104,
 0.24063694267515925,
 0.64518920498377497,
 0.44116417910447758,
 0.5255568928975124)

In [80]:
album_wise_dict = {}
for row in masterList:
    # key: gid_album (one image within one album); value: list of (worker, gid, 0/1 share) triples
    album_wise_dict[row[2]] = album_wise_dict.get(row[2], []) + [(row[0], row[1], row[3])]

In [93]:
alphas = []
pis = []
kappas = []
for album in album_wise_dict.keys():
    reliability_matrix = album_wise_dict[album]
    t = AnnotationTask(data=reliability_matrix)
    try:
        alphas.append(t.alpha())
        pis.append(t.pi()) 
        kappas.append(t.kappa())
    except ZeroDivisionError as e:
        print("Caught")


Caught ("Caught" printed 9 times in total)

In [94]:
np.mean(alphas), np.std(alphas), np.mean(pis), np.std(pis), np.mean(kappas), np.std(kappas)


Out[94]:
(0.21692423159260182,
 0.10650736948244149,
 0.14203322802064169,
 0.15090682592663776,
 0.17703867734101922,
 0.15223211774801929)

POSITION BIAS

Does the position at which an image appears within an album affect how often it is shared? The next cell tallies appearances and shares per position.


In [307]:
res = JRS.createResultDict(1,100,workerData=True)
masterList = []

for album in res.keys(): # loop 1
    responses = res[album]
    workers = responses['workerid']
    pos = 1
    for response in responses.keys(): # loop 2
        if 'Answer' in response and response.split(".")[1].isdigit():
            shrNoShr = []
            gid = response.split(".")[1]
            for shrNShr in responses[response]: # loop 3.1
                if len(shrNShr.split("|")) != 2: # no response captured
                    shrNoShr.append("*")
                elif shrNShr.split("|")[1] == 'share':
                    shrNoShr.append(1)
                else:
                    shrNoShr.append(0)
            
            for i in range(len(workers)): # loop 3.2
                masterList.append((pos, shrNoShr[i]))
            pos += 1
masterList = list(filter(lambda x : x[1] != '*',masterList))         
pos_shares = [0] * 20   # number of shares recorded at each position
positions = [0] * 20    # number of appearances at each position
for row in masterList:
    pos_shares[row[0]-1] += row[1]
    positions[row[0]-1] += 1

# percentage of all appearances that fall at each position
positions_norm = []
for pos in positions:
    positions_norm.append(pos*100/sum(positions))

# share rate at each position, scaled by that position's share of appearances
pos_shr_norm = []
for i in range(len(positions)):
    pos_shr_norm.append(pos_shares[i]*positions_norm[i]/positions[i])
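
The same per-position normalisation can be written with numpy; a sketch, assuming masterList holds the filtered (position, share) pairs built above:

pos_arr = np.array([p for p, s in masterList])
shr_arr = np.array([s for p, s in masterList], dtype=float)
positions = np.bincount(pos_arr - 1, minlength=20)                    # appearances per position
pos_shares = np.bincount(pos_arr - 1, weights=shr_arr, minlength=20)  # shares per position
positions_norm = positions * 100 / positions.sum()
pos_shr_norm = pos_shares * positions_norm / positions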

In [308]:
layout= go.Layout(
                showlegend=False,
                legend=dict(
                    x=0.5,
                    y=0,
                    font=dict(size=15)
                ),
                xaxis= dict(
                    title= 'Position of the image',
                    ticklen= 5,
                    zeroline= True,
                ),
                yaxis=dict(
                    ticklen= 5,
                    #range=range
                )
            )

trace1 = go.Bar(
                    x = list(range(1,21)),
                    name = "No. of times image appeared in x position",
                    y = positions_norm,
                    opacity = 0.5,
                    marker=dict(color='grey')
                    
            )

trace2 = go.Scatter(
                    name = "No. of times image shared in x position",
                    x = list(range(1,21)),
                    y = pos_shr_norm,
                    opacity = 1,
                    marker=dict(color='blue'),
                    mode ='lines'
                    
            )


data = [trace1, trace2]
fig = dict(data=data,layout=layout)
iplot(fig,filename="Expt2 Training data distributions")



In [2]:
import DeriveFinalResultSet as drs, htmltag as HT

def genHTMLTableFiles(shrCntsObj):
    shrPropDict = drs.getShrProp(shrCntsObj)
    totCntDict = drs.genTotCnts(shrCntsObj)
    df = pd.DataFrame(shrPropDict,index = ['Share Proportion']).transpose()
    df_tot = pd.DataFrame(totCntDict,index = ['Total']).transpose()
    return df,df.to_html(bold_rows = False), df_tot

In [268]:
ftr = 'AGE'

In [98]:
def get_plots_per_ftr(ftr, ftr_txt):
    d = drs.ovrallShrCntsByFtr(drs.gidAidMapFl,drs.aidFeatureMapFl,ftr,drs.imgJobMap,1,100)
    head3 = HT.h3("Data-Frame by " + ftr)
    df1,tb1, df_tot = genHTMLTableFiles(d)
    dct = df_tot.to_dict()['Total']


    totals_dct = {}
    shares_dct = {}

    for key in dct.keys():
        if 'total' in key:
            if 'UNIDENTIFIED' in key or 'unknown' in key:
                totals_dct['unknown'] = dct[key]
            else:
                totals_dct[key[0]] = dct[key]
        elif 'share' in key:
            if 'UNIDENTIFIED' in key:
                shares_dct['unknown'] = dct[key]
            else:
                shares_dct[key[0]] = dct[key]
    df_total = pd.DataFrame(totals_dct, index=['total']).transpose()
    df_share = pd.DataFrame(shares_dct, index=['shared']).transpose()  
    df_total.reset_index(inplace=True)
    df_share.reset_index(inplace=True)

    df = pd.merge(df_total, df_share)
    df['total_proportion'] = df.total * 100 / sum(list(df.total))        # % of all appearances in this category
    df['share_proportion'] = df.shared * df.total_proportion / df.total  # share rate scaled by that %
    
    df = df.sort_values(by=['total_proportion'], ascending=False)
    
    layout = go.Layout(
                    showlegend=False,
                    legend=dict(
                        x=0.5,
                        y=1,
                        font=dict(size=15)
                    ),
                    xaxis=dict(
                        title=ftr_txt,
                        ticklen=5,
                        zeroline=True,
                        tickangle=45
                    ),
                    yaxis=dict(
                        ticklen=5,
                        #range=range
                    ),
                    barmode='group'  # 'grouped' is not a valid plotly barmode
                )
    trace1 = go.Bar(
                        x = list(range(1,len(df["index"])+1)),
                        name = "No. of times image with 'X' appeared",
                        y = list(df.total_proportion),
                        opacity = 0.5,
                        marker=dict(color='grey')

                )

    trace2 = go.Scatter(
                        name = "No. of times image with 'X' shared",
                        x = list(range(1,len(df["index"])+1)),
                        y = list(df.share_proportion),
                        opacity = 1,
                        marker=dict(color='blue'),

                )
    data = [trace1, trace2]
    return dict(data=data, layout=layout)

In [294]:
fig = get_plots_per_ftr('VIEW_POINT', 'View point of the animal')
iplot(fig,filename="Expt2 Training data distributions")



In [295]:
fig = get_plots_per_ftr('AGE', 'Age of the animal')
iplot(fig,filename="Expt2 Training data distributions")



In [296]:
fig = get_plots_per_ftr('SPECIES', 'Species of the animal')
iplot(fig,filename="Expt2 Training data distributions")



In [99]:
fig = get_plots_per_ftr('CONTRIBUTOR', 'ID\'s of the contributor who took images')
iplot(fig,filename="Expt2 Training data distributions")



In [87]:
results = JRS.createResultDict(1,100)
imgAlbumDict = JRS.genImgAlbumDictFromMap("../data/imageGID_job_map_expt2_corrected.csv")

res = JRS.createResultDict(1,100,workerData=True)
masterList = []

for album in res.keys(): # loop 1
    responses = res[album]
    workers = responses['workerid']
    for response in responses.keys(): # loop 2
        if 'Answer' in response and response.split(".")[1].isdigit():
            shrNoShr = []
            gid = response.split(".")[1]
            for shrNShr in responses[response]: # loop 3.1
                if len(shrNShr.split("|")) != 2: # no response captured
                    shrNoShr.append("*")
                elif shrNShr.split("|")[1] == 'share':
                    shrNoShr.append(1)
                else:
                    shrNoShr.append(0)
            
            for i in range(len(workers)): # loop 3.2
                masterList.append((gid,shrNoShr[i]))
                
masterList = list(filter(lambda x : x[1] != '*',masterList))

In [88]:
tots = {}    # per image GID: number of times the image was shown
shares = {}  # per image GID: number of times it was marked 'share'
for tup in masterList:
    tots[tup[0]] = tots.get(tup[0], 0) + 1
    shares[tup[0]] = shares.get(tup[0], 0) + tup[1]

In [89]:
df_tots = pd.DataFrame(tots, index=['totals']).transpose().reset_index()
df_shares = pd.DataFrame(shares, index=['shares']).transpose().reset_index()

df = pd.merge(df_tots, df_shares)
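
The counting and merge above can also be collapsed into a single groupby; a sketch, using named aggregation (pandas >= 0.25) and keeping 'index' as the column name so the later merge with exifDf still works:

tmp = pd.DataFrame(masterList, columns=['index', 'share'])  # 'index' holds the image GID
df = tmp.groupby('index')['share'].agg(totals='count', shares='sum').reset_index()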

In [90]:
inpExifFl = "../data/GZC_exifs_beauty_full.json"

with open(inpExifFl,"r") as inpJsonFl:
    exifJsonObj = json.load(inpJsonFl)
exifDf = pd.DataFrame(exifJsonObj).transpose()

exifDf['date'] = exifDf['date'].apply(lambda x : datetime.strptime(x,'%Y-%m-%d %H:%M:%S'))
exifDf['day'] = exifDf.date.apply(lambda x : x.day)
exifDf['hour'] = exifDf.date.apply(lambda x : x.hour)
exifDf.drop(['size', 'date'], axis=1, inplace=True)
exifDf.reset_index(inplace=True)
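
Equivalently, the per-row lambdas above could be replaced with pandas' datetime accessor; a sketch, assuming every 'date' string matches the format used above:

exifDf['date'] = pd.to_datetime(exifDf['date'], format='%Y-%m-%d %H:%M:%S')
exifDf['day'] = exifDf['date'].dt.day
exifDf['hour'] = exifDf['date'].dt.hour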

In [91]:
df = pd.merge(df, exifDf)
# df = df[[ 'totals', 'shares', 'hour']]
df.head()


Out[91]:
index totals shares arousal contrast dominance height hsv_itten_std_h hsv_itten_std_s hsv_itten_std_v lat long orientation pleasure symmetry width day hour
0 10 20 4 -0.0677241 1.4738 0.500873 4000 105719 140171 133193 -1.36709 36.782 1 0.441554 11.4251 6000 1 15
1 1000 10 9 0.0267916 2.2988 0.446958 3072 100419 106270 146868 -1.34968 36.806 1 0.385603 10.8675 4608 1 17
2 1003 10 2 0.00380291 1.61078 0.534058 3072 116605 145911 182645 -1.34965 36.8062 1 0.463471 9.66617 4608 1 17
3 1005 10 8 -0.049412 1.75693 0.55702 3072 102427 140823 132426 -1.34965 36.8062 1 0.488551 12.6299 4608 1 17
4 101 10 2 0.0550115 2.42374 0.444083 4000 142995 193383 250416 -1.37325 36.8003 1 0.380382 6.21572 6000 1 16

In [92]:
tots_hour = df.groupby(['hour'])['totals'].sum()
shares_hour = df.groupby(['hour'])['shares'].sum()



In [27]:
sum(df.totals)


Out[27]:
19801

In [97]:
layout = go.Layout(
                    showlegend=False,
                    legend=dict(
                        x=0.5,
                        y=1,
                        font=dict(size=15)
                    ),
                    xaxis=dict(
                        title='Hour of the day when the picture was taken',
                        ticklen=5,
                        zeroline=True,
                    ),
                    yaxis=dict(
                        ticklen=5,
                        #range=range
                    ),
                    barmode='group'  # 'grouped' is not a valid plotly barmode
                )
trace1 = go.Bar(
                        x = list(df["index"]),
                        name = "No. of times image with 'X' appeared",
                        y = list(df.total_proportion),
                        opacity = 0.5,
                        marker=dict(color='grey')

                )

trace2 = go.Scatter(
                        name = "No. of times image with 'X' shared",
                        x = list(df["index"]),
                        y = list(df.share_proportion),
                        opacity = 1,
                        marker=dict(color='blue'),

                )
data = [trace1, trace2]
fig = dict(data=data, layout=layout)
iplot(fig,filename="Expt2 Training data distributions")



In [93]:
tots_hour = pd.DataFrame(tots_hour.to_dict(), index=['totals']).transpose().reset_index()
shares_hour = pd.DataFrame(shares_hour.to_dict(), index=['shares']).transpose().reset_index()

In [94]:
df = pd.merge(tots_hour, shares_hour)

In [95]:
df['total_proportion'] = df.totals * 100 / sum(df.totals)             # % of all appearances in this hour

df['share_proportion'] = df.shares * df.total_proportion / df.totals  # share rate scaled by that %
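
Sanity check against the Out[96] table below, using hour 15 (3166 appearances, 1559 shares, 19801 responses in total):

3166 * 100 / 19801, 1559 * (3166 * 100 / 19801) / 3166   # -> (15.989..., 7.873...)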

In [81]:
df.sort_values(by=['total_proportion'], ascending=False, inplace=True)

In [96]:
df


Out[96]:
    index  totals  shares  total_proportion  share_proportion
0       9     284     154          1.434271          0.777738
1      10     250     120          1.262562          0.606030
2      11     451     198          2.277663          0.999949
3      12    1446     821          7.302661          4.146255
4      13    2099    1046         10.600475          5.282561
5      14    2497    1298         12.610474          6.555224
6      15    3166    1559         15.989091          7.873340
7      16    2922    1478         14.756830          7.464269
8      17    3172    1658         16.019393          8.373314
9      18    1741     868          8.792485          4.383617
10     19    1214     568          6.131003          2.868542
11     20     350     159          1.767587          0.802990
12     21     189      84          0.954497          0.424221
13     22      20       3          0.101005          0.015151

In [ ]: