In [1]:
import JobsMapResultsFilesToContainerObjs as JRS
from nltk.metrics.agreement import AnnotationTask
from collections import OrderedDict, Counter
import re, json
import importlib
importlib.reload(JRS)
import pandas as pd
import numpy as np  # used for the mean/std summaries below
import plotly.plotly as py
import cufflinks as cf
cf.go_offline()
from math import ceil
from plotly.offline import plot, iplot
import plotly.graph_objs as go
from datetime import datetime


Inter-annotator agreement scores for workers

Three coefficients are used to measure inter-worker agreement. They quantify the degree to which workers make the same share/no-share decision for an image when that image appears in different albums. The three coefficients (as named in nltk.metrics.agreement) are listed below; a toy example follows the list.

alpha()
Krippendorff 1980

pi()
Scott 1955; here, multi-pi. Equivalent to K from Siegel and Castellan (1988).

kappa()
Cohen 1960; here, averaged over coder pairs (this is the third coefficient actually computed in the cells below).
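
As a quick reference, here is a minimal toy example of the nltk AnnotationTask API used below. The data is a list of (coder, item, label) triples, and the coefficients the cells below call are alpha(), pi() and kappa():

toy = AnnotationTask(data=[
    ('c1', 'i1', 1), ('c1', 'i2', 0), ('c1', 'i3', 1),
    ('c2', 'i1', 1), ('c2', 'i2', 1), ('c2', 'i3', 1),
])
toy.alpha(), toy.pi(), toy.kappa()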

Observation: every worker has worked on at least two albums.

(maxAlbums, minAlbums, avgAlbums) = (80, 2, 8.220264317180616)


In [36]:
workerStats = {}
wrkrCnts = {}
for row in masterList:
    # collect each worker's 0/1 share responses (row[0] = workerid, row[2] = share flag)
    wrkrCnts[row[0]] = wrkrCnts.get(row[0], []) + [row[2]]

for wrkr in wrkrCnts.keys():
    workerStats[wrkr] = {
        'shares': sum(wrkrCnts[wrkr]),
        'total_attempts': len(wrkrCnts[wrkr]),
        'proportion': sum(wrkrCnts[wrkr]) * 100 / len(wrkrCnts[wrkr]),
    }

In [38]:
df = pd.DataFrame.from_dict(workerStats).transpose()

df.sort_values(by='proportion',inplace=True,ascending=True)

In [46]:
df['proportion'] = df['proportion'].apply(lambda x : round(int(x)/10)*10)  # bin proportions to the nearest 10 (e.g. 87 -> 90)

In [16]:
print(len(workerStats.keys()))

workersMulAlbms = [worker for worker in workerStats.keys() if workerStats[worker]['total_attempts'] > 10]
print(len(workersMulAlbms))

## Every worker worked on more than 1 album


228
228

In [33]:
# exclude the single outlier worker (inspected in the next two cells)
numAlbums = {worker : ceil(workerStats[worker]['total_attempts']/10) for worker in workerStats.keys() if worker != 'A2E9NUZZ4S2VJ9'}
attempts = list(numAlbums.values())
maxAlbums = max(attempts)
minAlbums = min(attempts)
avgAlbums = sum(attempts)/len(attempts)

maxAlbums,minAlbums,avgAlbums


Out[33]:
(80, 2, 8.220264317180616)

In [32]:
for worker in numAlbums.keys():
    if numAlbums[worker] >= 100:
        print(worker)


A2E9NUZZ4S2VJ9

In [31]:
numAlbums['A2E9NUZZ4S2VJ9'],workerStats['A2E9NUZZ4S2VJ9']['total_attempts']


Out[31]:
(120, 1198)

In [20]:
def createResultDict(jobRangeStart,jobRangeEnd,workerData=False):
    masterDict = OrderedDict()
    keysOfInterest = []
    for i in range(jobRangeStart,jobRangeEnd+1):
        inFLTitle = "photo_album_" + str(i)
        inFL = "../results/photo_album_" + str(i) + ".results"
        with open(inFL,"r") as inp:
            inFLList = [line.replace('"','') for line in inp]

        header = inFLList[0].split("\t")
        resultList = [line.split("\t") for line in inFLList[1:]]

        # pivot rows into a column-oriented dict: header name -> list of values, one per assignment
        resultDict = OrderedDict()
        for r in range(len(resultList)):       # r indexes result rows (assignments)
            for c in range(len(header)):       # c indexes columns
                resultDict[header[c]] = resultDict.get(header[c], []) + [resultList[r][c]]
                
        if workerData:
            keysOfInterest = list(filter(lambda x: re.search("workerid",x),resultDict.keys()))
        else:
            keysOfInterest = list(filter(lambda x: re.search("Answer",x),resultDict.keys()))
        newDict = OrderedDict()
        for key in keysOfInterest:
            newDict[key] = resultDict[key]

        masterDict[inFLTitle] = newDict
    print(keysOfInterest)    # note: only reflects the last album processed
    return masterDict
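
For orientation, a hedged sketch of the structure this returns: an OrderedDict mapping each album title to an OrderedDict of {column name: list of per-assignment values}; the exact column names come from the .results headers.

res = createResultDict(1, 2, workerData=True)
list(res.keys())                      # ['photo_album_1', 'photo_album_2']
res['photo_album_1']['workerid'][:3]  # worker IDs of the first three assignments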

Creating a reliability matrix to understand how well share decisions agree for images that appear in different albums

In other words: to what degree does a worker's share decision for image x in album a agree with the decision for the same image in album b? (A secondary question.)
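
Concretely, each row of the reliability matrix built below is a (worker, gid_album, 0/1 share) triple, so the same image appearing in two albums contributes two items per worker. A sketch with hypothetical IDs:

reliability_matrix = [
    ('WORKER_A', '1234_photo_album_3', 1),  # shared image 1234 when it appeared in album 3
    ('WORKER_A', '1234_photo_album_7', 0),  # ... but not when it appeared in album 7
    ('WORKER_B', '1234_photo_album_3', 1),
    ('WORKER_B', '1234_photo_album_7', 1),
]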


In [26]:
results = JRS.createResultDict(1,100)
imgAlbumDict = JRS.genImgAlbumDictFromMap("../data/imageGID_job_map_expt2_corrected.csv")

res = JRS.createResultDict(1,100,workerData=True)
masterList = []

for album in res.keys(): # loop 1
    responses = res[album]
    workers = responses['workerid']
    for response in responses.keys(): # loop 2
        if 'Answer' in response and response.split(".")[1].isdigit():
            shrNoShr = []
            gid = response.split(".")[1]
            for shrNShr in responses[response]: # loop 3.1
                if len(shrNShr.split("|")) != 2: # no response captured
                    shrNoShr.append("*")
                elif shrNShr.split("|")[1] == 'share':
                    shrNoShr.append(1)
                else:
                    shrNoShr.append(0)
            
            for i in range(len(workers)): # loop 3.2
                masterList.append((workers[i],gid,gid + "_" + album,shrNoShr[i]))
                
img_mul_albms = [gid for gid in imgAlbumDict.keys() if len(imgAlbumDict[gid]) > 1]
alphas = []
alphas_new = []
pis = []
kappas = []
for img in img_mul_albms:
    some_dict = {}
    for tup in masterList:
        if tup[1] == img:
            if tup[0] in some_dict.keys():
                some_dict[tup[0]].update({tup[2]:tup[3]})
            else:
                some_dict[tup[0]] = {tup[2]:tup[3]}

    some_dict_final = {worker : some_dict[worker] for worker in some_dict.keys() if len(some_dict[worker].values()) > 1}
    
    reliability_matrix = []

    for worker in some_dict_final.keys():
        dct = some_dict_final[worker]
        for gid_albm in dct.keys():
            reliability_matrix.append((worker, gid_albm, dct[gid_albm]))

    if len(some_dict_final) > 1:
        t = AnnotationTask(data=reliability_matrix)
        # KAS (a Krippendorff's-alpha helper that tolerates '*' missing responses) is not imported in this excerpt
        alphas.append(KAS.krippendorff_alpha(list(some_dict_final.values()), KAS.nominal_metric, missing_items='*'))
        try:
            alphas_new.append(float("{0:.3f}".format(t.alpha())))
            pis.append(float("{0:.3f}".format(t.pi())))
            kappas.append(float("{0:.3f}".format(t.kappa())))
        except ZeroDivisionError as e:
            # every label for this image is identical, so chance-expected agreement is 1 and the
            # coefficients divide by zero; count it as perfect agreement
            print("Caught")
            pis.append(1)
            kappas.append(1)
            alphas_new.append(1)


Caught ("Caught" printed 55 times in total)

In [27]:
sum(alphas)/len(alphas), sum(pis)/len(pis), sum(kappas)/len(kappas), sum(alphas_new)/len(alphas_new)


Out[27]:
(0.26037227066051688,
 0.24063694267515925,
 0.4411641791044776,
 0.3316305732484076)

In [30]:
np.mean(alphas), np.std(alphas), np.mean(pis), np.std(pis), np.mean(kappas), np.std(kappas)


Out[30]:
(0.26037227066051688,
 0.51648063586027104,
 0.24063694267515925,
 0.64518920498377497,
 0.44116417910447758,
 0.5255568928975124)

In [80]:
album_wise_dict = {}
for row in masterList:
    # key: gid_album (one image within one album); value: list of (worker, gid, 0/1 share) triples
    album_wise_dict[row[2]] = album_wise_dict.get(row[2], []) + [(row[0], row[1], row[3])]

In [93]:
alphas = []
pis = []
kappas = []
for album in album_wise_dict.keys():
    reliability_matrix = album_wise_dict[album]
    t = AnnotationTask(data=reliability_matrix)
    try:
        alphas.append(t.alpha())
        pis.append(t.pi()) 
        kappas.append(t.kappa())
    except ZeroDivisionError as e:
        print("Caught")


Caught ("Caught" printed 9 times in total)

In [94]:
np.mean(alphas), np.std(alphas), np.mean(pis), np.std(pis), np.mean(kappas), np.std(kappas)


Out[94]:
(0.21692423159260182,
 0.10650736948244149,
 0.14203322802064169,
 0.15090682592663776,
 0.17703867734101922,
 0.15223211774801929)

POSITION BIAS

Does the position at which an image appears within an album affect how often it is shared? The next cell tallies appearances and shares per position.


In [307]:
res = JRS.createResultDict(1,100,workerData=True)
masterList = []

for album in res.keys(): # loop 1
    responses = res[album]
    workers = responses['workerid']
    pos = 1
    for response in responses.keys(): # loop 2
        if 'Answer' in response and response.split(".")[1].isdigit():
            shrNoShr = []
            gid = response.split(".")[1]
            for shrNShr in responses[response]: # loop 3.1
                if len(shrNShr.split("|")) != 2: # no response captured
                    shrNoShr.append("*")
                elif shrNShr.split("|")[1] == 'share':
                    shrNoShr.append(1)
                else:
                    shrNoShr.append(0)
            
            for i in range(len(workers)): # loop 3.2
                masterList.append((pos, shrNoShr[i]))
            pos += 1
masterList = list(filter(lambda x : x[1] != '*',masterList))         
pos_shares = [0] * 20   # number of shares recorded at each position
positions = [0] * 20    # number of appearances at each position
for row in masterList:
    pos_shares[row[0]-1] += row[1]
    positions[row[0]-1] += 1

# percentage of all appearances that fall at each position
positions_norm = []
for pos in positions:
    positions_norm.append(pos*100/sum(positions))

# share rate at each position, scaled by that position's share of appearances
pos_shr_norm = []
for i in range(len(positions)):
    pos_shr_norm.append(pos_shares[i]*positions_norm[i]/positions[i])
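
The same per-position normalisation can be written with numpy; a sketch, assuming masterList holds the filtered (position, share) pairs built above:

pos_arr = np.array([p for p, s in masterList])
shr_arr = np.array([s for p, s in masterList], dtype=float)
positions = np.bincount(pos_arr - 1, minlength=20)                    # appearances per position
pos_shares = np.bincount(pos_arr - 1, weights=shr_arr, minlength=20)  # shares per position
positions_norm = positions * 100 / positions.sum()
pos_shr_norm = pos_shares * positions_norm / positions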

In [308]:
layout= go.Layout(
                showlegend=False,
                legend=dict(
                    x=0.5,
                    y=0,
                    font=dict(size=15)
                ),
                xaxis= dict(
                    title= 'Position of the image',
                    ticklen= 5,
                    zeroline= True,
                ),
                yaxis=dict(
                    ticklen= 5,
                    #range=range
                )
            )

trace1 = go.Bar(
                    x = list(range(1,21)),
                    name = "No. of times image appeared in x position",
                    y = positions_norm,
                    opacity = 0.5,
                    marker=dict(color='grey')
                    
            )

trace2 = go.Scatter(
                    name = "No. of times image shared in x position",
                    x = list(range(1,21)),
                    y = pos_shr_norm,
                    opacity = 1,
                    marker=dict(color='blue'),
                    mode ='lines'
                    
            )


data = [trace1, trace2]
fig = dict(data=data,layout=layout)
iplot(fig,filename="Expt2 Training data distributions")



In [2]:
import DeriveFinalResultSet as drs, htmltag as HT

def genHTMLTableFiles(shrCntsObj):
    shrPropDict = drs.getShrProp(shrCntsObj)
    totCntDict = drs.genTotCnts(shrCntsObj)
    df = pd.DataFrame(shrPropDict,index = ['Share Proportion']).transpose()
    df_tot = pd.DataFrame(totCntDict,index = ['Total']).transpose()
    return df,df.to_html(bold_rows = False), df_tot

In [268]:
ftr = 'AGE'

In [98]:
def get_plots_per_ftr(ftr, ftr_txt):
    d = drs.ovrallShrCntsByFtr(drs.gidAidMapFl,drs.aidFeatureMapFl,ftr,drs.imgJobMap,1,100)
    head3 = HT.h3("Data-Frame by " + ftr)
    df1,tb1, df_tot = genHTMLTableFiles(d)
    dct = df_tot.to_dict()['Total']


    totals_dct = {}
    shares_dct = {}

    for key in dct.keys():
        if 'total' in key:
            if 'UNIDENTIFIED' in key or 'unknown' in key:
                totals_dct['unknown'] = dct[key]
            else:
                totals_dct[key[0]] = dct[key]
        elif 'share' in key:
            if 'UNIDENTIFIED' in key:
                shares_dct['unknown'] = dct[key]
            else:
                shares_dct[key[0]] = dct[key]
    df_total = pd.DataFrame(totals_dct, index=['total']).transpose()
    df_share = pd.DataFrame(shares_dct, index=['shared']).transpose()  
    df_total.reset_index(inplace=True)
    df_share.reset_index(inplace=True)

    df = pd.merge(df_total, df_share)
    df['total_proportion'] = df.total * 100 / sum(list(df.total))        # % of all appearances in this category
    df['share_proportion'] = df.shared * df.total_proportion / df.total  # share rate scaled by that %
    
    df = df.sort_values(by=['total_proportion'], ascending=False)
    
    layout = go.Layout(
                    showlegend=False,
                    legend=dict(
                        x=0.5,
                        y=1,
                        font=dict(size=15)
                    ),
                    xaxis=dict(
                        title=ftr_txt,
                        ticklen=5,
                        zeroline=True,
                        tickangle=45
                    ),
                    yaxis=dict(
                        ticklen=5,
                        #range=range
                    ),
                    barmode='group'  # 'grouped' is not a valid plotly barmode
                )
    trace1 = go.Bar(
                        x = list(range(1,len(df["index"])+1)),
                        name = "No. of times image with 'X' appeared",
                        y = list(df.total_proportion),
                        opacity = 0.5,
                        marker=dict(color='grey')

                )

    trace2 = go.Scatter(
                        name = "No. of times image with 'X' shared",
                        x = list(range(1,len(df["index"])+1)),
                        y = list(df.share_proportion),
                        opacity = 1,
                        marker=dict(color='blue'),

                )
    data = [trace1, trace2]
    return dict(data=data, layout=layout)

In [294]:
fig = get_plots_per_ftr('VIEW_POINT', 'View point of the animal')
iplot(fig,filename="Expt2 Training data distributions")



In [295]:
fig = get_plots_per_ftr('AGE', 'Age of the animal')
iplot(fig,filename="Expt2 Training data distributions")



In [296]:
fig = get_plots_per_ftr('SPECIES', 'Species of the animal')
iplot(fig,filename="Expt2 Training data distributions")



In [99]:
fig = get_plots_per_ftr('CONTRIBUTOR', 'ID\'s of the contributor who took images')
iplot(fig,filename="Expt2 Training data distributions")



In [87]:
results = JRS.createResultDict(1,100)
imgAlbumDict = JRS.genImgAlbumDictFromMap("../data/imageGID_job_map_expt2_corrected.csv")

res = JRS.createResultDict(1,100,workerData=True)
masterList = []

for album in res.keys(): # loop 1
    responses = res[album]
    workers = responses['workerid']
    for response in responses.keys(): # loop 2
        if 'Answer' in response and response.split(".")[1].isdigit():
            shrNoShr = []
            gid = response.split(".")[1]
            for shrNShr in responses[response]: # loop 3.1
                if len(shrNShr.split("|")) != 2: # no response captured
                    shrNoShr.append("*")
                elif shrNShr.split("|")[1] == 'share':
                    shrNoShr.append(1)
                else:
                    shrNoShr.append(0)
            
            for i in range(len(workers)): # loop 3.2
                masterList.append((gid,shrNoShr[i]))
                
masterList = list(filter(lambda x : x[1] != '*',masterList))

In [88]:
tots = {}    # per image GID: number of times the image was shown
shares = {}  # per image GID: number of times it was marked 'share'
for tup in masterList:
    tots[tup[0]] = tots.get(tup[0], 0) + 1
    shares[tup[0]] = shares.get(tup[0], 0) + tup[1]

In [89]:
df_tots = pd.DataFrame(tots, index=['totals']).transpose().reset_index()
df_shares = pd.DataFrame(shares, index=['shares']).transpose().reset_index()

df = pd.merge(df_tots, df_shares)
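
The counting and merge above can also be collapsed into a single groupby; a sketch, using named aggregation (pandas >= 0.25) and keeping 'index' as the column name so the later merge with exifDf still works:

tmp = pd.DataFrame(masterList, columns=['index', 'share'])  # 'index' holds the image GID
df = tmp.groupby('index')['share'].agg(totals='count', shares='sum').reset_index()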

In [90]:
inpExifFl = "../data/GZC_exifs_beauty_full.json"

with open(inpExifFl,"r") as inpJsonFl:
    exifJsonObj = json.load(inpJsonFl)
exifDf = pd.DataFrame(exifJsonObj).transpose()

exifDf['date'] = exifDf['date'].apply(lambda x : datetime.strptime(x,'%Y-%m-%d %H:%M:%S'))
exifDf['day'] = exifDf.date.apply(lambda x : x.day)
exifDf['hour'] = exifDf.date.apply(lambda x : x.hour)
exifDf.drop(['size', 'date'], axis=1, inplace=True)
exifDf.reset_index(inplace=True)
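
Equivalently, the per-row lambdas above could be replaced with pandas' datetime accessor; a sketch, assuming every 'date' string matches the format used above:

exifDf['date'] = pd.to_datetime(exifDf['date'], format='%Y-%m-%d %H:%M:%S')
exifDf['day'] = exifDf['date'].dt.day
exifDf['hour'] = exifDf['date'].dt.hour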

In [91]:
df = pd.merge(df, exifDf)
# df = df[[ 'totals', 'shares', 'hour']]
df.head()


Out[91]:
index totals shares arousal contrast dominance height hsv_itten_std_h hsv_itten_std_s hsv_itten_std_v lat long orientation pleasure symmetry width day hour
0 10 20 4 -0.0677241 1.4738 0.500873 4000 105719 140171 133193 -1.36709 36.782 1 0.441554 11.4251 6000 1 15
1 1000 10 9 0.0267916 2.2988 0.446958 3072 100419 106270 146868 -1.34968 36.806 1 0.385603 10.8675 4608 1 17
2 1003 10 2 0.00380291 1.61078 0.534058 3072 116605 145911 182645 -1.34965 36.8062 1 0.463471 9.66617 4608 1 17
3 1005 10 8 -0.049412 1.75693 0.55702 3072 102427 140823 132426 -1.34965 36.8062 1 0.488551 12.6299 4608 1 17
4 101 10 2 0.0550115 2.42374 0.444083 4000 142995 193383 250416 -1.37325 36.8003 1 0.380382 6.21572 6000 1 16

In [92]:
tots_hour = df.groupby(['hour'])['totals'].sum()
shares_hour = df.groupby(['hour'])['shares'].sum()



In [27]:
sum(df.totals)


Out[27]:
19801

In [97]:
layout = go.Layout(
                    showlegend=False,
                    legend=dict(
                        x=0.5,
                        y=1,
                        font=dict(size=15)
                    ),
                    xaxis=dict(
                        title='Hour of the day when the picture was taken',
                        ticklen=5,
                        zeroline=True,
                    ),
                    yaxis=dict(
                        ticklen=5,
                        #range=range
                    ),
                    barmode='group'  # 'grouped' is not a valid plotly barmode
                )
trace1 = go.Bar(
                        x = list(df["index"]),
                        name = "No. of times image with 'X' appeared",
                        y = list(df.total_proportion),
                        opacity = 0.5,
                        marker=dict(color='grey')

                )

trace2 = go.Scatter(
                        name = "No. of times image with 'X' shared",
                        x = list(df["index"]),
                        y = list(df.share_proportion),
                        opacity = 1,
                        marker=dict(color='blue'),

                )
data = [trace1, trace2]
fig = dict(data=data, layout=layout)
iplot(fig,filename="Expt2 Training data distributions")



In [93]:
tots_hour = pd.DataFrame(tots_hour.to_dict(), index=['totals']).transpose().reset_index()
shares_hour = pd.DataFrame(shares_hour.to_dict(), index=['shares']).transpose().reset_index()

In [94]:
df = pd.merge(tots_hour, shares_hour)

In [95]:
df['total_proportion'] = df.totals * 100 / sum(df.totals)             # % of all appearances in this hour

df['share_proportion'] = df.shares * df.total_proportion / df.totals  # share rate scaled by that %
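
Sanity check against the Out[96] table below, using hour 15 (3166 appearances, 1559 shares, 19801 responses in total):

3166 * 100 / 19801, 1559 * (3166 * 100 / 19801) / 3166   # -> (15.989..., 7.873...)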

In [81]:
df.sort_values(by=['total_proportion'], ascending=False, inplace=True)

In [96]:
df


Out[96]:
    index  totals  shares  total_proportion  share_proportion
0       9     284     154          1.434271          0.777738
1      10     250     120          1.262562          0.606030
2      11     451     198          2.277663          0.999949
3      12    1446     821          7.302661          4.146255
4      13    2099    1046         10.600475          5.282561
5      14    2497    1298         12.610474          6.555224
6      15    3166    1559         15.989091          7.873340
7      16    2922    1478         14.756830          7.464269
8      17    3172    1658         16.019393          8.373314
9      18    1741     868          8.792485          4.383617
10     19    1214     568          6.131003          2.868542
11     20     350     159          1.767587          0.802990
12     21     189      84          0.954497          0.424221
13     22      20       3          0.101005          0.015151

In [ ]: