In [1]:
# stdlib
import importlib
import json
import re
from collections import Counter, OrderedDict
from datetime import datetime
from math import ceil

# third-party
import cufflinks as cf
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.plotly as py
from nltk.metrics.agreement import AnnotationTask
from plotly.offline import iplot, plot
from scipy.stats import chi2_contingency

# local
import JobsMapResultsFilesToContainerObjs as JRS

importlib.reload(JRS)
cf.go_offline()


Inter-annotator agreement scores for workers

Three metrics are used to measure inter-worker agreement. They quantify the degree to which workers give the same response to an image that appears in multiple albums.

alpha()
Krippendorff 1980

pi()
Scott 1955; here, multi-pi. Equivalent to K from Siegel and Castellan (1988).

s()
Bennett, Albert and Goldstein 1954

Observation: every worker has worked on at least two albums.

(maxAlbums,minAlbums,avgAlbums) = (80, 2, 8.220264317180616)


In [36]:
# Per-worker share statistics aggregated from masterList.
# NOTE(review): assumes row[0] is the worker id and row[2] is a numeric
# share indicator — confirm against the cell that last built masterList
# (execution counts here are out of order).
workerStats = {}
wrkrCnts = {}
for row in masterList:
    wrkrCnts.setdefault(row[0], []).append(row[2])

for wrkr, cnts in wrkrCnts.items():
    workerStats[wrkr] = {
        'shares': sum(cnts),
        'total_attempts': len(cnts),
        'proportion': sum(cnts) * 100 / len(cnts),
    }

In [38]:
# Worker-level stats as a DataFrame (one row per worker), ordered by
# ascending share proportion.
df = pd.DataFrame.from_dict(workerStats).transpose()
df = df.sort_values(by='proportion', ascending=True)

In [46]:
df['proportion'] = df['proportion'].apply(lambda x : round(int(x)/10)*10)

In [16]:
# Count workers overall versus workers with more than 10 attempts
# (an album holds up to 10 images, so >10 attempts implies >1 album).
print(len(workerStats))

workersMulAlbms = [w for w in workerStats if workerStats[w]['total_attempts'] > 10]
print(len(workersMulAlbms))

# Observed 228 of 228: every worker worked on more than one album.


228
228

In [33]:
# Approximate albums per worker (10 images per album), excluding the
# heavy outlier worker A2E9NUZZ4S2VJ9.
numAlbums = {
    worker: ceil(stats['total_attempts'] / 10)
    for worker, stats in workerStats.items()
    if worker != 'A2E9NUZZ4S2VJ9'
}
attempts = list(numAlbums.values())
maxAlbums = max(attempts)
minAlbums = min(attempts)
avgAlbums = sum(attempts) / len(attempts)

maxAlbums, minAlbums, avgAlbums


Out[33]:
(80, 2, 8.220264317180616)

In [32]:
# Print any worker who attempted 100 or more albums (outlier scan).
for worker, albumCount in numAlbums.items():
    if albumCount >= 100:
        print(worker)


A2E9NUZZ4S2VJ9

In [31]:
numAlbums['A2E9NUZZ4S2VJ9'],workerStats['A2E9NUZZ4S2VJ9']['total_attempts']


Out[31]:
(120, 1198)

In [1]:
def createResultDict(jobRangeStart, jobRangeEnd, workerData=False, resultsDir="../results"):
    """Parse Mechanical-Turk result files for a range of photo-album jobs.

    Reads ``<resultsDir>/photo_album_<i>.results`` (tab-separated; first
    row is the header) for every i in [jobRangeStart, jobRangeEnd] and
    pivots each file into a column-oriented dict.

    Args:
        jobRangeStart: first album/job number (inclusive).
        jobRangeEnd: last album/job number (inclusive).
        workerData: if True keep only the "workerid" columns, otherwise
            keep only the "Answer"* columns.
        resultsDir: directory holding the ``.results`` files (new,
            defaults to the previously hard-coded "../results").

    Returns:
        OrderedDict mapping "photo_album_<i>" -> OrderedDict of
        column name -> list of cell values (one per result row).
    """
    masterDict = OrderedDict()
    keysOfInterest = []
    for jobNum in range(jobRangeStart, jobRangeEnd + 1):
        inFLTitle = "photo_album_" + str(jobNum)
        inFL = resultsDir + "/photo_album_" + str(jobNum) + ".results"
        # Strip all double quotes; the fields themselves never contain data quotes.
        with open(inFL, "r") as inp:
            inFLList = [line.replace('"', '') for line in inp]

        header = inFLList[0].split("\t")
        resultList = [line.split("\t") for line in inFLList[1:]]

        # Pivot rows into columns: header name -> list of values.
        # (setdefault/append replaces the original quadratic list concatenation;
        # the original also reused loop variable `i` here, shadowing the job loop.)
        resultDict = OrderedDict()
        for row in resultList:
            for j in range(len(header)):
                resultDict.setdefault(header[j], []).append(row[j])

        # Keep only the columns the caller cares about.
        if workerData:
            keysOfInterest = list(filter(lambda x: re.search("workerid", x), resultDict.keys()))
        else:
            keysOfInterest = list(filter(lambda x: re.search("Answer", x), resultDict.keys()))
        newDict = OrderedDict()
        for key in keysOfInterest:
            newDict[key] = resultDict[key]

        masterDict[inFLTitle] = newDict
    print(keysOfInterest)
    return masterDict

Creating a reliability matrix to understand agreement of share rates for images that appear in different albums

In other words: to what degree does the share rate of image x in album a agree with its share rate in album b? (Of secondary importance.)


In [4]:
# Build (worker, image, image_album, response) tuples for the
# inter-annotator agreement computations below.
results = JRS.createResultDict(1,100)
imgAlbumDict = JRS.genImgAlbumDictFromMap("../data/imageGID_job_map_expt2_corrected.csv")

res = JRS.createResultDict(1,100,workerData=True)
masterList = []

for album in res.keys(): # loop 1
    responses = res[album]
    # Worker ids are positionally aligned with each answer column's values.
    workers = responses['workerid']
    for response in responses.keys(): # loop 2
        # Image answer columns look like "Answer.<gid>"; skip anything else.
        if 'Answer' in response and response.split(".")[1].isdigit():
            shrNoShr = []
            gid = response.split(".")[1]
            # Encode each raw answer: 1 = share, 0 = no share, "*" = missing.
            for shrNShr in responses[response]: # loop 3.1
                if len(shrNShr.split("|")) != 2: # no response captured
                    shrNoShr.append("*")
                elif shrNShr.split("|")[1] == 'share':
                    shrNoShr.append(1)
                else:
                    shrNoShr.append(0)

            # One tuple per worker: (workerid, gid, "<gid>_<album>", 1/0/"*").
            for i in range(len(workers)): # loop 3.2
                masterList.append((workers[i],gid,gid + "_" + album,shrNoShr[i]))
                
# img_mul_albms = [gid for gid in imgAlbumDict.keys() if len(imgAlbumDict[gid]) > 1]
# alphas = []
# alphas_new = []
# pis = []
# kappas = []
# for img in img_mul_albms:
#     some_dict = {}
#     for tup in masterList:
#         if tup[1] == img:
#             if tup[0] in some_dict.keys():
#                 some_dict[tup[0]].update({tup[2]:tup[3]})
#             else:
#                 some_dict[tup[0]] = {tup[2]:tup[3]}

#     some_dict_final = {worker : some_dict[worker] for worker in some_dict.keys() if len(some_dict[worker].values()) > 1}
    
#     reliability_matrix = []

#     for worker in some_dict_final.keys():
#         dct = some_dict_final[worker]
#         for gid_albm in dct.keys():
#             reliability_matrix.append((worker, gid_albm, dct[gid_albm]))

#     t = AnnotationTask(data=reliability_matrix)
#     if len(some_dict_final) > 1:
#         alphas.append(KAS.krippendorff_alpha(list(some_dict_final.values()), KAS.nominal_metric, missing_items='*'))
#         try:
#             alphas_new.append(float("{0:.3f}".format(t.alpha())))
#             pis.append(float("{0:.3f}".format(t.pi())))
#             kappas.append(float("{0:.3f}".format(t.kappa())))
#         except ZeroDivisionError as e:
#             print("Caught")
#             pis.append(1)
#             kappas.append(1)
#             alphas_new.append(1)

In [27]:
sum(alphas)/len(alphas), sum(pis)/len(pis), sum(kappas)/len(kappas), sum(alphas_new)/len(alphas_new)


Out[27]:
(0.26037227066051688,
 0.24063694267515925,
 0.4411641791044776,
 0.3316305732484076)

In [30]:
np.mean(alphas), np.std(alphas), np.mean(pis), np.std(pis), np.mean(kappas), np.std(kappas)


Out[30]:
(0.26037227066051688,
 0.51648063586027104,
 0.24063694267515925,
 0.64518920498377497,
 0.44116417910447758,
 0.5255568928975124)

In [80]:
# Group responses by "<gid>_<album>" so each album slice can be scored
# as its own annotation task.
album_wise_dict = {}
for worker, gid, gid_album, shr in masterList:
    album_wise_dict.setdefault(gid_album, []).append((worker, gid, shr))

In [93]:
# Per-album agreement: score each album's reliability matrix with
# Krippendorff's alpha, multi-pi and kappa.
alphas = []
pis = []
kappas = []
for album in album_wise_dict.keys():
    reliability_matrix = album_wise_dict[album]
    t = AnnotationTask(data=reliability_matrix)
    try:
        # Compute all three metrics before appending so a ZeroDivisionError
        # cannot leave the lists out of step (the original appended alpha
        # first, so a failure in pi() left alphas one element longer than
        # pis/kappas).
        alpha = t.alpha()
        pi = t.pi()
        kappa = t.kappa()
    except ZeroDivisionError as e:
        print("Caught")
    else:
        alphas.append(alpha)
        pis.append(pi)
        kappas.append(kappa)


Caught
Caught
Caught
Caught
Caught
Caught
Caught
Caught
Caught

In [94]:
np.mean(alphas), np.std(alphas), np.mean(pis), np.std(pis), np.mean(kappas), np.std(kappas)


Out[94]:
(0.21692423159260182,
 0.10650736948244149,
 0.14203322802064169,
 0.15090682592663776,
 0.17703867734101922,
 0.15223211774801929)

POSITION BIAS


In [97]:
# POSITION BIAS: does an image's position within the album affect its share rate?
res = JRS.createResultDict(1,100,workerData=True)
masterList = []

for album in res.keys(): # loop 1
    responses = res[album]
    workers = responses['workerid']
    pos = 1  # 1-based position of the image within the album
    for response in responses.keys(): # loop 2
        if 'Answer' in response and response.split(".")[1].isdigit():
            shrNoShr = []
            gid = response.split(".")[1]
            # Encode each raw answer: 1 = share, 0 = no share, "*" = missing.
            for shrNShr in responses[response]: # loop 3.1
                if len(shrNShr.split("|")) != 2: # no response captured
                    shrNoShr.append("*")
                elif shrNShr.split("|")[1] == 'share':
                    shrNoShr.append(1)
                else:
                    shrNoShr.append(0)

            # One (position, share) tuple per worker response.
            for i in range(len(workers)): # loop 3.2
                masterList.append((pos, shrNoShr[i]))
            pos += 1
# Drop answers with no captured response.
masterList = list(filter(lambda x : x[1] != '*',masterList))
pos_shares = [0]*20   # shares per position; assumes <= 20 images/album — TODO confirm
positions = [0] * 20  # total responses per position
for row in masterList:
    pos_shares[row[0]-1] += row[1]
    positions[row[0]-1] += 1

# Percentage of all responses occurring at each position.
positions_norm = []
for pos in positions:
    positions_norm.append(pos*100/sum(positions))


# Share rate (%) at each position.
pos_shr_norm = []
for i in range(len(positions)):
    pos_shr_norm.append(pos_shares[i]*100/positions[i])

In [119]:
# Share proportion vs. image position: bar = how often each position
# occurred, line = share rate at that position.
layout = go.Layout(
    showlegend=False,
    legend=dict(
        x=0.5,
        y=1,
        font=dict(size=20)
    ),
    xaxis=dict(
        title='Position of the image',
        ticklen=5,
        zeroline=True,
        titlefont=dict(size=20),
        tickfont=dict(size=20),
    ),
    yaxis=dict(
        ticklen=5,
        titlefont=dict(size=20),
        tickfont=dict(size=20),
        title="Share proportion (%)"
    ),
    # 'group' is the documented barmode value; 'grouped' is not a valid
    # enum member and is rejected by plotly's layout validation.
    barmode='group'
)

trace1 = go.Bar(
    x=list(range(1, 21)),
    name="No. of times image appeared in x position",
    y=positions_norm,
    opacity=0.5,
)

trace2 = go.Scatter(
    name="No. of times image shared in x position",
    x=list(range(1, 21)),
    y=pos_shr_norm,
    opacity=1,
    marker=dict(color='blue'),
    mode='lines'
)

data = [trace1, trace2]
fig = dict(data=data, layout=layout)
iplot(fig, filename="Expt2 Training data distributions")



In [5]:
import DeriveFinalResultSet as drs, htmltag as HT

def genHTMLTableFiles(shrCntsObj):
    """Return (share-proportion DataFrame, its HTML rendering, totals
    DataFrame) for a share-counts object from DeriveFinalResultSet."""
    shrPropDict = drs.getShrProp(shrCntsObj)
    totCntDict = drs.genTotCnts(shrCntsObj)
    df_prop = pd.DataFrame(shrPropDict, index=['Share Proportion']).transpose()
    df_tot = pd.DataFrame(totCntDict, index=['Total']).transpose()
    return df_prop, df_prop.to_html(bold_rows=False), df_tot

In [268]:
ftr = 'AGE'

In [69]:
def get_plots_per_ftr(ftr, ftr_txt):
    d = drs.ovrallShrCntsByFtr(drs.gidAidMapFl,drs.aidFeatureMapFl,ftr,drs.imgJobMap,1,100)
    head3 = HT.h3("Data-Frame by " + ftr)
    df1,tb1, df_tot = genHTMLTableFiles(d)
    dct = df_tot.to_dict()['Total']


    totals_dct = {}
    shares_dct = {}

    for key in dct.keys():
        if 'total' in key:
            if key[0] == 'UNIDENTIFIED' or key[0] == 'unknown':
                totals_dct['unknown'] = totals_dct.get('unknown',0) + dct[key]
            elif 'juvenile' in key[0]:
                totals_dct['juvenile'] = totals_dct.get('juvenile',0) + dct[key]
            else:
                totals_dct[key[0]] = dct[key]
        elif 'share' in key:
            if key[0] == 'UNIDENTIFIED' or key[0] == 'unknown':
                shares_dct['unknown'] = shares_dct.get('unknown',0) + dct[key]
            elif 'juvenile' in key[0]:
                shares_dct['juvenile'] = shares_dct.get('juvenile',0) + dct[key]
            else:
                shares_dct[key[0]] = dct[key]
    
    df_total = pd.DataFrame(totals_dct, index=['total']).transpose()
    df_share = pd.DataFrame(shares_dct, index=['shared']).transpose()  
    df_total.reset_index(inplace=True)
    df_share.reset_index(inplace=True)

    df = pd.merge(df_total, df_share)
    df['total_proportion'] = df.total * 100 / sum(list(df.total))
    df['share_proportion'] = df.shared * 100/ df.total
    
    df = df.sort_values(by=['total_proportion'], ascending=False)
        
    print(df)
    layout= go.Layout(
                    showlegend=False,
                    legend=dict(
                        x=0.5,
                        y=1,
                        font=dict(size=20)
                    ),
                    xaxis= dict(
                        title= ftr_txt,
                        ticklen= 5,
                        zeroline= True,
                        titlefont=dict(size=20),
                        tickfont=dict(size=15),
          # tickangle=45
                    ),
                    yaxis=dict(
                        ticklen= 5,
                        titlefont=dict(size=20),
                        tickfont=dict(size=20),
                        title="Share proportion (%)"
                        #range=range
                    ),
        barmode='grouped'
                )
    trace1 = go.Bar(
                        x = list(range(1,len(df['index'])+1)),
                        name = "No. of times image with 'X' appeared",
                        y = list(df.total_proportion),
                        opacity = 0.5,
                        # marker=dict(color='grey')

                )

    trace2 = go.Scatter(
                        name = "No. of times image with 'X' shared",
                        x = list(range(1,len(df['index'])+1)),
                        y = list(df.share_proportion),
                        opacity = 1,
                        marker=dict(color='blue'),

                )
    data = [trace1, trace2]
    return df, dict(data=data, layout=layout)

In [51]:
# Appearance/share distribution by viewpoint of the animal.
df,fig = get_plots_per_ftr('VIEW_POINT', 'View point of the animal')
iplot(fig,filename="Expt2 Training data distributions")


        index  total  shared  total_proportion  share_proportion
6        left   9916    5097         44.976641         51.401775
8     unknown   4831    2182         21.912278         45.166632
7       right   3193    1744         14.482696         54.619480
1    backleft   1545     753          7.007756         48.737864
4   frontleft   1248     738          5.660634         59.134615
2   backright    465     217          2.109130         46.666667
0        back    327     184          1.483195         56.269113
5  frontright    268     182          1.215585         67.910448
3       front    254     153          1.152084         60.236220

In [52]:
# Appearance/share distribution by age of the animal.
df,fig = get_plots_per_ftr('AGE', 'Age of the animal')
iplot(fig,filename="Expt2 Training data distributions")


      index  total  shared  total_proportion  share_proportion
0     adult   7765    4112         34.683759         52.955570
1    infant   7404    3826         33.071288         51.674770
3   unknown   4920    2220         21.976059         45.121951
2  juvenile   2299    1259         10.268894         54.762940

In [53]:
# Appearance/share distribution by species.
_,fig = get_plots_per_ftr('SPECIES', 'Species of the animal')
iplot(fig,filename="Expt2 Training data distributions")


           index  total  shared  total_proportion  share_proportion
2   zebra_plains  11315    5751         56.945143         50.826337
1        unknown   4831    2182         24.313035         45.166632
0  giraffe_masai   3724    2128         18.741822         57.142857

In [70]:
# Appearance/share distribution by photo contributor.
_,fig = get_plots_per_ftr('CONTRIBUTOR', 'ID\'s of the contributor who took images')
iplot(fig,filename="Expt2 Training data distributions")


                                                index  total  shared  \
41                                            unknown   4831    2182   
0   GIRM_MUGU_20,hyrule:joncrall:/media/raid/work/...   1383     724   
3                   NNP GZC Car '12WHITE', Person 'A'    994     547   
37  NNP_Master,pachy.cs.uic.edu:jonc:/home/shared_...    919     479   
5                   NNP GZC Car '13WHITE', Person 'B'    905     558   
14                  NNP GZC Car '1PURPLE', Person 'B'    672     367   
38  PZ_MUGU_18,hyrule:joncrall:/media/raid/work/PZ...    569     242   
20                   NNP GZC Car '1WHITE', Person 'C'    529     327   
6                   NNP GZC Car '14WHITE', Person 'A'    514     199   
18                   NNP GZC Car '1WHITE', Person 'A'    490     293   
26                  NNP GZC Car '3PURPLE', Person 'B'    480     255   
15                  NNP GZC Car '1PURPLE', Person 'D'    450     252   
25                   NNP GZC Car '2WHITE', Person 'A'    440     167   
1                   NNP GZC Car '10WHITE', Person 'A'    419     202   
32                     NNP GZC Car '6RED', Person 'B'    360     200   
28                     NNP GZC Car '3RED', Person 'B'    360     140   
39  PZ_MUGU_19,hyrule:joncrall:/media/raid/work/PZ...    350     188   
19                   NNP GZC Car '1WHITE', Person 'B'    349     173   
30                     NNP GZC Car '4RED', Person 'A'    337     193   
2                   NNP GZC Car '11WHITE', Person 'A'    310     181   
7                   NNP GZC Car '15WHITE', Person 'A'    300     195   
29                   NNP GZC Car '3WHITE', Person 'A'    299     139   
8                   NNP GZC Car '15WHITE', Person 'C'    270      77   
36                   NNP GZC Car '9WHITE', Person 'A'    257     147   
12                    NNP GZC Car '1BLUE', Person 'B'    248     189   
9                   NNP GZC Car '15WHITE', Person 'D'    208     145   
27                     NNP GZC Car '3RED', Person 'A'    200      84   
31                   NNP GZC Car '5WHITE', Person 'A'    200      93   
40  PZ_MUGU_20,hyrule:joncrall:/media/raid/work/PZ...    190     104   
24                     NNP GZC Car '2RED', Person 'D'    180     103   
23                     NNP GZC Car '2RED', Person 'C'    180      72   
17                     NNP GZC Car '1RED', Person 'B'    180     109   
16                     NNP GZC Car '1RED', Person 'A'    174      91   
10                  NNP GZC Car '17WHITE', Person 'A'    170      41   
34                   NNP GZC Car '7WHITE', Person 'B'    170      64   
13                  NNP GZC Car '1PURPLE', Person 'A'    168     111   
33                   NNP GZC Car '6WHITE', Person 'B'    150      73   
35                   NNP GZC Car '8WHITE', Person 'A'    146      80   
11                    NNP GZC Car '1BLUE', Person 'A'    130      65   
4                   NNP GZC Car '13WHITE', Person 'A'    130      64   
22                     NNP GZC Car '2RED', Person 'B'    100      46   
21                     NNP GZC Car '2RED', Person 'A'     90      53   

    total_proportion  share_proportion  
41         24.397758         45.166632  
0           6.984496         52.349964  
3           5.019948         55.030181  
37          4.641180         52.121872  
5           4.570476         61.657459  
14          3.393768         54.613095  
38          2.873592         42.530756  
20          2.671582         61.814745  
6           2.595828         38.715953  
18          2.474622         59.795918  
26          2.424120         53.125000  
15          2.272612         56.000000  
25          2.222110         37.954545  
1           2.116055         48.210024  
32          1.818090         55.555556  
28          1.818090         38.888889  
39          1.767587         53.714286  
19          1.762537         49.570201  
30          1.701934         57.270030  
2           1.565577         58.387097  
7           1.515075         65.000000  
29          1.510025         46.488294  
8           1.363567         28.518519  
36          1.297914         57.198444  
12          1.252462         76.209677  
9           1.050452         69.711538  
27          1.010050         42.000000  
31          1.010050         46.500000  
40          0.959547         54.736842  
24          0.909045         57.222222  
23          0.909045         40.000000  
17          0.909045         60.555556  
16          0.878743         52.298851  
10          0.858542         24.117647  
34          0.858542         37.647059  
13          0.848442         66.071429  
33          0.757537         48.666667  
35          0.737336         54.794521  
11          0.656532         50.000000  
4           0.656532         49.230769  
22          0.505025         46.000000  
21          0.454522         58.888889  

In [305]:


In [101]:
# Rebuild masterList as (gid, share) pairs for the EXIF/hour analysis.
results = JRS.createResultDict(1,100)
imgAlbumDict = JRS.genImgAlbumDictFromMap("../data/imageGID_job_map_expt2_corrected.csv")

res = JRS.createResultDict(1,100,workerData=True)

def _encode_share(raw):
    """Map a raw answer string to 1 (share), 0 (no share) or '*' (missing)."""
    parts = raw.split("|")
    if len(parts) != 2:  # no response captured
        return "*"
    return 1 if parts[1] == 'share' else 0

masterList = []
for album, responses in res.items():
    workers = responses['workerid']
    for response, answers in responses.items():
        if 'Answer' not in response:
            continue
        gid = response.split(".")[1]
        if not gid.isdigit():
            continue
        encoded = [_encode_share(raw) for raw in answers]
        # One (gid, share) pair per worker response.
        for i in range(len(workers)):
            masterList.append((gid, encoded[i]))

# Drop answers with no captured response.
masterList = [tup for tup in masterList if tup[1] != '*']

In [112]:
# Per-image totals and share counts.
tots = {}
shares = {}
for gid, shr in masterList:
    tots[gid] = tots.get(gid, 0) + 1
    shares[gid] = shares.get(gid, 0) + shr

In [113]:
# One DataFrame per counter, then join them on the image id ('index' column).
df_tots = pd.DataFrame(tots, index=['totals']).T.reset_index()
df_shares = pd.DataFrame(shares, index=['shares']).T.reset_index()

df = pd.merge(df_tots, df_shares)

In [114]:
# Load per-image EXIF/beauty features and derive day/hour of capture.
inpExifFl = "../data/GZC_exifs_beauty_full.json"

with open(inpExifFl,"r") as inpJsonFl:
    exifJsonObj = json.load(inpJsonFl)
exifDf = pd.DataFrame(exifJsonObj).transpose()

exifDf['date'] = exifDf['date'].apply(lambda x : datetime.strptime(x,'%Y-%m-%d %H:%M:%S'))
exifDf['day'] = exifDf.date.apply(lambda x : x.day)
exifDf['hour'] = exifDf.date.apply(lambda x : x.hour)
# Keyword form: the positional axis argument to drop() was deprecated in
# pandas 1.0 and removed in pandas 2.0.
exifDf = exifDf.drop(columns=['size', 'date'])
exifDf.reset_index(inplace=True)

In [115]:
# Join share counts with EXIF features on the image id ('index' column).
df = pd.merge(df, exifDf)
# df = df[[ 'totals', 'shares', 'hour']]
df.head()


Out[115]:
index totals shares arousal contrast dominance height hsv_itten_std_h hsv_itten_std_s hsv_itten_std_v lat long orientation pleasure symmetry width day hour
0 10 20 4 -0.0677241 1.4738 0.500873 4000 105719 140171 133193 -1.36709 36.782 1 0.441554 11.4251 6000 1 15
1 1000 10 9 0.0267916 2.2988 0.446958 3072 100419 106270 146868 -1.34968 36.806 1 0.385603 10.8675 4608 1 17
2 1003 10 2 0.00380291 1.61078 0.534058 3072 116605 145911 182645 -1.34965 36.8062 1 0.463471 9.66617 4608 1 17
3 1005 10 8 -0.049412 1.75693 0.55702 3072 102427 140823 132426 -1.34965 36.8062 1 0.488551 12.6299 4608 1 17
4 101 10 2 0.0550115 2.42374 0.444083 4000 142995 193383 250416 -1.37325 36.8003 1 0.380382 6.21572 6000 1 16

In [116]:
# Aggregate appearance and share counts by hour of capture.
by_hour = df.groupby(['hour'])
tots_hour = by_hour['totals'].sum()
shares_hour = by_hour['shares'].sum()
# >>> data.groupby(['col1', 'col2'])['col3'].mean()

In [117]:
# Reshape the per-hour Series into a single DataFrame with proportions.
tots_hour = pd.DataFrame(tots_hour.to_dict(), index=['totals']).transpose().reset_index()
shares_hour = pd.DataFrame(shares_hour.to_dict(), index=['shares']).transpose().reset_index()
df = pd.merge(tots_hour, shares_hour)
df['total_proportion'] = 100 * df.totals / df.totals.sum()

df['share_proportion'] = 100 * df.shares / df.totals

In [118]:
# Share proportion vs. hour of capture: bar = appearance share of each
# hour, line = share rate for images taken in that hour.
layout = go.Layout(
    showlegend=False,
    legend=dict(
        x=0.5,
        y=1,
        font=dict(size=20)
    ),
    xaxis=dict(
        title='Time of the day when the picture was clicked',
        ticklen=5,
        zeroline=True,
        titlefont=dict(size=20),
        tickfont=dict(size=15),
    ),
    yaxis=dict(
        ticklen=5,
        titlefont=dict(size=20),
        tickfont=dict(size=20),
        title="Share proportion (%)"
    ),
    # 'group' is the documented barmode value; 'grouped' is invalid.
    barmode='group'
)
trace1 = go.Bar(
    x=list(df["index"]),
    name="No. of times image with 'X' appeared",
    y=list(df.total_proportion),
    opacity=0.5,
)

trace2 = go.Scatter(
    name="No. of times image with 'X' shared",
    x=list(df["index"]),
    y=list(df.share_proportion),
    opacity=1,
    marker=dict(color='blue'),
)
data = [trace1, trace2]
fig = dict(data=data, layout=layout)
iplot(fig, filename="Expt2 Training data distributions")



In [88]:


In [74]:


In [170]:
df['not_shares'] = df['totals'] - df['shares']

In [173]:
contingency = [list(df['shares']) , list(df['not_shares'])]

In [143]:
def get_stat_significance_test(ftr, ftr_txt='View point of the animal'):
    """Chi-squared test of independence between feature value and sharing.

    Builds the 2xN (shared / not_shared) contingency table for feature
    ``ftr`` and returns scipy's (chi2, p, dof, expected) tuple.

    Args:
        ftr: feature name understood by get_plots_per_ftr.
        ftr_txt: axis label forwarded to the plotting helper (new,
            defaults to the previously hard-coded viewpoint text).
    """
    # Imported locally so this cell works on a fresh kernel: scipy is not
    # imported at the top of the notebook, so the original raised NameError.
    from scipy.stats import chi2_contingency

    df, fig = get_plots_per_ftr(ftr, ftr_txt)
    df.index = df['index']
    df['not_shared'] = df.total - df.shared
    # Keyword form: positional axis argument was removed in pandas 2.0.
    df = df.drop(columns=['total_proportion', 'share_proportion', 'index', 'total'])
    df = df.transpose()
    print(df)
    # Strip the record index, keeping only the per-category counts.
    obs = list(df.to_records())
    obs = [list(row)[1:] for row in obs]
    return chi2_contingency(obs)

In [149]:
chi2, p, dof, expected


Out[149]:
(563.4898346926658,
 5.1293480782924233e-93,
 41,
 array([[ 2443.19145498,   699.42740266,   502.69764153,   464.767739  ,
           457.68749053,   339.85192667,   287.7615272 ,   267.53224585,
           259.94626534,   247.80869653,   242.75137619,   227.57941518,
           222.52209484,   211.90172214,   182.06353214,   182.06353214,
           177.00621181,   176.50047977,   170.43169537,   156.77693046,
           151.71961012,   151.21387809,   136.54764911,   129.97313267,
           125.42154437,   105.19226302,   101.14640675,   101.14640675,
            96.08908641,    91.03176607,    91.03176607,    91.03176607,
            87.99737387,    85.97444574,    85.97444574,    84.96298167,
            75.85980506,    73.83687693,    65.74516439,    65.74516439,
            50.57320337,    45.51588304],
        [ 2387.80854502,   683.57259734,   491.30235847,   454.232261  ,
           447.31250947,   332.14807333,   281.2384728 ,   261.46775415,
           254.05373466,   242.19130347,   237.24862381,   222.42058482,
           217.47790516,   207.09827786,   177.93646786,   177.93646786,
           172.99378819,   172.49952023,   166.56830463,   153.22306954,
           148.28038988,   147.78612191,   133.45235089,   127.02686733,
           122.57845563,   102.80773698,    98.85359325,    98.85359325,
            93.91091359,    88.96823393,    88.96823393,    88.96823393,
            86.00262613,    84.02555426,    84.02555426,    83.03701833,
            74.14019494,    72.16312307,    64.25483561,    64.25483561,
            49.42679663,    44.48411696]]))

In [154]:
positions


Out[154]:
[990,
 989,
 996,
 990,
 992,
 989,
 994,
 989,
 986,
 993,
 995,
 987,
 989,
 991,
 988,
 989,
 994,
 993,
 993,
 974]

In [156]:
pos_not_shares = [positions[i]-pos_shares[i] for i in range(len(positions))]

In [157]:
pos_not_shares


Out[157]:
[460,
 461,
 475,
 454,
 498,
 483,
 470,
 503,
 492,
 473,
 492,
 490,
 524,
 498,
 503,
 502,
 521,
 480,
 509,
 499]

In [158]:
contigency = [pos_shares, pos_not_shares]

In [174]:
chi2_contingency(contigency)


Out[174]:
(30.718992689054222,
 0.043331726922545527,
 19,
 array([[ 500.6747134 ,  500.16898136,  503.7091056 ,  500.6747134 ,
          501.68617747,  500.16898136,  502.69764153,  500.16898136,
          498.65178526,  502.1919095 ,  503.20337357,  499.1575173 ,
          500.16898136,  501.18044543,  499.66324933,  500.16898136,
          502.69764153,  502.1919095 ,  502.1919095 ,  492.58300086],
        [ 489.3252866 ,  488.83101864,  492.2908944 ,  489.3252866 ,
          490.31382253,  488.83101864,  491.30235847,  488.83101864,
          487.34821474,  490.8080905 ,  491.79662643,  487.8424827 ,
          488.83101864,  489.81955457,  488.33675067,  488.83101864,
          491.30235847,  490.8080905 ,  490.8080905 ,  481.41699914]]))

In [176]:
len(contigency[0])


Out[176]:
20

In [195]:
# Flatten the per-image NID lists into one list of individual IDs.
d = drs.getCountingLogic("../data/Flickr_IBEIS_Ftrs_gid_aid_features.json", "../data/Flickr_IBEIS_Ftrs_aid_features.json", "NID",withNumInds=False,mode="GZC")

l = [nid for nid_list in d.values() for nid in nid_list]

In [189]:
# Load the raw annotation-feature map for a sanity check on counts.
aidFtrsPath = "../data/Flickr_IBEIS_Ftrs_aid_features.json"
with open(aidFtrsPath) as fl:
    jsonObj = json.load(fl)

In [190]:
len(jsonObj)


Out[190]:
2047

In [198]:
len(set(l))


Out[198]:
1080

In [19]:
df['total_proportion'] = df['total'] *100 / sum(df['total'])

In [21]:
df['share_prortion'] = df['shared'] * 100 / df['total']

In [29]:
a = ('UNIDENTIFIED', 'not_share')

In [37]:
dct


Out[37]:
{('UNIDENTIFIED', 'not_share'): 2649,
 ('UNIDENTIFIED', 'share'): 2182,
 ('UNIDENTIFIED', 'total'): 4831,
 ('adult', 'not_share'): 3653,
 ('adult', 'share'): 4112,
 ('adult', 'total'): 7765,
 ('infant', 'not_share'): 3578,
 ('infant', 'share'): 3826,
 ('infant', 'total'): 7404,
 ('juveniles - one year old', 'not_share'): 551,
 ('juveniles - one year old', 'share'): 604,
 ('juveniles - one year old', 'total'): 1155,
 ('juveniles- two year old', 'not_share'): 489,
 ('juveniles- two year old', 'share'): 655,
 ('juveniles- two year old', 'total'): 1144,
 ('unknown', 'not_share'): 51,
 ('unknown', 'share'): 38,
 ('unknown', 'total'): 89}

In [ ]: