In [1]:
# stdlib
import importlib
import json
import re
from collections import Counter, OrderedDict
from datetime import datetime
from math import ceil

# third-party
import cufflinks as cf
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.plotly as py
from nltk.metrics.agreement import AnnotationTask
from plotly.offline import iplot, plot
from scipy.stats import chi2_contingency

# local
import JobsMapResultsFilesToContainerObjs as JRS

importlib.reload(JRS)
cf.go_offline()


Inter-annotator agreement scores for workers

Three metrics are used to measure inter-worker agreement. They quantify the degree to which workers give the same response to an image that appears in multiple albums.

alpha()
Krippendorff 1980

pi()
Scott 1955; here, multi-pi. Equivalent to K from Siegel and Castellan (1988).

s()
Bennett, Albert and Goldstein 1954

Observation: every worker has worked on at least two albums.

(maxAlbums,minAlbums,avgAlbums) = (80, 2, 8.220264317180616)


In [36]:
# Per-worker share statistics aggregated from masterList.
# NOTE(review): assumes row[0] is the worker id and row[2] is a numeric
# share indicator — confirm against the cell that last built masterList
# (execution counts here are out of order).
workerStats = {}
wrkrCnts = {}
for row in masterList:
    wrkrCnts.setdefault(row[0], []).append(row[2])

for wrkr, cnts in wrkrCnts.items():
    workerStats[wrkr] = {
        'shares': sum(cnts),
        'total_attempts': len(cnts),
        'proportion': sum(cnts) * 100 / len(cnts),
    }

In [38]:
# Worker-level stats as a DataFrame (one row per worker), ordered by
# ascending share proportion.
df = pd.DataFrame.from_dict(workerStats).transpose()
df = df.sort_values(by='proportion', ascending=True)

In [46]:
df['proportion'] = df['proportion'].apply(lambda x : round(int(x)/10)*10)

In [16]:
# Count workers overall versus workers with more than 10 attempts
# (an album holds up to 10 images, so >10 attempts implies >1 album).
print(len(workerStats))

workersMulAlbms = [w for w in workerStats if workerStats[w]['total_attempts'] > 10]
print(len(workersMulAlbms))

# Observed 228 of 228: every worker worked on more than one album.


228
228

In [33]:
# Approximate albums per worker (10 images per album), excluding the
# heavy outlier worker A2E9NUZZ4S2VJ9.
numAlbums = {
    worker: ceil(stats['total_attempts'] / 10)
    for worker, stats in workerStats.items()
    if worker != 'A2E9NUZZ4S2VJ9'
}
attempts = list(numAlbums.values())
maxAlbums = max(attempts)
minAlbums = min(attempts)
avgAlbums = sum(attempts) / len(attempts)

maxAlbums, minAlbums, avgAlbums


Out[33]:
(80, 2, 8.220264317180616)

In [32]:
# Print any worker who attempted 100 or more albums (outlier scan).
for worker, albumCount in numAlbums.items():
    if albumCount >= 100:
        print(worker)


A2E9NUZZ4S2VJ9

In [31]:
numAlbums['A2E9NUZZ4S2VJ9'],workerStats['A2E9NUZZ4S2VJ9']['total_attempts']


Out[31]:
(120, 1198)

In [1]:
def createResultDict(jobRangeStart, jobRangeEnd, workerData=False, resultsDir="../results"):
    """Parse Mechanical-Turk result files for a range of photo-album jobs.

    Reads ``<resultsDir>/photo_album_<i>.results`` (tab-separated; first
    row is the header) for every i in [jobRangeStart, jobRangeEnd] and
    pivots each file into a column-oriented dict.

    Args:
        jobRangeStart: first album/job number (inclusive).
        jobRangeEnd: last album/job number (inclusive).
        workerData: if True keep only the "workerid" columns, otherwise
            keep only the "Answer"* columns.
        resultsDir: directory holding the ``.results`` files (new,
            defaults to the previously hard-coded "../results").

    Returns:
        OrderedDict mapping "photo_album_<i>" -> OrderedDict of
        column name -> list of cell values (one per result row).
    """
    masterDict = OrderedDict()
    keysOfInterest = []
    for jobNum in range(jobRangeStart, jobRangeEnd + 1):
        inFLTitle = "photo_album_" + str(jobNum)
        inFL = resultsDir + "/photo_album_" + str(jobNum) + ".results"
        # Strip all double quotes; the fields themselves never contain data quotes.
        with open(inFL, "r") as inp:
            inFLList = [line.replace('"', '') for line in inp]

        header = inFLList[0].split("\t")
        resultList = [line.split("\t") for line in inFLList[1:]]

        # Pivot rows into columns: header name -> list of values.
        # (setdefault/append replaces the original quadratic list concatenation;
        # the original also reused loop variable `i` here, shadowing the job loop.)
        resultDict = OrderedDict()
        for row in resultList:
            for j in range(len(header)):
                resultDict.setdefault(header[j], []).append(row[j])

        # Keep only the columns the caller cares about.
        if workerData:
            keysOfInterest = list(filter(lambda x: re.search("workerid", x), resultDict.keys()))
        else:
            keysOfInterest = list(filter(lambda x: re.search("Answer", x), resultDict.keys()))
        newDict = OrderedDict()
        for key in keysOfInterest:
            newDict[key] = resultDict[key]

        masterDict[inFLTitle] = newDict
    print(keysOfInterest)
    return masterDict

Creating a reliability matrix to understand agreement of share rates for images that appear in different albums

In other words: to what degree does the share rate of image x in album a agree with its share rate in album b? (Of secondary importance.)


In [4]:
# Build (worker, image, image_album, response) tuples for the
# inter-annotator agreement computations below.
results = JRS.createResultDict(1,100)
imgAlbumDict = JRS.genImgAlbumDictFromMap("../data/imageGID_job_map_expt2_corrected.csv")

res = JRS.createResultDict(1,100,workerData=True)
masterList = []

for album in res.keys(): # loop 1
    responses = res[album]
    # Worker ids are positionally aligned with each answer column's values.
    workers = responses['workerid']
    for response in responses.keys(): # loop 2
        # Image answer columns look like "Answer.<gid>"; skip anything else.
        if 'Answer' in response and response.split(".")[1].isdigit():
            shrNoShr = []
            gid = response.split(".")[1]
            # Encode each raw answer: 1 = share, 0 = no share, "*" = missing.
            for shrNShr in responses[response]: # loop 3.1
                if len(shrNShr.split("|")) != 2: # no response captured
                    shrNoShr.append("*")
                elif shrNShr.split("|")[1] == 'share':
                    shrNoShr.append(1)
                else:
                    shrNoShr.append(0)

            # One tuple per worker: (workerid, gid, "<gid>_<album>", 1/0/"*").
            for i in range(len(workers)): # loop 3.2
                masterList.append((workers[i],gid,gid + "_" + album,shrNoShr[i]))
                
# img_mul_albms = [gid for gid in imgAlbumDict.keys() if len(imgAlbumDict[gid]) > 1]
# alphas = []
# alphas_new = []
# pis = []
# kappas = []
# for img in img_mul_albms:
#     some_dict = {}
#     for tup in masterList:
#         if tup[1] == img:
#             if tup[0] in some_dict.keys():
#                 some_dict[tup[0]].update({tup[2]:tup[3]})
#             else:
#                 some_dict[tup[0]] = {tup[2]:tup[3]}

#     some_dict_final = {worker : some_dict[worker] for worker in some_dict.keys() if len(some_dict[worker].values()) > 1}
    
#     reliability_matrix = []

#     for worker in some_dict_final.keys():
#         dct = some_dict_final[worker]
#         for gid_albm in dct.keys():
#             reliability_matrix.append((worker, gid_albm, dct[gid_albm]))

#     t = AnnotationTask(data=reliability_matrix)
#     if len(some_dict_final) > 1:
#         alphas.append(KAS.krippendorff_alpha(list(some_dict_final.values()), KAS.nominal_metric, missing_items='*'))
#         try:
#             alphas_new.append(float("{0:.3f}".format(t.alpha())))
#             pis.append(float("{0:.3f}".format(t.pi())))
#             kappas.append(float("{0:.3f}".format(t.kappa())))
#         except ZeroDivisionError as e:
#             print("Caught")
#             pis.append(1)
#             kappas.append(1)
#             alphas_new.append(1)

In [27]:
sum(alphas)/len(alphas), sum(pis)/len(pis), sum(kappas)/len(kappas), sum(alphas_new)/len(alphas_new)


Out[27]:
(0.26037227066051688,
 0.24063694267515925,
 0.4411641791044776,
 0.3316305732484076)

In [30]:
np.mean(alphas), np.std(alphas), np.mean(pis), np.std(pis), np.mean(kappas), np.std(kappas)


Out[30]:
(0.26037227066051688,
 0.51648063586027104,
 0.24063694267515925,
 0.64518920498377497,
 0.44116417910447758,
 0.5255568928975124)

In [80]:
# Group responses by "<gid>_<album>" so each album slice can be scored
# as its own annotation task.
album_wise_dict = {}
for worker, gid, gid_album, shr in masterList:
    album_wise_dict.setdefault(gid_album, []).append((worker, gid, shr))

In [93]:
# Per-album agreement: score each album's reliability matrix with
# Krippendorff's alpha, multi-pi and kappa.
alphas = []
pis = []
kappas = []
for album in album_wise_dict.keys():
    reliability_matrix = album_wise_dict[album]
    t = AnnotationTask(data=reliability_matrix)
    try:
        # Compute all three metrics before appending so a ZeroDivisionError
        # cannot leave the lists out of step (the original appended alpha
        # first, so a failure in pi() left alphas one element longer than
        # pis/kappas).
        alpha = t.alpha()
        pi = t.pi()
        kappa = t.kappa()
    except ZeroDivisionError as e:
        print("Caught")
    else:
        alphas.append(alpha)
        pis.append(pi)
        kappas.append(kappa)


Caught
Caught
Caught
Caught
Caught
Caught
Caught
Caught
Caught

In [94]:
np.mean(alphas), np.std(alphas), np.mean(pis), np.std(pis), np.mean(kappas), np.std(kappas)


Out[94]:
(0.21692423159260182,
 0.10650736948244149,
 0.14203322802064169,
 0.15090682592663776,
 0.17703867734101922,
 0.15223211774801929)

POSITION BIAS


In [97]:
# POSITION BIAS: does an image's position within the album affect its share rate?
res = JRS.createResultDict(1,100,workerData=True)
masterList = []

for album in res.keys(): # loop 1
    responses = res[album]
    workers = responses['workerid']
    pos = 1  # 1-based position of the image within the album
    for response in responses.keys(): # loop 2
        if 'Answer' in response and response.split(".")[1].isdigit():
            shrNoShr = []
            gid = response.split(".")[1]
            # Encode each raw answer: 1 = share, 0 = no share, "*" = missing.
            for shrNShr in responses[response]: # loop 3.1
                if len(shrNShr.split("|")) != 2: # no response captured
                    shrNoShr.append("*")
                elif shrNShr.split("|")[1] == 'share':
                    shrNoShr.append(1)
                else:
                    shrNoShr.append(0)

            # One (position, share) tuple per worker response.
            for i in range(len(workers)): # loop 3.2
                masterList.append((pos, shrNoShr[i]))
            pos += 1
# Drop answers with no captured response.
masterList = list(filter(lambda x : x[1] != '*',masterList))
pos_shares = [0]*20   # shares per position; assumes <= 20 images/album — TODO confirm
positions = [0] * 20  # total responses per position
for row in masterList:
    pos_shares[row[0]-1] += row[1]
    positions[row[0]-1] += 1

# Percentage of all responses occurring at each position.
positions_norm = []
for pos in positions:
    positions_norm.append(pos*100/sum(positions))


# Share rate (%) at each position.
pos_shr_norm = []
for i in range(len(positions)):
    pos_shr_norm.append(pos_shares[i]*100/positions[i])

In [119]:
# Share proportion vs. image position: bar = how often each position
# occurred, line = share rate at that position.
layout = go.Layout(
    showlegend=False,
    legend=dict(
        x=0.5,
        y=1,
        font=dict(size=20)
    ),
    xaxis=dict(
        title='Position of the image',
        ticklen=5,
        zeroline=True,
        titlefont=dict(size=20),
        tickfont=dict(size=20),
    ),
    yaxis=dict(
        ticklen=5,
        titlefont=dict(size=20),
        tickfont=dict(size=20),
        title="Share proportion (%)"
    ),
    # 'group' is the documented barmode value; 'grouped' is not a valid
    # enum member and is rejected by plotly's layout validation.
    barmode='group'
)

trace1 = go.Bar(
    x=list(range(1, 21)),
    name="No. of times image appeared in x position",
    y=positions_norm,
    opacity=0.5,
)

trace2 = go.Scatter(
    name="No. of times image shared in x position",
    x=list(range(1, 21)),
    y=pos_shr_norm,
    opacity=1,
    marker=dict(color='blue'),
    mode='lines'
)

data = [trace1, trace2]
fig = dict(data=data, layout=layout)
iplot(fig, filename="Expt2 Training data distributions")



In [5]:
import DeriveFinalResultSet as drs, htmltag as HT

def genHTMLTableFiles(shrCntsObj):
    """Return (share-proportion DataFrame, its HTML rendering, totals
    DataFrame) for a share-counts object from DeriveFinalResultSet."""
    shrPropDict = drs.getShrProp(shrCntsObj)
    totCntDict = drs.genTotCnts(shrCntsObj)
    df_prop = pd.DataFrame(shrPropDict, index=['Share Proportion']).transpose()
    df_tot = pd.DataFrame(totCntDict, index=['Total']).transpose()
    return df_prop, df_prop.to_html(bold_rows=False), df_tot

In [268]:
ftr = 'AGE'

In [69]:
def get_plots_per_ftr(ftr, ftr_txt):
    d = drs.ovrallShrCntsByFtr(drs.gidAidMapFl,drs.aidFeatureMapFl,ftr,drs.imgJobMap,1,100)
    head3 = HT.h3("Data-Frame by " + ftr)
    df1,tb1, df_tot = genHTMLTableFiles(d)
    dct = df_tot.to_dict()['Total']


    totals_dct = {}
    shares_dct = {}

    for key in dct.keys():
        if 'total' in key:
            if key[0] == 'UNIDENTIFIED' or key[0] == 'unknown':
                totals_dct['unknown'] = totals_dct.get('unknown',0) + dct[key]
            elif 'juvenile' in key[0]:
                totals_dct['juvenile'] = totals_dct.get('juvenile',0) + dct[key]
            else:
                totals_dct[key[0]] = dct[key]
        elif 'share' in key:
            if key[0] == 'UNIDENTIFIED' or key[0] == 'unknown':
                shares_dct['unknown'] = shares_dct.get('unknown',0) + dct[key]
            elif 'juvenile' in key[0]:
                shares_dct['juvenile'] = shares_dct.get('juvenile',0) + dct[key]
            else:
                shares_dct[key[0]] = dct[key]
    
    df_total = pd.DataFrame(totals_dct, index=['total']).transpose()
    df_share = pd.DataFrame(shares_dct, index=['shared']).transpose()  
    df_total.reset_index(inplace=True)
    df_share.reset_index(inplace=True)

    df = pd.merge(df_total, df_share)
    df['total_proportion'] = df.total * 100 / sum(list(df.total))
    df['share_proportion'] = df.shared * 100/ df.total
    
    df = df.sort_values(by=['total_proportion'], ascending=False)
        
    print(df)
    layout= go.Layout(
                    showlegend=False,
                    legend=dict(
                        x=0.5,
                        y=1,
                        font=dict(size=20)
                    ),
                    xaxis= dict(
                        title= ftr_txt,
                        ticklen= 5,
                        zeroline= True,
                        titlefont=dict(size=20),
                        tickfont=dict(size=15),
          # tickangle=45
                    ),
                    yaxis=dict(
                        ticklen= 5,
                        titlefont=dict(size=20),
                        tickfont=dict(size=20),
                        title="Share proportion (%)"
                        #range=range
                    ),
        barmode='grouped'
                )
    trace1 = go.Bar(
                        x = list(range(1,len(df['index'])+1)),
                        name = "No. of times image with 'X' appeared",
                        y = list(df.total_proportion),
                        opacity = 0.5,
                        # marker=dict(color='grey')

                )

    trace2 = go.Scatter(
                        name = "No. of times image with 'X' shared",
                        x = list(range(1,len(df['index'])+1)),
                        y = list(df.share_proportion),
                        opacity = 1,
                        marker=dict(color='blue'),

                )
    data = [trace1, trace2]
    return df, dict(data=data, layout=layout)

In [51]:
# Appearance/share distribution by viewpoint of the animal.
df,fig = get_plots_per_ftr('VIEW_POINT', 'View point of the animal')
iplot(fig,filename="Expt2 Training data distributions")


        index  total  shared  total_proportion  share_proportion
6        left   9916    5097         44.976641         51.401775
8     unknown   4831    2182         21.912278         45.166632
7       right   3193    1744         14.482696         54.619480
1    backleft   1545     753          7.007756         48.737864
4   frontleft   1248     738          5.660634         59.134615
2   backright    465     217          2.109130         46.666667
0        back    327     184          1.483195         56.269113
5  frontright    268     182          1.215585         67.910448
3       front    254     153          1.152084         60.236220

In [52]:
# Appearance/share distribution by age of the animal.
df,fig = get_plots_per_ftr('AGE', 'Age of the animal')
iplot(fig,filename="Expt2 Training data distributions")


      index  total  shared  total_proportion  share_proportion
0     adult   7765    4112         34.683759         52.955570
1    infant   7404    3826         33.071288         51.674770
3   unknown   4920    2220         21.976059         45.121951
2  juvenile   2299    1259         10.268894         54.762940

In [53]:
# Appearance/share distribution by species.
_,fig = get_plots_per_ftr('SPECIES', 'Species of the animal')
iplot(fig,filename="Expt2 Training data distributions")


           index  total  shared  total_proportion  share_proportion
2   zebra_plains  11315    5751         56.945143         50.826337
1        unknown   4831    2182         24.313035         45.166632
0  giraffe_masai   3724    2128         18.741822         57.142857

In [70]:
# Appearance/share distribution by photo contributor.
_,fig = get_plots_per_ftr('CONTRIBUTOR', 'ID\'s of the contributor who took images')
iplot(fig,filename="Expt2 Training data distributions")


                                                index  total  shared  \
41                                            unknown   4831    2182   
0   GIRM_MUGU_20,hyrule:joncrall:/media/raid/work/...   1383     724   
3                   NNP GZC Car '12WHITE', Person 'A'    994     547   
37  NNP_Master,pachy.cs.uic.edu:jonc:/home/shared_...    919     479   
5                   NNP GZC Car '13WHITE', Person 'B'    905     558   
14                  NNP GZC Car '1PURPLE', Person 'B'    672     367   
38  PZ_MUGU_18,hyrule:joncrall:/media/raid/work/PZ...    569     242   
20                   NNP GZC Car '1WHITE', Person 'C'    529     327   
6                   NNP GZC Car '14WHITE', Person 'A'    514     199   
18                   NNP GZC Car '1WHITE', Person 'A'    490     293   
26                  NNP GZC Car '3PURPLE', Person 'B'    480     255   
15                  NNP GZC Car '1PURPLE', Person 'D'    450     252   
25                   NNP GZC Car '2WHITE', Person 'A'    440     167   
1                   NNP GZC Car '10WHITE', Person 'A'    419     202   
32                     NNP GZC Car '6RED', Person 'B'    360     200   
28                     NNP GZC Car '3RED', Person 'B'    360     140   
39  PZ_MUGU_19,hyrule:joncrall:/media/raid/work/PZ...    350     188   
19                   NNP GZC Car '1WHITE', Person 'B'    349     173   
30                     NNP GZC Car '4RED', Person 'A'    337     193   
2                   NNP GZC Car '11WHITE', Person 'A'    310     181   
7                   NNP GZC Car '15WHITE', Person 'A'    300     195   
29                   NNP GZC Car '3WHITE', Person 'A'    299     139   
8                   NNP GZC Car '15WHITE', Person 'C'    270      77   
36                   NNP GZC Car '9WHITE', Person 'A'    257     147   
12                    NNP GZC Car '1BLUE', Person 'B'    248     189   
9                   NNP GZC Car '15WHITE', Person 'D'    208     145   
27                     NNP GZC Car '3RED', Person 'A'    200      84   
31                   NNP GZC Car '5WHITE', Person 'A'    200      93   
40  PZ_MUGU_20,hyrule:joncrall:/media/raid/work/PZ...    190     104   
24                     NNP GZC Car '2RED', Person 'D'    180     103   
23                     NNP GZC Car '2RED', Person 'C'    180      72   
17                     NNP GZC Car '1RED', Person 'B'    180     109   
16                     NNP GZC Car '1RED', Person 'A'    174      91   
10                  NNP GZC Car '17WHITE', Person 'A'    170      41   
34                   NNP GZC Car '7WHITE', Person 'B'    170      64   
13                  NNP GZC Car '1PURPLE', Person 'A'    168     111   
33                   NNP GZC Car '6WHITE', Person 'B'    150      73   
35                   NNP GZC Car '8WHITE', Person 'A'    146      80   
11                    NNP GZC Car '1BLUE', Person 'A'    130      65   
4                   NNP GZC Car '13WHITE', Person 'A'    130      64   
22                     NNP GZC Car '2RED', Person 'B'    100      46   
21                     NNP GZC Car '2RED', Person 'A'     90      53   

    total_proportion  share_proportion  
41         24.397758         45.166632  
0           6.984496         52.349964  
3           5.019948         55.030181  
37          4.641180         52.121872  
5           4.570476         61.657459  
14          3.393768         54.613095  
38          2.873592         42.530756  
20          2.671582         61.814745  
6           2.595828         38.715953  
18          2.474622         59.795918  
26          2.424120         53.125000  
15          2.272612         56.000000  
25          2.222110         37.954545  
1           2.116055         48.210024  
32          1.818090         55.555556  
28          1.818090         38.888889  
39          1.767587         53.714286  
19          1.762537         49.570201  
30          1.701934         57.270030  
2           1.565577         58.387097  
7           1.515075         65.000000  
29          1.510025         46.488294  
8           1.363567         28.518519  
36          1.297914         57.198444  
12          1.252462         76.209677  
9           1.050452         69.711538  
27          1.010050         42.000000  
31          1.010050         46.500000  
40          0.959547         54.736842  
24          0.909045         57.222222  
23          0.909045         40.000000  
17          0.909045         60.555556  
16          0.878743         52.298851  
10          0.858542         24.117647  
34          0.858542         37.647059  
13          0.848442         66.071429  
33          0.757537         48.666667  
35          0.737336         54.794521  
11          0.656532         50.000000  
4           0.656532         49.230769  
22          0.505025         46.000000  
21          0.454522         58.888889  

In [305]:


In [101]:
# Rebuild masterList as (gid, share) pairs for the EXIF/hour analysis.
results = JRS.createResultDict(1,100)
imgAlbumDict = JRS.genImgAlbumDictFromMap("../data/imageGID_job_map_expt2_corrected.csv")

res = JRS.createResultDict(1,100,workerData=True)

def _encode_share(raw):
    """Map a raw answer string to 1 (share), 0 (no share) or '*' (missing)."""
    parts = raw.split("|")
    if len(parts) != 2:  # no response captured
        return "*"
    return 1 if parts[1] == 'share' else 0

masterList = []
for album, responses in res.items():
    workers = responses['workerid']
    for response, answers in responses.items():
        if 'Answer' not in response:
            continue
        gid = response.split(".")[1]
        if not gid.isdigit():
            continue
        encoded = [_encode_share(raw) for raw in answers]
        # One (gid, share) pair per worker response.
        for i in range(len(workers)):
            masterList.append((gid, encoded[i]))

# Drop answers with no captured response.
masterList = [tup for tup in masterList if tup[1] != '*']

In [112]:
# Per-image totals and share counts.
tots = {}
shares = {}
for gid, shr in masterList:
    tots[gid] = tots.get(gid, 0) + 1
    shares[gid] = shares.get(gid, 0) + shr

In [113]:
# One DataFrame per counter, then join them on the image id ('index' column).
df_tots = pd.DataFrame(tots, index=['totals']).T.reset_index()
df_shares = pd.DataFrame(shares, index=['shares']).T.reset_index()

df = pd.merge(df_tots, df_shares)

In [114]:
# Load per-image EXIF/beauty features and derive day/hour of capture.
inpExifFl = "../data/GZC_exifs_beauty_full.json"

with open(inpExifFl,"r") as inpJsonFl:
    exifJsonObj = json.load(inpJsonFl)
exifDf = pd.DataFrame(exifJsonObj).transpose()

exifDf['date'] = exifDf['date'].apply(lambda x : datetime.strptime(x,'%Y-%m-%d %H:%M:%S'))
exifDf['day'] = exifDf.date.apply(lambda x : x.day)
exifDf['hour'] = exifDf.date.apply(lambda x : x.hour)
# Keyword form: the positional axis argument to drop() was deprecated in
# pandas 1.0 and removed in pandas 2.0.
exifDf = exifDf.drop(columns=['size', 'date'])
exifDf.reset_index(inplace=True)

In [115]:
# Join share counts with EXIF features on the image id ('index' column).
df = pd.merge(df, exifDf)
# df = df[[ 'totals', 'shares', 'hour']]
df.head()


Out[115]:
index totals shares arousal contrast dominance height hsv_itten_std_h hsv_itten_std_s hsv_itten_std_v lat long orientation pleasure symmetry width day hour
0 10 20 4 -0.0677241 1.4738 0.500873 4000 105719 140171 133193 -1.36709 36.782 1 0.441554 11.4251 6000 1 15
1 1000 10 9 0.0267916 2.2988 0.446958 3072 100419 106270 146868 -1.34968 36.806 1 0.385603 10.8675 4608 1 17
2 1003 10 2 0.00380291 1.61078 0.534058 3072 116605 145911 182645 -1.34965 36.8062 1 0.463471 9.66617 4608 1 17
3 1005 10 8 -0.049412 1.75693 0.55702 3072 102427 140823 132426 -1.34965 36.8062 1 0.488551 12.6299 4608 1 17
4 101 10 2 0.0550115 2.42374 0.444083 4000 142995 193383 250416 -1.37325 36.8003 1 0.380382 6.21572 6000 1 16

In [116]:
# Aggregate appearance and share counts by hour of capture.
by_hour = df.groupby(['hour'])
tots_hour = by_hour['totals'].sum()
shares_hour = by_hour['shares'].sum()
# >>> data.groupby(['col1', 'col2'])['col3'].mean()

In [117]:
# Reshape the per-hour Series into a single DataFrame with proportions.
tots_hour = pd.DataFrame(tots_hour.to_dict(), index=['totals']).transpose().reset_index()
shares_hour = pd.DataFrame(shares_hour.to_dict(), index=['shares']).transpose().reset_index()
df = pd.merge(tots_hour, shares_hour)
df['total_proportion'] = 100 * df.totals / df.totals.sum()

df['share_proportion'] = 100 * df.shares / df.totals

In [118]:
# Share proportion vs. hour of capture: bar = appearance share of each
# hour, line = share rate for images taken in that hour.
layout = go.Layout(
    showlegend=False,
    legend=dict(
        x=0.5,
        y=1,
        font=dict(size=20)
    ),
    xaxis=dict(
        title='Time of the day when the picture was clicked',
        ticklen=5,
        zeroline=True,
        titlefont=dict(size=20),
        tickfont=dict(size=15),
    ),
    yaxis=dict(
        ticklen=5,
        titlefont=dict(size=20),
        tickfont=dict(size=20),
        title="Share proportion (%)"
    ),
    # 'group' is the documented barmode value; 'grouped' is invalid.
    barmode='group'
)
trace1 = go.Bar(
    x=list(df["index"]),
    name="No. of times image with 'X' appeared",
    y=list(df.total_proportion),
    opacity=0.5,
)

trace2 = go.Scatter(
    name="No. of times image with 'X' shared",
    x=list(df["index"]),
    y=list(df.share_proportion),
    opacity=1,
    marker=dict(color='blue'),
)
data = [trace1, trace2]
fig = dict(data=data, layout=layout)
iplot(fig, filename="Expt2 Training data distributions")



In [88]:


In [74]:


In [170]:
df['not_shares'] = df['totals'] - df['shares']

In [173]:
contingency = [list(df['shares']) , list(df['not_shares'])]

In [143]:
def get_stat_significance_test(ftr, ftr_txt='View point of the animal'):
    """Chi-squared test of independence between feature value and sharing.

    Builds the 2xN (shared / not_shared) contingency table for feature
    ``ftr`` and returns scipy's (chi2, p, dof, expected) tuple.

    Args:
        ftr: feature name understood by get_plots_per_ftr.
        ftr_txt: axis label forwarded to the plotting helper (new,
            defaults to the previously hard-coded viewpoint text).
    """
    # Imported locally so this cell works on a fresh kernel: scipy is not
    # imported at the top of the notebook, so the original raised NameError.
    from scipy.stats import chi2_contingency

    df, fig = get_plots_per_ftr(ftr, ftr_txt)
    df.index = df['index']
    df['not_shared'] = df.total - df.shared
    # Keyword form: positional axis argument was removed in pandas 2.0.
    df = df.drop(columns=['total_proportion', 'share_proportion', 'index', 'total'])
    df = df.transpose()
    print(df)
    # Strip the record index, keeping only the per-category counts.
    obs = list(df.to_records())
    obs = [list(row)[1:] for row in obs]
    return chi2_contingency(obs)

In [149]:
chi2, p, dof, expected


Out[149]:
(563.4898346926658,
 5.1293480782924233e-93,
 41,
 array([[ 2443.19145498,   699.42740266,   502.69764153,   464.767739  ,
           457.68749053,   339.85192667,   287.7615272 ,   267.53224585,
           259.94626534,   247.80869653,   242.75137619,   227.57941518,
           222.52209484,   211.90172214,   182.06353214,   182.06353214,
           177.00621181,   176.50047977,   170.43169537,   156.77693046,
           151.71961012,   151.21387809,   136.54764911,   129.97313267,
           125.42154437,   105.19226302,   101.14640675,   101.14640675,
            96.08908641,    91.03176607,    91.03176607,    91.03176607,
            87.99737387,    85.97444574,    85.97444574,    84.96298167,
            75.85980506,    73.83687693,    65.74516439,    65.74516439,
            50.57320337,    45.51588304],
        [ 2387.80854502,   683.57259734,   491.30235847,   454.232261  ,
           447.31250947,   332.14807333,   281.2384728 ,   261.46775415,
           254.05373466,   242.19130347,   237.24862381,   222.42058482,
           217.47790516,   207.09827786,   177.93646786,   177.93646786,
           172.99378819,   172.49952023,   166.56830463,   153.22306954,
           148.28038988,   147.78612191,   133.45235089,   127.02686733,
           122.57845563,   102.80773698,    98.85359325,    98.85359325,
            93.91091359,    88.96823393,    88.96823393,    88.96823393,
            86.00262613,    84.02555426,    84.02555426,    83.03701833,
            74.14019494,    72.16312307,    64.25483561,    64.25483561,
            49.42679663,    44.48411696]]))

In [154]:
positions


Out[154]:
[990,
 989,
 996,
 990,
 992,
 989,
 994,
 989,
 986,
 993,
 995,
 987,
 989,
 991,
 988,
 989,
 994,
 993,
 993,
 974]

In [156]:
pos_not_shares = [positions[i]-pos_shares[i] for i in range(len(positions))]

In [157]:
pos_not_shares


Out[157]:
[460,
 461,
 475,
 454,
 498,
 483,
 470,
 503,
 492,
 473,
 492,
 490,
 524,
 498,
 503,
 502,
 521,
 480,
 509,
 499]

In [158]:
contigency = [pos_shares, pos_not_shares]

In [174]:
chi2_contingency(contigency)


Out[174]:
(30.718992689054222,
 0.043331726922545527,
 19,
 array([[ 500.6747134 ,  500.16898136,  503.7091056 ,  500.6747134 ,
          501.68617747,  500.16898136,  502.69764153,  500.16898136,
          498.65178526,  502.1919095 ,  503.20337357,  499.1575173 ,
          500.16898136,  501.18044543,  499.66324933,  500.16898136,
          502.69764153,  502.1919095 ,  502.1919095 ,  492.58300086],
        [ 489.3252866 ,  488.83101864,  492.2908944 ,  489.3252866 ,
          490.31382253,  488.83101864,  491.30235847,  488.83101864,
          487.34821474,  490.8080905 ,  491.79662643,  487.8424827 ,
          488.83101864,  489.81955457,  488.33675067,  488.83101864,
          491.30235847,  490.8080905 ,  490.8080905 ,  481.41699914]]))

In [176]:
len(contigency[0])


Out[176]:
20

In [195]:
# Flatten the per-image NID lists into one list of individual IDs.
d = drs.getCountingLogic("../data/Flickr_IBEIS_Ftrs_gid_aid_features.json", "../data/Flickr_IBEIS_Ftrs_aid_features.json", "NID",withNumInds=False,mode="GZC")

l = [nid for nid_list in d.values() for nid in nid_list]

In [189]:
# Load the raw annotation-feature map for a sanity check on counts.
aidFtrsPath = "../data/Flickr_IBEIS_Ftrs_aid_features.json"
with open(aidFtrsPath) as fl:
    jsonObj = json.load(fl)

In [190]:
len(jsonObj)


Out[190]:
2047

In [198]:
len(set(l))


Out[198]:
1080

In [19]:
df['total_proportion'] = df['total'] *100 / sum(df['total'])

In [21]:
df['share_prortion'] = df['shared'] * 100 / df['total']

In [29]:
a = ('UNIDENTIFIED', 'not_share')

In [37]:
dct


Out[37]:
{('UNIDENTIFIED', 'not_share'): 2649,
 ('UNIDENTIFIED', 'share'): 2182,
 ('UNIDENTIFIED', 'total'): 4831,
 ('adult', 'not_share'): 3653,
 ('adult', 'share'): 4112,
 ('adult', 'total'): 7765,
 ('infant', 'not_share'): 3578,
 ('infant', 'share'): 3826,
 ('infant', 'total'): 7404,
 ('juveniles - one year old', 'not_share'): 551,
 ('juveniles - one year old', 'share'): 604,
 ('juveniles - one year old', 'total'): 1155,
 ('juveniles- two year old', 'not_share'): 489,
 ('juveniles- two year old', 'share'): 655,
 ('juveniles- two year old', 'total'): 1144,
 ('unknown', 'not_share'): 51,
 ('unknown', 'share'): 38,
 ('unknown', 'total'): 89}

In [ ]: