notebook.community

Edit and run



In [12]:

    
# Notebook for extracting Microsoft AI data 
# Join with share data to see the proportions

import csv
import json
import JobsMapResultsFilesToContainerObjs as ImageMap
import DeriveFinalResultSet as drs
import importlib
import pandas as pd
import htmltag as HT
from collections import OrderedDict
import matplotlib.pyplot as plt
# Path of the raw zebra count/tag results file.
flName = "../data/All_Zebra_Count_Tag_Output_Results.txt"
# Disable column-width truncation. `None` is the supported "no limit" value;
# the old `-1` sentinel was deprecated in pandas 1.0.
pd.set_option('display.max_colwidth', None)
# Image -> album mapping derived from the job map, plus the result dictionary
# built by ImageMap.createResultDict(1, 100).
imgAlbumDict = ImageMap.genImgAlbumDictFromMap(drs.imgJobMap)
master = ImageMap.createResultDict(1,100)
# Per-album share counts and the no-response records, as returned by
# ImageMap.imgShareCountsPerAlbum.
imgShareNotShareList,noResponse = ImageMap.imgShareCountsPerAlbum(imgAlbumDict,master)



In [13]:

    
# Load the tagged image data (indexed by GID in later cells); the context
# manager guarantees the file handle is closed once parsing finishes.
with open("../data/GZC_data_tagged.json") as taggedFl:
    taggedData = json.load(taggedFl)



In [14]:

    
# Build the rank list of images in descending order of their share rates,
# joined with the Microsoft Image Tagging API results, and export it as
# both CSV and HTML.

# Only tags with at least this confidence are retained.
MIN_TAG_CONFIDENCE = 0.5

with open("../FinalResults/rankListImages_expt2.csv","r") as rnkFl:
    rnkFlCsv = csv.reader(rnkFl)
    header = next(rnkFlCsv)  # skip the header row; column names are set explicitly below
    rnkFlLst = list(rnkFlCsv)

rnkListDf = pd.DataFrame(rnkFlLst,columns=['GID','Shares','Not Shares','Total','Proportion'])
# csv.reader yields strings; convert so the sort is numeric rather than lexicographic.
rnkListDf['Proportion'] = rnkListDf['Proportion'].astype('float')
rnkListDf = rnkListDf.sort_values(by="Proportion",ascending=False)

# Map every GID to the names of its high-confidence tags.
gidFtrs = {}
for gid in taggedData:
    confidentTags = [dic['name'] for dic in taggedData[gid]['tags']
                     if dic['confidence'] >= MIN_TAG_CONFIDENCE]
    # Images with no tags at all, or with only low-confidence tags, get a
    # [None] placeholder so they are not silently dropped by the inner merge below.
    gidFtrs[gid] = confidentTags if confidentTags else [None]

gidFtrsLst = list(gidFtrs.items())

df = pd.DataFrame(gidFtrsLst,columns=['GID','tags'])

shrPropsTags = pd.merge(rnkListDf,df,left_on='GID',right_on='GID')

# The CSV is written before the image-markup column is added, so it holds data only.
shrPropsTags.to_csv("../FinalResults/resultsExpt2RankList_Tags.csv",index=False)
shrPropsTags['URL'] = '<img src = "https://socialmediabias.blob.core.windows.net/wildlifephotos/All_Zebra_Count_Images/' + shrPropsTags['GID'] + '.jpeg" width = "350">'

shrPropsTags = shrPropsTags.sort_values(by=['Proportion','GID'],ascending=False)
# escape=False keeps the <img> markup intact; to_html escapes '<', '>' and '&'
# by default, which would render the tag as literal text instead of an image.
fullFl = HT.html(HT.body(HT.HTML(shrPropsTags.to_html(bold_rows = False,index=False,escape=False))))

# Context manager closes the file even if the write fails.
with open("../FinalResults/resultsExpt2RankList_Tags.html","w") as outputFile:
    outputFile.write(fullFl)



In [15]:

    
# Reload the expt2 rank list rows and rebuild the GID -> tag-name mapping,
# this time keeping every tag regardless of its confidence.

with open("../FinalResults/rankListImages_expt2.csv","r") as rnkFl:
    rnkFlCsv = csv.reader(rnkFl)
    header = next(rnkFlCsv)  # first row holds the column names, not data
    rnkFlLst = list(rnkFlCsv)

# An image without any tags is represented by a single None placeholder.
gidFtrs = {
    gid: [tagInfo['name'] for tagInfo in record['tags']] or [None]
    for gid, record in taggedData.items()
}



In [16]:

    
# Accumulate share / not-share / total counts per tag: every tag attached to an
# image is credited with that image's full counts from the rank list row.
tgsShrNoShrCount = {}
for rankRow in rnkFlLst:
    imgTags = gidFtrs[rankRow[0]]  # row layout: GID, shares, not shares, total, proportion
    nShare, nNotShare, nTotal = int(rankRow[1]), int(rankRow[2]), int(rankRow[3])
    for tagName in imgTags:
        # First sighting of a tag starts it at zero for all three counters.
        tagTotals = tgsShrNoShrCount.setdefault(tagName, {'share' : 0,'not_share' : 0,'total' : 0})
        tagTotals['share'] += nShare
        tagTotals['not_share'] += nNotShare
        tagTotals['total'] += nTotal



In [17]:

    
# Turn the per-tag counters into a table (one row per tag), add the share
# percentage, rank by it, and export the result as CSV and HTML.
tgsShrCntDf = pd.DataFrame(tgsShrNoShrCount).transpose()
tgsShrCntDf['proportion'] = tgsShrCntDf['share'] * 100 / tgsShrCntDf['total']
# Highest proportion first; ties are broken by the raw share count.
tgsShrCntDf = tgsShrCntDf.sort_values(by=['proportion','share'],ascending=False)
tgsShrCntDf = tgsShrCntDf[['share','not_share','total','proportion']]
tgsShrCntDf.to_csv("../FinalResults/RankListTags.csv")

fullFl = HT.html(HT.body(HT.HTML(tgsShrCntDf.to_html(bold_rows = False))))

# Context manager closes the file even if the write fails.
with open("../FinalResults/RankListTags.html","w") as outputFile:
    outputFile.write(fullFl)



In [18]:

    
# Preview the first five rows of the per-tag share table.
tgsShrCntDf.head(n=5)









    Out[18]:






  
    
      
      share
      not_share
      total
      proportion
    
  
  
    
      bustard
      10
      0
      10
      100
    
    
      hawk
      10
      0
      10
      100
    
    
      oystercatcher
      10
      0
      10
      100
    
    
      goose
      10
      0
      10
      100
    
    
      ibis
      10
      0
      10
      100



In [21]:

    
# Bar chart of the share proportion for the first ten tags in the table,
# written to disk as a PNG with a tight bounding box.
propAx = tgsShrCntDf['proportion'].head(10).plot.bar()
propAx.get_figure().savefig("../FinalResults/RankListTags.png",bbox_inches='tight')



In [ ]:

	share	total	proportion
bustard	10	10	100
hawk	10	10	100
oystercatcher	10	10	100
goose	10	10	100
ibis	10	10	100