In [12]:
# Notebook for extracting Microsoft Image Tagging API data.
# The tags are joined with the share data to compute share proportions.

import csv
import json
import JobsMapResultsFilesToContainerObjs as ImageMap
import DeriveFinalResultSet as drs
import importlib
import pandas as pd
import htmltag as HT
from collections import OrderedDict
import matplotlib.pyplot as plt
# Path to the raw count/tag results file.
flName = "../data/All_Zebra_Count_Tag_Output_Results.txt"

# Do not truncate long cell contents when frames are rendered.
# `None` means "no limit"; the legacy sentinel -1 was deprecated in pandas 1.0
# and is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Derive the share / not-share counts from the job map.
# NOTE(review): the helpers live in project modules not visible here; judging by
# their names they build an image->album dict and a master result dict for
# jobs 1..100 -- confirm against JobsMapResultsFilesToContainerObjs.
imgAlbumDict = ImageMap.genImgAlbumDictFromMap(drs.imgJobMap)
master = ImageMap.createResultDict(1,100)
imgShareNotShareList,noResponse = ImageMap.imgShareCountsPerAlbum(imgAlbumDict,master)

In [13]:
# Load the tagging results. Later cells read it as a dict keyed by GID whose
# values carry a 'tags' list of dicts with 'name' and 'confidence' entries.
# The context manager closes the file handle deterministically.
with open("../data/GZC_data_tagged.json") as taggedFl:
    taggedData = json.load(taggedFl)

In [14]:
# Build the rank list of images in descending order of their share rates,
# appended with the Microsoft Image Tagging API results, and write it out as
# both CSV and HTML.

# Tags below this confidence are discarded.
MIN_TAG_CONFIDENCE = 0.5

# --- Per-image share statistics -----------------------------------------
# newline="" is what the csv module documents for files given to csv.reader.
with open("../FinalResults/rankListImages_expt2.csv","r",newline="") as rnkFl:
    rnkFlCsv = csv.reader(rnkFl)
    header = next(rnkFlCsv, None)  # skip the header row; tolerate an empty file
    rnkFlLst = list(rnkFlCsv)

rnkListDf = pd.DataFrame(rnkFlLst,columns=['GID','Shares','Not Shares','Total','Proportion'])
# csv.reader yields strings; Proportion must be numeric to sort correctly.
rnkListDf['Proportion'] = rnkListDf['Proportion'].astype('float')
rnkListDf = rnkListDf.sort_values(by="Proportion",ascending=False)

# --- GID -> high-confidence tag names -----------------------------------
# Every GID gets an entry. Images with no tag at or above the threshold
# (including images with no tags at all) get the [None] placeholder; previously
# an image whose tags were all low-confidence had no entry and silently
# disappeared from the inner merge below.
gidFtrs = {}
for gid, record in taggedData.items():
    keptTags = [tag['name'] for tag in record['tags']
                if tag['confidence'] >= MIN_TAG_CONFIDENCE]
    gidFtrs[gid] = keptTags if keptTags else [None]

gidFtrsLst = list(gidFtrs.items())
df = pd.DataFrame(gidFtrsLst,columns=['GID','tags'])

# Inner join: only GIDs present in both the rank list and the tag data survive.
shrPropsTags = pd.merge(rnkListDf,df,on='GID')

# The CSV is written before the URL column is added, so it carries only
# the share statistics and the tags.
shrPropsTags.to_csv("../FinalResults/resultsExpt2RankList_Tags.csv",index=False)
shrPropsTags['URL'] = '<img src = "https://socialmediabias.blob.core.windows.net/wildlifephotos/All_Zebra_Count_Images/' + shrPropsTags['GID'] + '.jpeg" width = "350">'

shrPropsTags = shrPropsTags.sort_values(by=['Proportion','GID'],ascending=False)
# NOTE(review): DataFrame.to_html escapes '<' and '>' by default, so the <img>
# markup in the URL column may end up as literal text -- confirm whether
# escape=False is wanted here.
fullFl = HT.html(HT.body(HT.HTML(shrPropsTags.to_html(bold_rows = False,index=False))))

# The context manager guarantees the handle is closed even if the write fails.
with open("../FinalResults/resultsExpt2RankList_Tags.html","w") as outputFile:
    outputFile.write(fullFl)

In [15]:
# create an overall giant csv
# Re-read the raw rank-list rows and rebuild the GID -> tag-name mapping, this
# time keeping every tag regardless of its confidence.

# newline="" is what the csv module documents for files given to csv.reader.
with open("../FinalResults/rankListImages_expt2.csv","r",newline="") as rnkFl:
    rnkFlCsv = csv.reader(rnkFl)
    header = next(rnkFlCsv, None)  # skip the header row; tolerate an empty file
    rnkFlLst = list(rnkFlCsv)

# Images without any tags get the [None] placeholder so that every GID in the
# tag data has an entry. Built in one pass instead of repeated list
# concatenation.
gidFtrs = {
    gid: [tag['name'] for tag in record['tags']] or [None]
    for gid, record in taggedData.items()
}

In [16]:
# Accumulate share / not-share / total counts per tag: each image's counts are
# added to every tag attached to that image.
tgsShrNoShrCount = {}
for rankRow in rnkFlLst:
    imgTags = gidFtrs[rankRow[0]]
    # Row layout used here: [GID, shares, not shares, total, ...] as strings.
    nShare, nNotShare, nTotal = int(rankRow[1]), int(rankRow[2]), int(rankRow[3])
    for tag in imgTags:
        tagCounts = tgsShrNoShrCount.setdefault(tag, {'share' : 0,'not_share' : 0,'total' : 0})
        tagCounts['share'] += nShare
        tagCounts['not_share'] += nNotShare
        tagCounts['total'] += nTotal

In [17]:
# Turn the per-tag counters into a table (one row per tag), add the share
# percentage, rank the tags and export the result as CSV and HTML.
tgsShrCntDf = pd.DataFrame(tgsShrNoShrCount).transpose()
# Percentage of responses that were shares.
tgsShrCntDf['proportion'] = tgsShrCntDf['share'] * 100 / tgsShrCntDf['total']
# Highest proportion first; ties broken by the raw share count.
tgsShrCntDf = tgsShrCntDf.sort_values(by=['proportion','share'],ascending=False)
tgsShrCntDf = tgsShrCntDf[['share','not_share','total','proportion']]
tgsShrCntDf.to_csv("../FinalResults/RankListTags.csv")

fullFl = HT.html(HT.body(HT.HTML(tgsShrCntDf.to_html(bold_rows = False))))

# The context manager guarantees the handle is closed even if the write fails.
with open("../FinalResults/RankListTags.html","w") as outputFile:
    outputFile.write(fullFl)

In [18]:
# Preview the first five rows of the tag rank table.
tgsShrCntDf.head(n=5)


Out[18]:
share not_share total proportion
bustard 10 0 10 100
hawk 10 0 10 100
oystercatcher 10 0 10 100
goose 10 0 10 100
ibis 10 0 10 100

In [21]:
# Bar chart of the share proportion for the first ten tags of the rank table,
# saved as a PNG. An explicit figure/axes pair is used, and the plot is
# labelled so the saved image is readable on its own.
fig, ax = plt.subplots()
tgsShrCntDf['proportion'].head(10).plot(kind='bar', ax=ax)
ax.set_xlabel("Tag")
ax.set_ylabel("Share proportion (%)")
ax.set_title("Share proportion of the top 10 ranked tags")
fig.savefig("../FinalResults/RankListTags.png",bbox_inches='tight')

In [ ]: