In [12]:
# Notebook for extracting Microsoft AI data
# Join with share data to see the proportions
import csv
import json
import JobsMapResultsFilesToContainerObjs as ImageMap
import DeriveFinalResultSet as drs
import importlib
import pandas as pd
import htmltag as HT
from collections import OrderedDict
import matplotlib.pyplot as plt
# Path to the zebra count/tag results file.
# NOTE(review): no cell visible in this notebook reads flName - confirm it is still needed.
flName = "../data/All_Zebra_Count_Tag_Output_Results.txt"

# Disable column-width truncation so long cell values (such as the <img> markup
# built further down) are rendered in full.  None is the supported "no limit"
# value; the old -1 sentinel was deprecated in pandas 1.0 and is rejected by
# pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Project helpers: build the image/album dictionary from the job map, load the
# result dictionary, then derive the share / not-share lists per album.
# NOTE(review): the meaning of the (1, 100) arguments is defined in
# JobsMapResultsFilesToContainerObjs - presumably a job/result range; confirm.
imgAlbumDict = ImageMap.genImgAlbumDictFromMap(drs.imgJobMap)
master = ImageMap.createResultDict(1, 100)
imgShareNotShareList, noResponse = ImageMap.imgShareCountsPerAlbum(imgAlbumDict, master)
In [13]:
# Load the tagging results.  Later cells read taggedData[gid]['tags'], a list of
# dicts carrying 'name' and 'confidence'.  The context manager guarantees the
# file handle is closed instead of leaking it until garbage collection.
with open("../data/GZC_data_tagged.json") as taggedFl:
    taggedData = json.load(taggedFl)
In [14]:
# Block of code for building rank list of images shared in the descending order of their share rates
# Appended with Microsoft Image Tagging API results
# Read the per-image share counts.  newline="" lets the csv module handle line
# endings itself, as the csv documentation recommends.
with open("../FinalResults/rankListImages_expt2.csv", "r", newline="") as rnkFl:
    rnkFlCsv = csv.reader(rnkFl)
    # Skip the header row; the default keeps an empty file from raising StopIteration.
    header = next(rnkFlCsv, None)
    rnkFlLst = list(rnkFlCsv)

# One row per image.  Only 'Proportion' is converted to a number here; the
# count columns stay as the strings read from the CSV.
rnkListDf = pd.DataFrame(rnkFlLst, columns=['GID', 'Shares', 'Not Shares', 'Total', 'Proportion'])
rnkListDf['Proportion'] = rnkListDf['Proportion'].astype('float')

# Highest share proportion first.  Re-binding (rather than inplace=True) keeps
# the cell safe to re-run.
rnkListDf = rnkListDf.sort_values(by="Proportion", ascending=False)
# Build the GID -> tag-names table that is merged into the rank list and
# exported as the overall CSV.
minTagConfidence = 0.5  # retain only high confidence tags

gidFtrs = {}
for gid, entry in taggedData.items():
    tgs = entry['tags']
    confidentTags = [dic['name'] for dic in tgs if dic['confidence'] >= minTagConfidence]
    # An image with no tags at all and an image whose tags are all below the
    # threshold both get the [None] placeholder.  Previously the latter had no
    # entry, so the inner merge on GID silently dropped it from the rank list.
    gidFtrs[gid] = confidentTags if confidentTags else [None]

# One (GID, tag list) record per image.
gidFtrsLst = list(gidFtrs.items())
df = pd.DataFrame(gidFtrsLst, columns=['GID', 'tags'])
# Attach the tag lists to the share statistics (default inner join on GID).
shrPropsTags = pd.merge(rnkListDf, df, on='GID')

# The CSV is written before the URL column is added, so it carries only the
# share counts and the tags.
shrPropsTags.to_csv("../FinalResults/resultsExpt2RankList_Tags.csv", index=False)

# Add an <img> element per image for the HTML report, then order the rows by
# proportion (ties broken by GID), both descending.
shrPropsTags['URL'] = '<img src = "https://socialmediabias.blob.core.windows.net/wildlifephotos/All_Zebra_Count_Images/' + shrPropsTags['GID'] + '.jpeg" width = "350">'
shrPropsTags = shrPropsTags.sort_values(by=['Proportion', 'GID'], ascending=False)

# NOTE(review): DataFrame.to_html escapes <, > and & by default (escape=True),
# so the <img> markup above is probably emitted as literal text rather than as
# rendered images.  If thumbnails are intended, pass escape=False - confirm
# against the generated report before changing.
fullFl = HT.html(HT.body(HT.HTML(shrPropsTags.to_html(bold_rows=False, index=False))))

# The context manager closes the report even if write() raises.
with open("../FinalResults/resultsExpt2RankList_Tags.html", "w") as outputFile:
    outputFile.write(fullFl)
In [15]:
# Reload the raw rank-list rows and rebuild the GID -> tag-names map, this time keeping every tag regardless of confidence
# Re-read the raw rank-list rows (GID, shares, not shares, total, proportion)
# as lists of strings.  newline="" lets the csv module handle line endings
# itself, as the csv documentation recommends.
with open("../FinalResults/rankListImages_expt2.csv", "r", newline="") as rnkFl:
    rnkFlCsv = csv.reader(rnkFl)
    # Skip the header row; the default keeps an empty file from raising StopIteration.
    header = next(rnkFlCsv, None)
    rnkFlLst = list(rnkFlCsv)
# Map each image GID to the names of all of its tags (no confidence filter);
# an image without any tags gets the single placeholder [None].
gidFtrs = {}
for gid, entry in taggedData.items():
    tgs = entry['tags']
    if len(tgs) > 0:
        gidFtrs[gid] = [dic['name'] for dic in tgs]
    else:
        gidFtrs[gid] = [None]
In [16]:
# Accumulate, per tag, the share / not-share / total counts of every image that
# carries the tag.  An image contributes its full counts to each of its tags.
countKeys = ('share', 'not_share', 'total')

tgsShrNoShrCount = {}
for row in rnkFlLst:
    rowTags = gidFtrs[row[0]]
    rowCounts = (int(row[1]), int(row[2]), int(row[3]))
    for tag in rowTags:
        # First sighting of a tag starts from zeroed counters.
        running = tgsShrNoShrCount.setdefault(tag, {'share': 0, 'not_share': 0, 'total': 0})
        for key, amount in zip(countKeys, rowCounts):
            running[key] += amount
In [17]:
# One row per tag: share percentage derived from the accumulated counts,
# ranked by percentage and then by raw share count (both descending).
tgsShrCntDf = (
    pd.DataFrame(tgsShrNoShrCount).T
    .assign(proportion=lambda counts: counts['share'].mul(100).div(counts['total']))
    .sort_values(by=['proportion', 'share'], ascending=False)
    .loc[:, ['share', 'not_share', 'total', 'proportion']]
)
# Export the per-tag ranking as CSV (the index, i.e. the tag name, is kept) ...
tgsShrCntDf.to_csv("../FinalResults/RankListTags.csv")

# ... and as a standalone HTML page wrapping the rendered table.
fullFl = HT.html(HT.body(HT.HTML(tgsShrCntDf.to_html(bold_rows=False))))

# The context manager closes the report even if write() raises.
with open("../FinalResults/RankListTags.html", "w") as outputFile:
    outputFile.write(fullFl)
In [18]:
# Preview the first rows of the per-tag share table built above
tgsShrCntDf.head()
Out[18]:
In [21]:
# Bar chart of the share percentage for the first ten tags in the table,
# saved next to the other result files.
topTagProportions = tgsShrCntDf['proportion'].head(10)
barAxes = topTagProportions.plot.bar()
barAxes.get_figure().savefig("../FinalResults/RankListTags.png", bbox_inches='tight')
In [ ]: