Notebook name: VisualizeResults.ipynb

Author: Sreejith Menon (smenon8@uic.edu)

Contains code for generating the various visuals and plots that are currently hosted on the Results home page.

Highlights:

  • Rank list of all images with results
  • Share proportion comparisons based on single features
  • Share proportion comparisons based on two features (all-pairs)
  • Position Bias - Share proportion changes based on the position of the image in the Mechanical Turk album
  • Responses to general questions asked in the mechanical turk jobs.

In [24]:
import csv
import htmltag as HT
import DeriveFinalResultSet as drs
import JobsMapResultsFilesToContainerObjs as ImageMap
import pandas as pd
import matplotlib.pyplot as plt
import importlib
import statistics as s
importlib.reload(drs)
importlib.reload(ImageMap)
pd.set_option('display.max_colwidth', -1)
from collections import Counter
import plotly.plotly as py
import cufflinks as cf
cf.go_offline()
import GetPropertiesAPI as GP
importlib.reload(GP)


Out[24]:
<module 'GetPropertiesAPI' from '/Users/sreejithmenon/Google Drive/Project/AnimalPhotoBias/script/GetPropertiesAPI.py'>

In [ ]:
def genHTMLTableFiles(shrCntsObj):
    """Build a one-column 'Share Proportion' table from a share-counts object.

    The counts object is passed to ``drs.getShrProp`` and the result is loaded
    into a DataFrame that is transposed so every key becomes a row and the
    single column is labelled 'Share Proportion'.

    Parameters
    ----------
    shrCntsObj
        Whatever ``drs.getShrProp`` accepts (opaque to this function).

    Returns
    -------
    tuple
        ``(DataFrame, str)`` - the table and its HTML rendering
        (``to_html(bold_rows=False)``).
    """
    proportions = drs.getShrProp(shrCntsObj)
    table = pd.DataFrame(proportions, index=['Share Proportion']).T
    markup = table.to_html(bold_rows=False)
    return table, markup

In [ ]:
# Generate rank list of all images by share proportion.
# The first five fields of every CSV row are rendered under the headers
# GID / Share count / Not Share count / Total Count / Share Proportion,
# followed by a thumbnail whose URL is derived from the GID (field 0).
with open("../FinalResults/rankListImages_expt2.csv","r",newline="") as rnkFl:
    rnkFlCsv = csv.reader(rnkFl)
    header = next(rnkFlCsv, None)  # discard the header row (None if the file is empty)
    rnkFlLst = list(rnkFlCsv)

# First table row holds the column headers.
trTgs = [HT.tr(HT.th("GID"),HT.th("Share count"),HT.th("Not Share count"),
               HT.th("Total Count"),HT.th("Share Proportion"),HT.th("Image"))]

for tup in rnkFlLst:
    # Explicit indices (not a slice) so a short row still raises IndexError.
    tdTgs = [HT.td(tup[col]) for col in range(5)]
    url = "https://socialmediabias.blob.core.windows.net/wildlifephotos/All_Zebra_Count_Images/" + tup[0] + ".jpeg"
    tdTgs.append(HT.td(HT.img(src = url,alt = "Unavailable",width = "300")))
    trTgs.append(HT.tr(*tdTgs))

fullFile = HT.html(HT.body(HT.table(HT.HTML('  \n'.join(trTgs)),border="1")))

# Context manager guarantees the handle is closed even if the write fails.
with open("../data/resultsExpt2RankList1.html","w") as outputFile:
    outputFile.write(fullFile)

In [ ]:
# Generate the share proportion tables for pair-wise features.
# One <h3> heading plus one HTML table is produced per (first, second) feature
# pair; all of them are written, in the order listed here, to twoFeatures.html.
twoFtrPairs = [
    ("SPECIES","AGE"),
    ("SPECIES","SEX"),
    ("SPECIES","VIEW_POINT"),
    ("SPECIES","QUALITY"),
    ("SPECIES","EXEMPLAR_FLAG"),
    ## *******
    ("VIEW_POINT","SPECIES"),
    ("VIEW_POINT","AGE"),
    ("VIEW_POINT","SEX"),
    ("VIEW_POINT","QUALITY"),
    ("VIEW_POINT","EXEMPLAR_FLAG"),
    ## *******
    ("SEX","SPECIES"),
    ("SEX","AGE"),
    ("SEX","QUALITY"),
    ("SEX","EXEMPLAR_FLAG"),
    ("SEX","VIEW_POINT"),
    ## *******
    ("AGE","SPECIES"),
    ("AGE","SEX"),
    ("AGE","QUALITY"),
    ("AGE","EXEMPLAR_FLAG"),
    ("AGE","VIEW_POINT"),
]

twoFtrPageParts = []
for ftr1, ftr2 in twoFtrPairs:
    d = drs.ovrallShrCntsByTwoFtrs(drs.gidAidMapFl,drs.aidFeatureMapFl,ftr1,ftr2,drs.imgJobMap,1,100)
    pairDf, pairTb = genHTMLTableFiles(d)
    pairHead = HT.h3("Data-Frame by " + ftr1 + "-" + ftr2)
    # HT.HTML marks the fragments as ready-made markup, as the page body expects.
    twoFtrPageParts.append(HT.HTML(pairHead))
    twoFtrPageParts.append(HT.HTML(pairTb))

fullFl = HT.html(HT.body(*twoFtrPageParts))

# Context manager guarantees the handle is closed even if the write fails.
with open("../FinalResults/twoFeatures.html","w") as outputFile:
    outputFile.write(fullFl)

In [ ]:
def getHTMLTabForFtr(ftr):
    """Assemble the share-proportion table, heading and bar chart for one feature.

    Steps, in order:
      1. Get share counts for ``ftr`` via ``drs.ovrallShrCntsByFtr`` and turn
         them into a 'Share Proportion' table with ``genHTMLTableFiles``.
      2. Sort the table by share proportion (descending) and chart it with
         ``iplot`` (presumably supplied by cufflinks - confirm) under the
         file name ``<ftr>_expt2``; keep the chart's ``embed_code``.
      3. Build a second table from ``drs.genVarStddevShrPropAcrsAlbms`` and
         left-join it onto the first on the feature-value column.

    Parameters
    ----------
    ftr : str
        Feature name; also used as the key-column label and in the heading.

    Returns
    -------
    tuple
        ``(merged DataFrame, <h3> heading, merged table as HTML, embed code)``.
    """
    shrCnts = drs.ovrallShrCntsByFtr(drs.gidAidMapFl,drs.aidFeatureMapFl,ftr,drs.imgJobMap,1,100)
    heading = HT.h3("Data-Frame by " + ftr)

    propDf, _propHtml = genHTMLTableFiles(shrCnts)
    propDf = propDf.sort_values(by=['Share Proportion'], ascending=False)

    # Chart while the feature values are still the index (before reset_index).
    chart = propDf.iplot(kind='bar', filename=str(ftr + '_expt2'))
    embedCode = chart.embed_code

    propDf = propDf.reset_index()
    propDf.columns = [ftr, 'Share Proportion']

    a, b, c = drs.genObjsForConsistency(drs.gidAidMapFl,drs.aidFeatureMapFl,ftr,drs.imgJobMap)
    consistency = drs.getConsistencyDict(a, b, c)

    statsDf = pd.DataFrame(drs.genVarStddevShrPropAcrsAlbms(consistency)).transpose().reset_index()
    statsDf.columns = [ftr, 'mean', 'standard_deviation', 'variance']

    merged = pd.merge(propDf, statsDf, left_on=ftr, right_on=ftr, how='left')
    mergedHtml = merged.to_html(bold_rows=False, index=False)

    return merged, heading, mergedHtml, embedCode

In [ ]:
# Generate the share proportion tables and visuals (bar diagrams) for single features.
# NOTE(review): the original list rendered "SEX" twice (1st and 3rd section) and
# never rendered SPECIES; the duplicate third entry is now "SPECIES" - confirm the
# intended ordering.
singleFtrs = ["SEX","AGE","SPECIES","VIEW_POINT","QUALITY","EXEMPLAR_FLAG","CONTRIBUTOR"]

oneFtrPageParts = []
for ftr in singleFtrs:
    ftrDf, ftrHead, ftrTb, ftrEmbed = getHTMLTabForFtr(ftr)
    # HT.HTML (not HT.html) so the embed code is inserted as markup, the same way
    # the position-bias page does it, instead of being wrapped in an <html> tag.
    oneFtrPageParts.extend([HT.HTML(ftrHead), HT.HTML(ftrTb), HT.HTML(ftrEmbed)])

fullFl = HT.html(HT.body(*oneFtrPageParts))

# Context manager guarantees the handle is closed even if the write fails.
with open("../FinalResults/oneFeature.html","w") as outputFile:
    outputFile.write(fullFl)

plt.close('all')

In [ ]:
# Album-level feature table for SPECIES / AGE / SEX, exported as CSV and HTML.
df = pd.DataFrame(drs.genAlbmFtrs(drs.gidAidMapFl,drs.aidFeatureMapFl,drs.imgJobMap,['SPECIES','AGE','SEX'])).transpose()
fullFl= HT.html(HT.body(HT.HTML(df.to_html(bold_rows = False))))

# NOTE(review): index=False drops the row labels from the CSV, while the HTML
# version below keeps them - confirm this difference is intended.
df.to_csv("../FinalResults/albumProperties.csv",index=False)

# Context manager guarantees the handle is closed even if the write fails.
with open("../FinalResults/albumProperties.html","w") as outputFile:
    outputFile.write(fullFl)

In [ ]:
# DO NOT RUN AGAIN WITHOUT TAKING BACKUP OF THE HTML FILE
# Share proportion of the same images across albums, annotated with the
# standard deviation / variance computed by drs.genVarStddevShrPropAcrsAlbms,
# exported as CSV and as an HTML page with image thumbnails.
imgAlbmShrs,consistency = drs.getShrPropImgsAcrossAlbms(drs.imgJobMap,1,100,"../FinalResults/shareRateSameImgsAcrossAlbums.json")
df = pd.DataFrame(imgAlbmShrs,index=["Share Proportion"]).transpose()
gidShrVarStdDevDict = drs.genVarStddevShrPropAcrsAlbms(consistency)
df2 = pd.DataFrame(gidShrVarStdDevDict).transpose()
df2.reset_index(inplace=True)
df2.columns = ['GID','Standard Deviation','Variance']

# Only the first row of each level-0 group receives the statistics (and, below,
# the thumbnail); the remaining rows of the group are left empty.
subindex = df.groupby(level=0).head(1).index
subindex2 = df2.groupby(level=0).head(1)['Standard Deviation']
subindex3 = df2.groupby(level=0).head(1)['Variance']
# .values replaces Series.get_values(), which was removed in pandas 1.0;
# positional assignment assumes df and df2 groups line up in the same order.
df.loc[subindex, 'Standard Deviation'] = subindex2.values
df.loc[subindex, 'Variance'] = subindex3.values

# The CSV is written before the URL column is added, so it holds numbers only.
df.to_csv("../FinalResults/shareRateSameImgsAcrossAlbums.csv")

df.loc[subindex, 'URL'] = '<img src = "https://socialmediabias.blob.core.windows.net/wildlifephotos/All_Zebra_Count_Images/' + subindex.get_level_values(0) + '.jpeg" width = "350">'
df = df.fillna("")

fullFl= HT.html(HT.body(HT.HTML(df.to_html(bold_rows = False))))
# Context manager guarantees the handle is closed even if the write fails.
with open("../FinalResults/shareRateSameImgsAcrossAlbums.html","w") as outputFile:
    outputFile.write(fullFl)

# Images ranked by standard deviation of share proportion, highest first.
df = pd.DataFrame(sorted(gidShrVarStdDevDict.items(),key = lambda x : x[1]['standard_deviation'],reverse=True),columns=['GID','Stats'])
df.to_csv("../FinalResults/ImgsStdDevDesc.csv",index=False)

In [ ]:
# Position bias: share statistics per image position, rendered as a table plus
# an interactive line chart and written to PositionBias1.html.
summaryPosCnt = drs.getPosShrProptn(drs.imgJobMap,1,100)

df = pd.DataFrame(summaryPosCnt).transpose()
cols = ['share','not_share','total','proportion']
df = df[cols]

fig = df.iplot(kind='line',filename=str('Position_bias' + '_expt2' ))
iframe = fig.embed_code

# The chart is placed inside <body>, after the table; previously it was passed
# as a second child of <html>, i.e. outside the body element.
fullFl= HT.html(HT.body(HT.HTML(df.to_html(bold_rows = False)),HT.HTML(iframe)))

# Context manager guarantees the handle is closed even if the write fails.
with open("../FinalResults/PositionBias1.html","w") as outputFile:
    outputFile.write(fullFl)

In [ ]:
# Static matplotlib line plot of the share / not_share / proportion columns of
# the frame currently bound to `df` (requires the cell that builds it to have run).
df.loc[:, ['share','not_share','proportion']].plot()
plt.show()

In [ ]:
# Overall share statistics ranked by shared proportion of images along with features and tags
df = ImageMap.createMstrFl("../data/resultsFeaturesComb_expt2.csv",[ 'GID', 'AID','Album', 'AGE','EXEMPLAR_FLAG', 'INDIVIDUAL_NAME', 'NID', 'QUALITY', 'SEX', 'SPECIES','VIEW_POINT','CONTRIBUTOR'])

# pd.DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True reproduces its defaults.
dfRes = pd.read_csv("../FinalResults/resultsExpt2RankList_Tags.csv",index_col=0,parse_dates=True)
dfRes.reset_index(inplace=True)
# GID is cast to str so it can be concatenated into the image URL below.
dfRes['GID'] = dfRes['GID'].astype(str)
dfRes['URL'] = '<img src = "https://socialmediabias.blob.core.windows.net/wildlifephotos/All_Zebra_Count_Images/' + dfRes['GID'] + '.jpeg" width = "350">'

dfSummary = pd.merge(df,dfRes,left_on ='GID',right_on='GID')
dfSummary.sort_values(by='Proportion',ascending=False,inplace=True)
dfSummary = dfSummary[['GID','Album','AGE','INDIVIDUAL_NAME','QUALITY','SEX','SPECIES','VIEW_POINT','CONTRIBUTOR','tags','Shared','Not Shared','Total','Proportion','URL']]
# NOTE(review): the CSV goes to /tmp while the HTML goes to ../FinalResults - confirm.
dfSummary.to_csv("/tmp/ImgShrRnkListWithTags.csv",index=False)

fullFl= HT.html(HT.body(HT.HTML(dfSummary.to_html(bold_rows = False,index=False))))
# Context manager guarantees the handle is closed even if the write fails.
with open("../FinalResults/ImgShrRnkListWithTags.html","w") as outputFile:
    outputFile.write(fullFl)

In [ ]:
# Visualizations for general questions asked in the mechanical turk
# For each question: drop blank answers, tabulate the counts in descending
# order and render an interactive bar chart, keeping its embed code.
ans = ImageMap.genCntrsGenQues(1,100,['Answer.q1','Answer.q2'])

# --- Question 1: answers are used as-is for the row labels ---
q1 = {resp : cnt for resp, cnt in ans['Answer.q1'].items() if resp != ''}
dfQ1 = pd.DataFrame(q1,index=['Counts']).transpose().sort_values(by='Counts',ascending=False)

fig = dfQ1.iplot(kind='bar',filename="Frequency of posting pictures",title="How frequently do you share pictures on social media")
iframe = fig.embed_code
iframe

# --- Question 2: answer codes are translated to readable ranges ---
mapVal = {
    'A' : 'None',
    'B' : '1 to 5',
    'C' : '5 to 10',
    'D' : '10 to 50',
    'E' : '50 or more',
}

q2 = {mapVal[resp] : cnt for resp, cnt in ans['Answer.q2'].items() if resp != ''}
dfQ2 = pd.DataFrame(q2,index=['Counts']).transpose().sort_values(by='Counts',ascending=False)

fig = dfQ2.iplot(kind='bar',filename="Number of photos people share after safari",title="How many photos will you share on social media after a safari")
iframe2 = fig.embed_code
iframe2

In [ ]: