In [1]:
import JobsMapResultsFilesToContainerObjs as ImageMap
import pandas as pd
import PopulationEstimatorFromClf as PE
import importlib
importlib.reload(PE)
import cufflinks as cf # this is necessary to link pandas to plotly
cf.go_offline()
import json
import plotly.graph_objs as go
import htmltag as HT
import MarkRecapHelper as MR
importlib.reload(MR)
import DeriveFinalResultSet as DRS
import DataStructsHelperAPI as DS
importlib.reload(DS)
import random
In [ ]:
# Attribute names handed to ImageMap.createMstrFl below.
attribs = [
    'GID',
    'AID',
    'AGE',
    'EXEMPLAR_FLAG',
    'INDIVIDUAL_NAME',
    'NID',
    'QUALITY',
    'SEX',
    'SPECIES',
    'VIEW_POINT',
    'CONTRIBUTOR',
]

# The same CSV path is the last argument of genGidAidFtrDf and the first
# argument of createMstrFl, so it is named once here.
# NOTE(review): presumably the first call writes this file and the second
# reads it back -- confirm against JobsMapResultsFilesToContainerObjs.
gid_aid_ftr_csv = "../data/full_gid_aid_ftr.csv"

df = ImageMap.genGidAidFtrDf(
    "../data/full_gid_aid_map.json",
    "../data/full_aid_features.json",
    gid_aid_ftr_csv,
)
df_comb = ImageMap.createMstrFl(
    gid_aid_ftr_csv,
    "../data/GZC_data_tagged.json",
    attribs,
    "../data/full_gid_aid_ftr_agg.csv",
)
In [ ]:
# Parse the stored population-estimate results; resObj is turned into a
# DataFrame by the following cell.
results_path = "../FinalResults/PopulationEstimate.json"
with open(results_path, "r") as results_fl:
    resObj = json.load(results_fl)
In [ ]:
# Reference values the estimates are compared against. As used below:
#   error            = predicted population - reference population
#   normalized error = error / reference population
ACTUAL_TOTAL_POP = 3620
ACTUAL_ZEBRA_POP = 3468
ACTUAL_GIRAFFE_POP = 177
# Denominator of the shared-image percentage.
# NOTE(review): presumably the total number of images -- confirm.
TOTAL_IMAGE_COUNT = 6523

df = pd.DataFrame(resObj)
# One label per (classifier, attribute-selection) pair; used as the plot axis.
df['Axes Name'] = df['Classifier'] + " " + df['Attribute']

# .copy() makes the selection an independent frame, so the column assignments
# below do not write to a slice of another frame (SettingWithCopyWarning).
df = df[['Axes Name', 'all', 'giraffes', 'zebras', 'shared_images_count']].copy()
df['Error_total_pop'] = df['all'] - ACTUAL_TOTAL_POP
df['Error_zebra_pop'] = df['zebras'] - ACTUAL_ZEBRA_POP
df['Error_giraffe_pop'] = df['giraffes'] - ACTUAL_GIRAFFE_POP
df['Predicted_Shared_proportion'] = df['shared_images_count'] * 100 / TOTAL_IMAGE_COUNT

# Master table: same rows, columns reordered so each estimate sits next to its error.
dfFull = df[[
    'Axes Name',
    'all', 'Error_total_pop',
    'zebras', 'Error_zebra_pop',
    'giraffes', 'Error_giraffe_pop',
    'shared_images_count', 'Predicted_Shared_proportion',
]].copy()
dfFull['norm_error_total_pop'] = dfFull['Error_total_pop'] / ACTUAL_TOTAL_POP
dfFull['norm_error_zebra_pop'] = dfFull['Error_zebra_pop'] / ACTUAL_ZEBRA_POP
dfFull['norm_error_giraffe_pop'] = dfFull['Error_giraffe_pop'] / ACTUAL_GIRAFFE_POP
dfFull.head()
In [ ]:
# Absolute-error columns indexed by 'Axes Name'.
# set_index replaces the previous "assign index, then drop(..., 1, inplace=True)":
#   * the positional axis argument to DataFrame.drop is not accepted by pandas >= 2.0;
#   * in-place drop on a column selection mutates a slice and raises SettingWithCopyWarning;
#   * the index now comes from dfFull itself rather than the separate global `df`,
#     which a later cell rebinds.
dfErrors = dfFull[
    ['Axes Name', 'Error_total_pop', 'Error_zebra_pop', 'Error_giraffe_pop']
].set_index('Axes Name')
In [ ]:
# Axis descriptions for the absolute-error bar chart, built up front and then
# passed to go.Layout.
x_axis = {
    'title': "Classifier and Attribute Selection method",
    'titlefont': {'size': 15},
    'showticklabels': True,
    'tickangle': 35,
    'tickfont': {'size': 9, 'color': 'black'},
}
y_axis = {
    'title': "Absolute Error",
    'titlefont': {'size': 15},
    'showticklabels': True,
    'tickfont': {'size': 9, 'color': 'black'},
}
layout = go.Layout(
    title="Estimation absolute-errors using predict-shared data",
    titlefont={'size': 22},
    xaxis=x_axis,
    yaxis=y_axis,
)

# Bar chart of dfErrors; fig1 is embedded in the HTML report further down.
fig1 = dfErrors.iplot(kind='bar', filename="Absolute_Errors", layout=layout)
In [ ]:
# Normalized-error columns indexed by 'Axes Name'.
# set_index replaces the previous "assign index, then drop(..., 1, inplace=True)":
#   * the positional axis argument to DataFrame.drop is not accepted by pandas >= 2.0;
#   * in-place drop on a column selection mutates a slice and raises SettingWithCopyWarning;
#   * the index now comes from dfFull itself rather than the separate global `df`,
#     which a later cell rebinds.
dfNormErrors = dfFull[
    ['Axes Name', 'norm_error_total_pop', 'norm_error_zebra_pop', 'norm_error_giraffe_pop']
].set_index('Axes Name')
In [ ]:
# Axis descriptions for the normalized-error bar chart, built up front and then
# passed to go.Layout.
x_axis = {
    'title': "Classifier and Attribute Selection method",
    'titlefont': {'size': 15},
    'showticklabels': True,
    'tickangle': 35,
    'tickfont': {'size': 9, 'color': 'black'},
}
y_axis = {
    'title': "Normalized Error",
    'titlefont': {'size': 15},
    'showticklabels': True,
    'tickfont': {'size': 9, 'color': 'black'},
}
layout = go.Layout(
    title="Estimation normalized-errors using predict-shared data",
    titlefont={'size': 22},
    xaxis=x_axis,
    yaxis=y_axis,
)

# Definitions behind the plotted values:
#   Error            = predicted population - actual population
#   Normalized error = Error / actual population
fig2 = dfNormErrors.iplot(kind='bar', filename="Norm_Errors", layout=layout)
In [ ]:
# Keep rows whose absolute total-population error lies in (10, 2750].
# Both conditions are combined into ONE mask: the previous chained form
# dfErrors[m1][m2] applied a mask built on the full frame to the already
# filtered frame, which makes pandas reindex the mask (UserWarning) and can
# fail outright when index labels repeat.
abs_total_err = dfErrors['Error_total_pop'].abs()
dfNoOutliers = dfErrors[(abs_total_err <= 2750) & (abs_total_err > 10)]
In [ ]:
# Axis descriptions for the outlier-free error bar chart, built up front and
# then passed to go.Layout.
x_axis = {
    'title': "Classifier and Attribute Selection method",
    'titlefont': {'size': 15},
    'showticklabels': True,
    'tickangle': 35,
    'tickfont': {'size': 9, 'color': 'black'},
}
y_axis = {
    'title': "Absolute Error",
    'titlefont': {'size': 15},
    'showticklabels': True,
    'tickfont': {'size': 9, 'color': 'black'},
}
layout = go.Layout(
    title="Estimation errors using predict-shared data -no outliers",
    titlefont={'size': 22},
    xaxis=x_axis,
    yaxis=y_axis,
)

# Bar chart of dfNoOutliers; fig3 is embedded in the HTML report further down.
fig3 = dfNoOutliers.iplot(kind='bar', filename="errors_noOutliers", layout=layout)
In [ ]:
# Predicted shared proportion (x) versus normalized error for zebras (y1) and
# giraffes (y2).
# Only the two y-columns are selected, and the x-values are attached as the
# index. This avoids the previous drop(..., 1, inplace=True), whose positional
# axis argument pandas >= 2.0 rejects and which mutated a column selection in
# place (SettingWithCopyWarning).
dfNewPlot = dfFull[['norm_error_zebra_pop', 'norm_error_giraffe_pop']].copy()
# 'Predicted_Shared_proportion' is stored as a percentage; /100 turns it into a fraction.
dfNewPlot.index = dfFull['Predicted_Shared_proportion'] / 100
dfNewPlot.head()
In [ ]:
# Axis descriptions for the shared-proportion versus normalized-error chart,
# built up front and then passed to go.Layout.
x_axis = {
    'title': "Predicted Share Proportion",
    'titlefont': {'size': 15},
    'showticklabels': True,
    'tickangle': 35,
    'tickfont': {'size': 9, 'color': 'black'},
}
y_axis = {
    'title': "Normalized Error",
    'titlefont': {'size': 15},
    'showticklabels': True,
    'tickfont': {'size': 9, 'color': 'black'},
}
layout = go.Layout(
    title="Predicted Shared Proportion versus Norm Error",
    titlefont={'size': 22},
    xaxis=x_axis,
    yaxis=y_axis,
)

# Bar chart of dfNewPlot; fig4 is embedded in the HTML report further down.
fig4 = dfNewPlot.iplot(kind='bar', filename="predictedSharedVsError", layout=layout)
In [ ]:
# Assemble the report: heading, master table, then the four figures in order.
fullFl = HT.HTML(HT.body(
    HT.h2("Population Estimates using predicted shared data - master table"),
    HT.HTML(dfFull.to_html(index=False)),
    HT.HTML(fig1.embed_code),
    HT.HTML(fig2.embed_code),
    HT.HTML(fig3.embed_code),
    HT.HTML(fig4.embed_code),
))

# `with` closes the file even if write() raises; the previous manual
# open()/close() pair leaked the handle in that case.
with open("../FinalResults/PopulationEstimationUsingClf.html", "w") as outputFile:
    outputFile.write(fullFl)
In [ ]:
# Layout for the "images shared (k) versus estimated population" line plots
# drawn by the classifier loop in the next cell, which reads `layout`.
x_axis = {
    'title': "Number of images shared (k)",
    'titlefont': {'size': 15},
    'showticklabels': True,
    'tickangle': 35,
    'tickfont': {'size': 9, 'color': 'black'},
}
y_axis = {
    'title': "Estimated Population",
    'titlefont': {'size': 15},
    'showticklabels': True,
    'tickfont': {'size': 9, 'color': 'black'},
}
layout = go.Layout(
    title="Number of images shared(k) versus estimated population",
    titlefont={'size': 15},
    xaxis=x_axis,
    yaxis=y_axis,
)
In [ ]:
# For every classifier / attribute-selection combination: train and test the
# classifier, compute estimates for fixed k in 59..84 plus one run with a
# random k per call, then write a CSV and an HTML report for the combination.
clfTypes = ['bayesian', 'logistic', 'svm', 'dtree', 'random_forests', 'ada_boost']
attribTypes = ['sparse', 'non_sparse', 'non_zero', 'abv_mean']

# NOTE(review): inExifFl, inGidAidMapFl and inAidFtrFl are not defined anywhere
# in the visible part of this notebook; they must be set before this cell runs,
# otherwise the first kSharesPerContributor call raises NameError.
for clf in clfTypes:
    for attrib in attribTypes:
        print("Starting to classify %s : %s" % (clf, attrib))
        clfObj, predResults = PE.trainTestClf(
            "../FinalResults/ImgShrRnkListWithTags.csv",
            "../data/full_gid_aid_ftr_agg.csv",
            clf,
            attrib,
            "../data/infoGainsExpt2.csv",
        )
        flNm = "../FinalResults/" + clf + "_" + attrib + "_kShares"

        # Map each test-row label to its prediction. predProbabs is still
        # accessed as predProbabs[i], exactly as before; enumerate only removes
        # the list(index) rebuild that used to happen on every iteration.
        prediction_probabs = {
            label: clfObj.predProbabs[i]
            for i, label in enumerate(clfObj.test_x.index)
        }

        # One estimate per fixed k; the callable passed in always returns that k.
        fixedK = {
            k: PE.kSharesPerContributor(
                prediction_probabs, inExifFl, inGidAidMapFl, inAidFtrFl, lambda: k
            )
            for k in range(59, 85)
        }
        df = pd.DataFrame(fixedK).transpose().reset_index()
        df.columns = ['num_images', 'all', 'giraffes', 'zebras']
        # set_index replaces "assign index + drop(..., 1, inplace=True)"; the
        # positional axis argument to drop is rejected by pandas >= 2.0.
        df = df.set_index('num_images')
        df.to_csv(flNm + ".csv")
        # Rendered before the Randomized_* columns are added, so the HTML table
        # contains only the fixed-k columns.
        df_html = df.to_html(index=True)

        # Same computation, but k is drawn from random.randint(60, 90) on each call.
        randomized = PE.kSharesPerContributor(
            prediction_probabs, inExifFl, inGidAidMapFl, inAidFtrFl,
            lambda: random.randint(60, 90),
        )
        df['Randomized_all'] = randomized['all']
        df['Randomized_giraffe'] = randomized['giraffes']
        df['Randomized_zebras'] = randomized['zebras']

        fig1 = df.iplot(kind='line', layout=layout, filename=flNm + ".html")
        # The comma after HT.HTML(df_html) was missing; without it Python parses
        # the two adjacent expressions as a call on the first HT.HTML object.
        fullFl = HT.HTML(HT.body(
            HT.h2("Population Estimates with k shares per contributor using %s and attribute selection method %s" % (clf, attrib)),
            HT.HTML(df_html),
            HT.HTML(fig1.embed_code),
        ))
        # NOTE(review): the report is written to flNm with no ".html" extension,
        # although the plot filename above uses flNm + ".html". The path is left
        # unchanged here -- confirm which one is intended.
        with open(flNm, "w") as outputFile:
            outputFile.write(fullFl)
        print("Classification testing complete %s : %s" % (clf, attrib))
        print()
In [ ]: