Notebook Name: PopulationEstimatorClfNB

Author Name: Sreejith Menon (smenon8@uic.edu)


In [1]:
import JobsMapResultsFilesToContainerObjs as ImageMap
import pandas as pd
import PopulationEstimatorFromClf as PE
import importlib
importlib.reload(PE)
import cufflinks as cf # this is necessary to link pandas to plotly
cf.go_offline()
import json
import plotly.graph_objs as go
import htmltag as HT
import MarkRecapHelper as MR
importlib.reload(MR)
import DeriveFinalResultSet as DRS
import DataStructsHelperAPI as DS
importlib.reload(DS)
import random


Logic for creating the comma-separated aggregate data file

Not needed to run every time


In [ ]:
# Attribute names handed to createMstrFl when building the aggregate file.
attribs = [
    'GID', 'AID', 'AGE', 'EXEMPLAR_FLAG', 'INDIVIDUAL_NAME', 'NID',
    'QUALITY', 'SEX', 'SPECIES', 'VIEW_POINT', 'CONTRIBUTOR',
]

# Step 1: combine the GID->AID map and the per-AID features.
# The third argument is the CSV path that step 2 reads back
# (presumably written by this call -- confirm in JobsMapResultsFilesToContainerObjs).
df = ImageMap.genGidAidFtrDf(
    "../data/full_gid_aid_map.json",
    "../data/full_aid_features.json",
    '../data/full_gid_aid_ftr.csv',
)

# Step 2: join that CSV with the tagged data for the attributes listed above;
# the last argument is the path of the aggregate CSV used by later cells.
df_comb = ImageMap.createMstrFl(
    "../data/full_gid_aid_ftr.csv",
    "../data/GZC_data_tagged.json",
    attribs,
    "../data/full_gid_aid_ftr_agg.csv",
)

Visuals for accuracies of predictions


In [ ]:
# Parse PopulationEstimate.json; the parsed object feeds the master table below.
resultsPath = "../FinalResults/PopulationEstimate.json"
with open(resultsPath, "r") as jsonFl:
    resObj = json.load(jsonFl)

In [ ]:
# Reference ("actual") populations that the predictions are compared against.
ACTUAL_TOTAL_POP = 3620
ACTUAL_ZEBRA_POP = 3468
ACTUAL_GIRAFFE_POP = 177
# Denominator used to turn shared_images_count into a percentage.
# NOTE(review): presumably the total number of images in the data set -- confirm.
SHARED_PROPORTION_BASE = 6523

df = pd.DataFrame(resObj)
# One label per (classifier, attribute-selection method) pair; used as the x axis.
df['Axes Name'] = df['Classifier'] + " " + df['Attribute']

# .copy() makes this an independent frame, so the column assignments below
# do not raise SettingWithCopyWarning or write into a temporary slice.
df = df[['Axes Name', 'all', 'giraffes', 'zebras', 'shared_images_count']].copy()

# Error = predicted population - actual population.
df['Error_total_pop'] = df['all'] - ACTUAL_TOTAL_POP
df['Error_zebra_pop'] = df['zebras'] - ACTUAL_ZEBRA_POP
df['Error_giraffe_pop'] = df['giraffes'] - ACTUAL_GIRAFFE_POP
df['Predicted_Shared_proportion'] = df['shared_images_count'] * 100 / SHARED_PROPORTION_BASE

# Master table: each estimate sits next to its error, followed by the share stats.
dfFull = df[['Axes Name',
             'all', 'Error_total_pop',
             'zebras', 'Error_zebra_pop',
             'giraffes', 'Error_giraffe_pop',
             'shared_images_count', 'Predicted_Shared_proportion']].copy()

# Normalized error = error / actual population.
dfFull['norm_error_total_pop'] = dfFull['Error_total_pop'] / ACTUAL_TOTAL_POP
dfFull['norm_error_zebra_pop'] = dfFull['Error_zebra_pop'] / ACTUAL_ZEBRA_POP
dfFull['norm_error_giraffe_pop'] = dfFull['Error_giraffe_pop'] / ACTUAL_GIRAFFE_POP
dfFull.head()

In [ ]:
# Absolute errors indexed by 'Axes Name', ready for a grouped bar chart.
# set_index() returns a new frame and removes 'Axes Name' from the columns, so:
#   * no positional `axis` argument to drop() (a TypeError on pandas >= 2.0),
#   * no in-place mutation of a slice of dfFull,
#   * no dependency on the notebook-wide `df`, which a later cell rebinds.
dfErrors = dfFull.set_index('Axes Name')[
    ['Error_total_pop', 'Error_zebra_pop', 'Error_giraffe_pop']
]

In [ ]:
# Bar chart of the absolute estimation errors held in dfErrors.
layout = go.Layout(
    title="Estimation absolute-errors using predict-shared data",
    titlefont={'size': 22},
    xaxis={
        'title': "Classifier and Attribute Selection method",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickangle': 35,  # tilt the tick labels
        'tickfont': {'size': 9, 'color': 'black'},
    },
    yaxis={
        'title': "Absolute Error",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickfont': {'size': 9, 'color': 'black'},
    },
)
fig1 = dfErrors.iplot(kind='bar', filename="Absolute_Errors", layout=layout)

In [ ]:
# Normalized errors indexed by 'Axes Name', ready for a grouped bar chart.
# set_index() returns a new frame and removes 'Axes Name' from the columns, so:
#   * no positional `axis` argument to drop() (a TypeError on pandas >= 2.0),
#   * no in-place mutation of a slice of dfFull,
#   * no dependency on the notebook-wide `df`, which a later cell rebinds.
dfNormErrors = dfFull.set_index('Axes Name')[
    ['norm_error_total_pop', 'norm_error_zebra_pop', 'norm_error_giraffe_pop']
]

In [ ]:
# Bar chart of the normalized estimation errors held in dfNormErrors.
layout = go.Layout(
    title="Estimation normalized-errors using predict-shared data",
    titlefont={'size': 22},
    xaxis={
        'title': "Classifier and Attribute Selection method",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickangle': 35,  # tilt the tick labels
        'tickfont': {'size': 9, 'color': 'black'},
    },
    yaxis={
        'title': "Normalized Error",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickfont': {'size': 9, 'color': 'black'},
    },
)
# Definitions used for the plotted values:
#   Error            = predicted population - actual population
#   Normalized error = Error / actual population
fig2 = dfNormErrors.iplot(kind='bar', filename="Norm_Errors", layout=layout)

In [ ]:
# Keep only rows whose absolute total-population error lies in (10, 2750].
# Both bounds go into ONE boolean mask: chaining df[mask1][mask2] applies a
# full-length mask to an already filtered frame, which makes pandas warn that
# the key "will be reindexed" and fails outright when index labels repeat.
absTotalError = abs(dfErrors['Error_total_pop'])
dfNoOutliers = dfErrors[(absTotalError > 10) & (absTotalError <= 2750)]

In [ ]:
# Same absolute-error bar chart as before, restricted to the rows kept in dfNoOutliers.
layout = go.Layout(
    title="Estimation errors using predict-shared data -no outliers",
    titlefont={'size': 22},
    xaxis={
        'title': "Classifier and Attribute Selection method",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickangle': 35,  # tilt the tick labels
        'tickfont': {'size': 9, 'color': 'black'},
    },
    yaxis={
        'title': "Absolute Error",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickfont': {'size': 9, 'color': 'black'},
    },
)
fig3 = dfNoOutliers.iplot(kind='bar', filename="errors_noOutliers", layout=layout)

In [ ]:
# Plot data: predicted shared proportion (x) against the normalized zebra and
# giraffe errors (y1, y2).
# Taking only the two value columns with .copy() and assigning the index
# directly avoids drop() with a positional axis (a TypeError on pandas >= 2.0)
# and in-place mutation of a slice of dfFull.
dfNewPlot = dfFull[['norm_error_zebra_pop', 'norm_error_giraffe_pop']].copy()
# The column stores a percentage; dividing by 100 turns it into a fraction.
dfNewPlot.index = dfFull['Predicted_Shared_proportion'] / 100
dfNewPlot.head()

In [ ]:
# Bar chart of normalized error against predicted shared proportion (dfNewPlot).
layout = go.Layout(
    title="Predicted Shared Proportion versus Norm Error",
    titlefont={'size': 22},
    xaxis={
        'title': "Predicted Share Proportion",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickangle': 35,  # tilt the tick labels
        'tickfont': {'size': 9, 'color': 'black'},
    },
    yaxis={
        'title': "Normalized Error",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickfont': {'size': 9, 'color': 'black'},
    },
)
fig4 = dfNewPlot.iplot(kind='bar', filename="predictedSharedVsError", layout=layout)

In [ ]:
# Assemble the report page: a heading, the master table, then the four charts.
# NOTE(review): each figN.embed_code access assumes iplot() returned an object
# exposing that attribute -- confirm this still holds with cf.go_offline() enabled.
fullFl = HT.HTML(HT.body(
    HT.h2("Population Estimates using predicted shared data - master table"),
    HT.HTML(dfFull.to_html(index=False)),
    HT.HTML(fig1.embed_code),
    HT.HTML(fig2.embed_code),
    HT.HTML(fig3.embed_code),
    HT.HTML(fig4.embed_code),
))

# The context manager closes the handle even if write() raises.
with open("../FinalResults/PopulationEstimationUsingClf.html", "w") as outputFile:
    outputFile.write(fullFl)

Synthetic Experiments

Synthetic Experiment #1

Assign a score to each image (here, the predicted probability), then select and share the top 'k' images for each contributor

Calculate the population estimate

Synthetic Experiment #2

Assign a score to each image (here, the predicted probability), then select and share the top 'x' images for each contributor, where x is a random number

Calculate the population estimate


In [ ]:
# Layout for the line charts of estimated population against the number of
# images shared per contributor (k).
layout = go.Layout(
    title="Number of images shared(k) versus estimated population",
    titlefont={'size': 15},
    xaxis={
        'title': "Number of images shared (k)",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickangle': 35,  # tilt the tick labels
        'tickfont': {'size': 9, 'color': 'black'},
    },
    yaxis={
        'title': "Estimated Population",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickfont': {'size': 9, 'color': 'black'},
    },
)

In [ ]:
# Synthetic experiments: for every classifier / attribute-selection pair, train
# and test the classifier, then estimate the population when each contributor
# shares (a) a fixed k images, for k = 59..84, and (b) a random number of
# images drawn from 60..90.
clfTypes = ['bayesian','logistic','svm','dtree','random_forests','ada_boost']
attribTypes = ['sparse','non_sparse','non_zero','abv_mean']

# NOTE(review): inExifFl, inGidAidMapFl and inAidFtrFl are not assigned in any
# visible cell of this notebook; they must be defined before this cell is run.
# NOTE(review): random.randint is unseeded, so the "Randomized_*" columns
# differ from run to run.

for clf in clfTypes:
    for attrib in attribTypes:
        print("Starting to classify %s : %s" %(clf,attrib))
        clfObj,predResults = PE.trainTestClf("../FinalResults/ImgShrRnkListWithTags.csv",
                             "../data/full_gid_aid_ftr_agg.csv",
                             clf,
                             attrib,
                             "../data/infoGainsExpt2.csv")

        flNm = "../FinalResults/" + clf + "_" + attrib + "_kShares"

        # Map each test-row label to its predicted probability. The label list
        # is built once; the original rebuilt it for every element (quadratic).
        testLabels = list(clfObj.test_x.index)
        prediction_probabs = {label : clfObj.predProbabs[i] for i, label in enumerate(testLabels)}

        # `k=k` binds the current k eagerly, so the callable keeps returning
        # this k even if the helper were to invoke it later.
        fixedK = {k : PE.kSharesPerContributor(prediction_probabs,inExifFl,inGidAidMapFl,inAidFtrFl,lambda k=k : k)
                  for k in range(59,85)}

        # Rows = k, columns = the three estimates, selected by name rather than
        # relabelled by position. The index name becomes the first CSV header.
        # A dedicated frame name keeps this loop from rebinding the
        # notebook-wide `df` used by earlier cells.
        kSharesDf = pd.DataFrame(fixedK).transpose()[['all','giraffes','zebras']].copy()
        kSharesDf.index.name = 'num_images'
        kSharesDf.to_csv(flNm + ".csv")
        # The HTML table is rendered before the randomized columns are added,
        # so it contains only the fixed-k estimates.
        df_html = kSharesDf.to_html(index=True)

        randomized = PE.kSharesPerContributor(prediction_probabs,inExifFl,inGidAidMapFl,inAidFtrFl,lambda : random.randint(60,90))
        # Assigned from the single randomized result to every row.
        kSharesDf['Randomized_all'] = randomized['all']
        kSharesDf['Randomized_giraffe'] = randomized['giraffes']
        kSharesDf['Randomized_zebras'] = randomized['zebras']

        # Named figKShares so the earlier report's fig1 is not overwritten.
        figKShares = kSharesDf.iplot(kind='line',layout=layout,filename=flNm + ".html")
        # The original was missing the comma between the two HT.HTML(...)
        # arguments, which is a SyntaxError.
        fullFl = HT.HTML(HT.body(
            HT.h2("Population Estimates with k shares per contributor using %s and attribute selection method %s" %(clf,attrib)),
            HT.HTML(df_html),
            HT.HTML(figKShares.embed_code),
        ))

        # NOTE(review): the page is written to flNm without an extension
        # (original behaviour kept); confirm whether flNm + ".html" was intended.
        with open(flNm,"w") as outputFile:
            outputFile.write(fullFl)
        print("Classification testing complete %s : %s" %(clf,attrib))
        print()

In [ ]: