In [2]:

    
import cufflinks as cf # this is necessary to link pandas to plotly
cf.go_offline()
import json
import plotly.graph_objs as go
import pandas as pd
import htmltag as HT
import PopulationEstimatorAPI as PE, ClassiferHelperAPI as CH
import importlib
import MarkRecapHelper as MR
importlib.reload(PE)
import random
import DataStructsHelperAPI as DS
from plotly.offline import plot, iplot

Logic for creating the comma-separated aggregate data file

This step does not need to be run every time.



In [ ]:

    
# Annotation fields handed to createMstrFl when building the aggregate file.
attribs = [
    'GID',
    'AID',
    'AGE',
    'EXEMPLAR_FLAG',
    'INDIVIDUAL_NAME',
    'NID',
    'QUALITY',
    'SEX',
    'SPECIES',
    'VIEW_POINT',
    'CONTRIBUTOR',
]

# NOTE(review): `ImageMap` is not imported in any visible cell — confirm which
# module provides it before running this cell on a fresh kernel.
# First call: GID->AID map + AID features -> full_gid_aid_ftr.csv
df = ImageMap.genGidAidFtrDf(
    "../data/full_gid_aid_map.json",
    "../data/full_aid_features.json",
    '../data/full_gid_aid_ftr.csv',
)
# Second call: that CSV + the tagged-data JSON + attribs -> full_gid_aid_ftr_agg.csv
df_comb = ImageMap.createMstrFl(
    "../data/full_gid_aid_ftr.csv",
    "../data/GZC_data_tagged.json",
    attribs,
    "../data/full_gid_aid_ftr_agg.csv",
)

Visuals for accuracies of predictions



In [ ]:

    
# Read the saved population-estimate results (JSON) into resObj.
with open("../FinalResults/PopulationEstimate.json", "r") as jsonFl:
    resObj = json.loads(jsonFl.read())



In [ ]:

    
# Build the master table: one row per classifier / attribute-selection pair,
# holding the estimates, their absolute errors and their normalized errors.
df = pd.DataFrame(resObj)
df['Axes Name'] = df['Classifier'] + " " + df['Attribute']

# .copy() detaches the column subset from its parent frame, so the column
# assignments below write to a real frame rather than a possible temporary
# (this is what pandas' SettingWithCopyWarning is about).
df = df[['Axes Name', 'all','giraffes','zebras','shared_images_count']].copy()

# Absolute error = predicted population - actual population.
# The actual populations used throughout this notebook: 3620 (all),
# 3468 (zebras), 177 (giraffes).
df['Error_total_pop'] = df['all'] - 3620
df['Error_zebra_pop'] = df['zebras'] - 3468
df['Error_giraffe_pop'] = df['giraffes'] - 177

# NOTE(review): 6523 is presumably the total number of images — confirm
# (output recorded later in this notebook reports 6524).
df['Predicted_Shared_proportion'] = df['shared_images_count'] * 100 / 6523

dfFull = df[['Axes Name','all','Error_total_pop','zebras','Error_zebra_pop','giraffes','Error_giraffe_pop','shared_images_count','Predicted_Shared_proportion']].copy()

# Normalized error = absolute error / actual population.
dfFull['norm_error_total_pop'] = dfFull['Error_total_pop'] / 3620
dfFull['norm_error_zebra_pop'] = dfFull['Error_zebra_pop'] / 3468
dfFull['norm_error_giraffe_pop'] = dfFull['Error_giraffe_pop'] / 177
dfFull.head()



In [ ]:

    
# Absolute-error columns only, indexed by the "<classifier> <attribute>" label.
# set_index returns a new frame, which avoids mutating a slice of dfFull in
# place and avoids drop()'s positional axis argument (rejected by pandas >= 2.0).
dfErrors = dfFull[['Axes Name','Error_total_pop','Error_zebra_pop','Error_giraffe_pop']].set_index('Axes Name')



In [ ]:

    
# Bar chart of the absolute estimation errors, one group per classifier/attribute pair.
layout = go.Layout(
    title="Estimation absolute-errors using predict-shared data",
    titlefont={'size': 22},
    xaxis={
        'title': "Classifier and Attribute Selection method",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickangle': 35,
        'tickfont': {'size': 9, 'color': 'black'},
    },
    yaxis={
        'title': "Absolute Error",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickfont': {'size': 9, 'color': 'black'},
    },
)
fig1 = dfErrors.iplot(kind='bar', filename="Absolute_Errors", layout=layout)



In [ ]:

    
# Normalized-error columns only, indexed by the "<classifier> <attribute>" label.
# set_index returns a new frame, which avoids mutating a slice of dfFull in
# place and avoids drop()'s positional axis argument (rejected by pandas >= 2.0).
dfNormErrors = dfFull[['Axes Name','norm_error_total_pop','norm_error_zebra_pop','norm_error_giraffe_pop']].set_index('Axes Name')



In [ ]:

    
# Bar chart of the normalized estimation errors.
#   error            = predicted population - actual population
#   normalized error = error / actual population
layout = go.Layout(
    title="Estimation normalized-errors using predict-shared data",
    titlefont={'size': 22},
    xaxis={
        'title': "Classifier and Attribute Selection method",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickangle': 35,
        'tickfont': {'size': 9, 'color': 'black'},
    },
    yaxis={
        'title': "Normalized Error",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickfont': {'size': 9, 'color': 'black'},
    },
)
fig2 = dfNormErrors.iplot(kind='bar', filename="Norm_Errors", layout=layout)



In [ ]:

    
# Keep rows whose absolute total-population error lies in (10, 2750].
# Both conditions are combined into ONE mask with `&`: chaining two boolean
# selections (df[m1][m2]) makes pandas reindex the second mask against the
# already-filtered frame, which warns and fails on duplicate index labels.
dfNoOutliers = dfErrors[
    (dfErrors['Error_total_pop'].abs() <= 2750)
    & (dfErrors['Error_total_pop'].abs() > 10)
]



In [ ]:

    
# Bar chart of the absolute errors after the outlier rows were filtered out.
layout = go.Layout(
    title="Estimation errors using predict-shared data -no outliers",
    titlefont={'size': 22},
    xaxis={
        'title': "Classifier and Attribute Selection method",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickangle': 35,
        'tickfont': {'size': 9, 'color': 'black'},
    },
    yaxis={
        'title': "Absolute Error",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickfont': {'size': 9, 'color': 'black'},
    },
)
fig3 = dfNoOutliers.iplot(kind='bar', filename="errors_noOutliers", layout=layout)



In [ ]:

    
# Predicted shared proportion (x) versus normalized zebra / giraffe error (y).
# The index is the proportion as a fraction (the stored column is a percentage).
# set_index builds a new frame, so no slice of dfFull is mutated in place and
# drop()'s positional axis argument (rejected by pandas >= 2.0) is not needed.
dfNewPlot = (
    dfFull[['norm_error_zebra_pop', 'norm_error_giraffe_pop']]
    .set_index(dfFull['Predicted_Shared_proportion'] / 100)
)
dfNewPlot.head()



In [ ]:

    
# Bar chart of normalized error against the predicted shared proportion.
layout = go.Layout(
    title="Predicted Shared Proportion versus Norm Error",
    titlefont={'size': 22},
    xaxis={
        'title': "Predicted Share Proportion",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickangle': 35,
        'tickfont': {'size': 9, 'color': 'black'},
    },
    yaxis={
        'title': "Normalized Error",
        'titlefont': {'size': 15},
        'showticklabels': True,
        'tickfont': {'size': 9, 'color': 'black'},
    },
)
fig4 = dfNewPlot.iplot(kind='bar', filename="predictedSharedVsError", layout=layout)



In [ ]:

    
# Assemble the report page: heading, master table, then the four figures.
fullFl = HT.HTML(HT.body(HT.h2("Population Estimates using predicted shared data - master table"),
                HT.HTML(dfFull.to_html(index=False)),
                HT.HTML(fig1.embed_code),
                HT.HTML(fig2.embed_code),
                HT.HTML(fig3.embed_code),
                HT.HTML(fig4.embed_code)
               ))


# The context manager closes the file even if write() raises.
with open("../FinalResults/PopulationEstimationUsingClf.html","w") as outputFile:
    outputFile.write(fullFl)

Synthetic Experiments

Synthetic Experiment #1

Calculate the population estimate

Synthetic Experiment #2

Calculate the population estimate



In [ ]:

    
# For every card, collect the set of 'day' values on which its images appear.
# NOTE(review): sdCards, predResults and dfGidDays are not defined in any
# visible cell — they must exist in the kernel before this cell runs.
appearanceDays = {}
for card in sdCards.keys():
    # Prediction for each GID on this card; GID '3644' is deliberately skipped.
    pred_results = {
        gid: predResults[gid]
        for gid in sdCards[card]
        if gid != '3644'
    }
    # One row per GID with its 'share' value.
    dfPredRes = pd.DataFrame(pred_results, index=['share']).T.reset_index()
    dfPredRes.columns = ['GID', 'share']
    # Join on GID to pick up the day of each image, then keep the distinct days.
    appearanceDays[card] = set(
        dfPredRes.merge(dfGidDays, on='GID').to_dict()['day'].values()
    )



In [ ]:

    
# Display the card -> set-of-days mapping built in the previous cell.
appearanceDays



In [ ]:

    
import PopulationEstimatorAPI as PE
import importlib
importlib.reload(PE)



In [ ]:

    
# Print each item returned by PE.buildErrPlots for the classifiers, followed
# by an HTML axis caption and a blank line.
l = PE.buildErrPlots('clf')
for ifrm in l:
    print(ifrm, "<p>X-axis : k <br>Y axis = Percentage Error</p>", "", sep="\n")



In [ ]:

    
import pandas as pd



In [ ]:

    
def buildErrPlots(clfOrRgr, thresholdMeth=False, randomShare=False):
    """Merge the per-model result CSVs and append percentage-error columns.

    One CSV per (algorithm, attribute-selection) pair is read from
    ``../FinalResults/<alg>_<attrib><suffix>``, its value columns are suffixed
    with ``_<alg>_<attrib>``, and all frames are inner-joined on the shared
    header column.

    Parameters
    ----------
    clfOrRgr : str
        ``'clf'`` selects the classifier algorithms; any other value selects
        the regressor algorithms.
    thresholdMeth : bool, default False
        If True, read the ``*_thresholded.csv`` files (joined on
        ``threshold``). Takes precedence over ``randomShare``.
    randomShare : bool, default False
        If True (and ``thresholdMeth`` is False), read the
        ``*_kSharesRandom.csv`` files; otherwise the ``*_kShares.csv`` files.
        Both are joined on ``num_images``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the header column. For every column whose name contains
        ``all`` / ``zebras`` / ``giraffes`` an extra ``<col>_err`` column holds
        the percentage error ``(predicted - actual) * 100 / actual`` against
        populations of 3620 / 3468 / 177.

    Notes
    -----
    Despite its name, this function has always returned the merged frame: the
    plotting code that used to follow the ``return`` was unreachable and has
    been removed, together with the locals only it used.
    """
    if clfOrRgr == 'clf':
        algTypes = ['bayesian','logistic','svm','dtree','random_forests','ada_boost']
    else:
        algTypes = ['linear','ridge','lasso','svr','dtree_regressor','elastic_net']
    attribTypes = ['sparse','non_sparse','non_zero','abv_mean', 'beauty']

    flNms = [str(alg + "_" + attrib) for alg in algTypes for attrib in attribTypes]

    # File suffix and join column for the requested experiment flavour.
    if thresholdMeth:
        suffix, hdr = "_thresholded.csv", "threshold"
    elif randomShare:
        suffix, hdr = "_kSharesRandom.csv", "num_images"
    else:
        suffix, hdr = "_kShares.csv", "num_images"

    def _loadResults(flNm):
        # read_csv(index_col=0, parse_dates=True) is the documented replacement
        # for DataFrame.from_csv, which was removed in pandas 1.0.
        frame = pd.read_csv(str("../FinalResults/" + flNm + suffix),
                            index_col=0, parse_dates=True).reset_index()
        # Tag every value column with its model so columns stay unique after merging.
        frame.columns = [col if col == hdr else str(col + "_" + flNm)
                         for col in frame.columns]
        return frame

    df = _loadResults(flNms[0])
    for flNm in flNms[1:]:
        df = pd.DataFrame.merge(df, _loadResults(flNm), on=hdr)

    df = df.set_index(hdr)

    # calculate errors in estimation
    # % error = (predicted - actual) * 100 / actual
    # (marker in column name, actual population, actual population / 100)
    groundTruth = (('all', 3620, 36.20),
                   ('zebras', 3468, 34.68),
                   ('giraffes', 177, 1.77))
    # Iterate over a snapshot: new *_err columns are appended inside the loop.
    for col in list(df.columns):
        for marker, actual, onePercent in groundTruth:
            if marker in col:
                df[str(col + '_err')] = (df[col] - actual) / onePercent
                break

    return df



In [ ]:

    
# Uses the notebook-local buildErrPlots defined above (not PE.buildErrPlots),
# which returns the merged results frame for the random-share classifier runs.
df = buildErrPlots('clf', randomShare=True)



In [ ]:

    
# Dump the merged frame to a scratch CSV.
# NOTE(review): hard-coded absolute /tmp path — not portable across machines.
df.to_csv("/tmp/test.csv")



In [ ]:



In [ ]:

    
# Export only the columns whose name mentions both 'zebra' and 'beauty'.
cols = [name for name in df.columns if 'zebra' in name and 'beauty' in name]
df[cols].to_csv("/tmp/zebras_bty_rgr.csv")



In [ ]:

    
import PopulationEstimatorAPI as PE
import importlib
importlib.reload(PE)



In [ ]:

    
# Build the thresholded-regressor error plots via the PE module; the result is
# iterated and printed in the next cell.
l = PE.buildErrPlots('rgr', thresholdMeth=True)



In [ ]:

    
# Print each returned item followed by an HTML axis caption and a blank line.
for i in l:
    print(i, "<p>X-axis : k <br>Y axis = Percentage Error</p>", "", sep="\n")



In [4]:

    
# Input files for training / testing the regressor.
train_fl = "../data/BeautyFtrVector_GZC_Expt2.csv"
test_fl = "../data/GZC_exifs_beauty_full.csv"

# Input files consumed by the population-estimation calls in later cells.
inExifFl = "../data/imgs_exif_data_full.json"
inGidAidMapFl = "../data/full_gid_aid_map.json"
inAidFtrFl = "../data/full_aid_features.json"

# Regression method and attribute-selection method for this run.
meth = 'linear'
attrib = 'beauty'

# Every regressor gets the same keyword arguments (a separate dict per method).
regrArgs = {
    name: {'fit_intercept' : True}
    for name in ('linear', 'ridge', 'lasso', 'elastic_net', 'svr', 'dtree_regressor')
}

# Train on train_fl and predict on test_fl; keeps both the fitted object and
# the predictions.
methObj, predResults = CH.trainTestRgrs(
    train_fl,
    test_fl,
    meth,
    attrib,
    infoGainFl="../data/infoGainsExpt2.csv",
    methArgs=regrArgs,
)









    



Number of outliers identified: 0
6524 6524



In [22]:

    
# Population estimate from the predictions above. The last argument is a
# zero-argument callable that always returns 2.
# NOTE(review): presumably this is the number of shares (k) per contributor — confirm against PE.
PE.kSharesPerContribAfterCoinFlip(predResults, inExifFl, inGidAidMapFl, inAidFtrFl, lambda : 2)









    



Number of images used: 4480
Number of images used: 4480
Number of images used: 4480






    



/Users/sreejithmenon/Google Drive/CodeBase/AWESOME/script/MarkRecapHelper.py:92: UserWarning:

There are no recaptures for this case.







    Out[22]:





{'all': 1320.0, 'giraffes': None, 'zebras': 817.0}



In [19]:

    
# Hand-entered sample estimates (all, giraffes, zebras), one tuple per row;
# None marks a missing estimate.
res = [
    dict(zip(('all', 'giraffes', 'zebras'), estimates))
    for estimates in (
        (1320.0, None, 817.0),
        (2000.0, 120, 817.0),
        (2220.0, None, None),
        (3220.0, 180, 2000),
        (3220.0, 180, 2500),
    )
]

df1 = pd.DataFrame(data=res)



In [ ]:

    
# Interactive line plot of the sample estimates in df1 (iplot comes from cufflinks).
df1.iplot(kind='line')



In [5]:

    
# Run the regressor-based synthetic experiments with beauty features, without
# thresholding or random shares.
# NOTE(review): range(2,30) is presumably the sequence of per-contributor share counts (k) — confirm against PE.
df = PE.runSyntheticExptsRgr(inExifFl, inGidAidMapFl, inAidFtrFl, range(2,30), thresholdMeth=False, randomShare=False, beautyFtrs = True)









    



Starting to run linear on test data
Attribute Selection Method : beauty
Number of outliers identified: 0
6524 6524
Starting population estimation experiments
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480






    



/Users/sreejithmenon/Google Drive/CodeBase/AWESOME/script/MarkRecapHelper.py:92: UserWarning:

There are no recaptures for this case.







    



Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480
Population estimation experiments complete
Number of images used: 4480
Number of images used: 4480
Number of images used: 4480



In [6]:

    
# Constant ground-truth columns, used as reference lines in the plots below.
df['gnd_truth_zebra'] = 3468
# The misspelled key is kept on purpose: a later cell drops the column by this exact name.
df['gnd_truth_girrafe'] = 177
# 3620 matches the total population used for every error calculation earlier
# in this notebook (this cell previously used 3628).
df['gnd_truth_all'] = 3620



In [25]:

    
# Static matplotlib line plot of the estimates together with the ground-truth columns.
df.plot(kind='line')









    Out[25]:





<matplotlib.axes._subplots.AxesSubplot at 0x116e69e80>



In [12]:

    
import matplotlib.pyplot as plt



In [ ]:

    
# Render any pending matplotlib figure.
plt.show()



In [23]:

    
# Keep only the zebra estimate and its ground truth.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
df.drop(columns=['all', 'giraffes', 'gnd_truth_girrafe', 'gnd_truth_all'], inplace=True)



In [24]:

    
# Preview the first rows of the reduced frame.
df.head()









    Out[24]:






  
    
      
      zebras
      gnd_truth_zebra
    
    
      num_images
      
      
    
  
  
    
      2
      817.0
      3468
    
    
      3
      1250.0
      3468
    
    
      4
      2400.0
      3468
    
    
      5
      3094.0
      3468
    
    
      6
      4494.0
      3468



In [ ]:

	zebras	gnd_truth_zebra
num_images
2	817.0	3468
3	1250.0	3468
4	2400.0	3468
5	3094.0	3468
6	4494.0	3468

Logic for creating the comma-separated aggregate data file

Visuals for accuracies of predictions

Synthetic Experiments

Synthetic Experiment #1

Assign a score to each image (here probability) and select the top 'k' images for each contributor and share them

Calculate the population estimate

Synthetic Experiment #2

Assign a score to each image (here probability) and select the top 'x' images for each contributor where x is a random number and share them

Calculate the population estimate