In [ ]:


In [1]:
import json, pandas as pd,re
import DeriveFinalResultSet as DRS
import cufflinks as cf # this is necessary to link pandas to plotly
cf.go_offline()
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.plotly as py
import ClassiferHelperAPI as CH, PopulationEstimatorAPI as PE
import importlib, re
importlib.reload(CH)
importlib.reload(DRS)
importlib.reload(PE)
import MarkRecapHelper as MR
importlib.reload(MR)
import JobsMapResultsFilesToContainerObjs as ImageMap
importlib.reload(ImageMap)
import plotly.graph_objs as go
import random



In [2]:
# Input files per dataset, in the order (EXIF json, gid->aid map json, aid features json).
# The original cell assigned the GGR paths and then immediately overwrote them on the
# next line, so only the second set was ever in effect. The selection is now explicit
# and the resulting values of inExifFl / inGidAidMapFl / inAidFtrFl are unchanged.
# NOTE(review): labelling the second set "GZC" is inferred from full_gid_aid_map.json and
# full_aid_features.json being paired with GZC_exifs_beauty_full.json in a later cell -- confirm.
DATASET_INPUTS = {
    "GGR": ("../data/ggr_gid_uuid_exif_ftr_map.json",
            "../data/ggr_uuid_annot_uuid_map.json",
            "../data/ggr_annot_uuid_ftr_map.json"),
    "GZC": ("../data/imgs_exif_data_full.json",
            "../data/full_gid_aid_map.json",
            "../data/full_aid_features.json"),
}
DATASET = "GZC"  # change to "GGR" to point the notebook at the GGR files
inExifFl, inGidAidMapFl, inAidFtrFl = DATASET_INPUTS[DATASET]

In [88]:
# Feature columns requested for the GZC feature vector.
gzcFeatureCols = [
    'arousal', 'contrast', 'dominance', 'pleasure', 'symmetry',
    'height', 'width',
    'hsv_itten_std_h', 'hsv_itten_std_s', 'hsv_itten_std_v',
    'lat', 'long',
    'orientation',
    'day', 'hour',
    'has_zebra',
]
# Positional arguments in the order CH.construct_feature_vctr_fl receives them.
# NOTE(review): whether the CSV path is read or written is not visible here -- check CH.
gzcVctrArgs = (
    "../data/GZC_exifs_beauty_full.json",
    None,
    "../data/full_gid_aid_map.json",
    "../data/full_aid_features.json",
    "../feature_files/GZC_FEATURE_VECTOR.csv",
    gzcFeatureCols,
)
df = CH.construct_feature_vctr_fl(*gzcVctrArgs)

In [78]:
# Feature columns requested for the GGR feature vector.
ggrFeatureCols = [
    'arousal', 'contrast', 'dominance', 'pleasure', 'symmetry',
    'height', 'width',
    'hsv_itten_std_h', 'hsv_itten_std_s', 'hsv_itten_std_v',
    'lat', 'long',
    'orientation',
    'day', 'hour',
    'has_zebra',
]
# Positional arguments in the order CH.construct_feature_vctr_fl receives them.
# NOTE(review): this cell rebinds the same `df` name as the GZC cell, so `df` holds
# whichever of the two cells ran last.
ggrVctrArgs = (
    "../data/GGR_bty_exif_combined.json",
    None,
    "../data/ggr_uuid_annot_uuid_map.json",
    "../data/ggr_annot_uuid_ftr_map.json",
    "../feature_files/GGR_FEATURE_VECTOR.csv",
    ggrFeatureCols,
)
df = CH.construct_feature_vctr_fl(*ggrVctrArgs)

In [5]:
# Synthetic population-estimation run with the 'elastic_net' algorithm and the 'beauty'
# attribute set, for num_images = 20..30 (see the table in the recorded output).
# The leading False is presumably an "is classifier" switch (it is True in the logistic
# runs) -- confirm against PE.runSyntheticExpts.
# NOTE: regrArgs is defined in a cell that appears further down the notebook (In [4]);
# that cell has to be executed before this one.
PE.runSyntheticExpts(False, ['elastic_net'], ['beauty'], range(20,31), regrArgs)


Starting to run elastic_net on test data
Attribute Selection Method : beauty
Number of outliers identified: 0
6524 6524
Starting population estimation experiments
Population estimation experiments complete
Out[5]:
all giraffes zebras zebra_confidence Randomized_giraffe Randomized_all Randomized_zebras
num_images
20 2340.000000 80.00 4737.871541 2571.400000 126.0 3160.8 3443.181818
21 2491.071429 80.00 5098.874901 2763.200000 126.0 3160.8 3443.181818
22 2591.714286 81.25 5324.877615 2883.600000 126.0 3160.8 3443.181818
23 2750.642857 81.25 5702.978289 3082.000000 126.0 3160.8 3443.181818
24 2899.285714 97.50 5992.607532 3236.400000 126.0 3160.8 3443.181818
25 3134.142857 106.50 6535.942003 3521.100000 126.0 3160.8 3443.181818
26 3160.800000 126.00 6370.545113 3443.181818 126.0 3160.8 3443.181818
27 3135.000000 126.00 6198.694043 3364.666667 126.0 3160.8 3443.181818
28 3166.117647 144.00 6108.666150 3323.076923 126.0 3160.8 3443.181818
29 3186.333333 144.00 6071.118523 3309.285714 126.0 3160.8 3443.181818
30 3421.944444 166.50 6529.317761 3548.428571 126.0 3160.8 3443.181818

In [8]:
# Same experiment driver with the 'logistic' algorithm and the 'beauty' attribute set,
# over range(5, 60); the returned table is kept in df_logistic_gzc for later cells.
# NOTE: clfArgs is defined in a cell that appears further down the notebook (In [4]);
# that cell has to be executed before this one.
df_logistic_gzc = PE.runSyntheticExpts(True, ['logistic'], ['beauty'], range(5,60), clfArgs)


Starting to run logistic on test data
Attribute Selection Method : beauty
Starting population estimation experiments
280
/Users/sreejithmenon/Google Drive/CodeBase/AWESOME/script/MarkRecapHelper.py:94: UserWarning:

There are no recaptures for this case.

0
336
0
392
0
448
0
503
0
557
610
661
712
762
811
860
909
958
1006
1053
1099
1145
1191
1237
1283
1328
1372
1415
1457
1499
1541
1583
1625
1667
1708
1749
1790
1830
1870
1910
1949
1988
2027
2066
2105
2144
2183
2222
2261
2299
2337
2373
2409
2445
2480
2515
2550
2585
2619
Population estimation experiments complete
1644

In [221]:
# Single-point run (range(25, 26), i.e. only 25) of the logistic / 'beauty' experiment.
# The recorded output for this run contains only NaN / 0.0 / None values and a
# "no recaptures" warning, so it looks like a debugging run rather than a result.
# NOTE: clfArgs is defined in a cell that appears further down the notebook (In [4]).
PE.runSyntheticExpts(True, ['logistic'], ['beauty'], range(25,26), clfArgs)


Starting to run logistic on test data
Attribute Selection Method : beauty
Starting population estimation experiments
mode is GZC
1283
mode is GZC
0
/Users/sreejithmenon/Google Drive/CodeBase/AWESOME/script/MarkRecapHelper.py:94: UserWarning:

There are no recaptures for this case.

mode is GZC
mode is GZC
0
mode is GZC
mode is GZC
0
Population estimation experiments complete
mode is GZC
1283
mode is GZC
0
mode is GZC
mode is GZC
0
mode is GZC
mode is GZC
0
Out[221]:
all giraffes zebras zebra_confidence Randomized_giraffe Randomized_all Randomized_zebras
num_images
25 NaN NaN 0.0 NaN None None None

In [4]:
# Per-algorithm argument dictionaries handed to PE.runSyntheticExpts, keyed by the
# algorithm name used in those calls.
# NOTE(review): presumably forwarded as keyword arguments to the underlying estimators;
# if so, confirm that every regressor listed below accepts 'fit_intercept'.
clfArgs = dict(
    dummy=dict(strategy='most_frequent'),
    bayesian=dict(fit_prior=True),
    logistic=dict(penalty='l2'),
    svm=dict(kernel='rbf', probability=True),
    dtree=dict(criterion='entropy'),
    random_forests=dict(n_estimators=10),
    ada_boost=dict(n_estimators=50),
)

# Every regressor gets its own (distinct) {'fit_intercept': True} dictionary.
regrArgs = {
    rgrName: {'fit_intercept': True}
    for rgrName in ('linear', 'ridge', 'lasso', 'elastic_net', 'svr', 'dtree_regressor')
}

In [131]:
def contrib_extractor(x):
    """Return the '<digits>,<capital letter>' token that follows 'GGR,' in string x.

    Only the first match is returned; an IndexError is raised when x has no match.
    """
    return re.findall(r'GGR,([0-9]+,[A-Z]),.*', x)[0]


# Parse the EXIF json selected by inExifFl (set in an earlier cell).
with open(inExifFl, "r") as exif_fl:
    exif_obj = json.load(exif_fl)

# Map every key of the EXIF object to a one-element list holding the token
# extracted from that record's 'contributor' field.
gidContribDct = {
    gid_uuid: [contrib_extractor(exif_rec['contributor'])]
    for gid_uuid, exif_rec in exif_obj.items()
}

In [132]:
# Spot check of the extractor on one hard-coded image key; the recorded output is '7,A'.
contrib_extractor(exif_obj['a7c6379f-ab06-b409-9836-3f696c73e86e']['contributor'])


Out[132]:
'7,A'

In [228]:
# Stop when the mode configured in MarkRecapHelper (MR.MODE) differs from the notebook's MODE.
# A plain comparison replaces the original try/assert/except: assert statements are
# removed when Python runs with -O, which would silently disable the check, and the
# caught exception object was never used.
# NOTE: MODE must already exist in the notebook namespace; it is assigned in a separate cell.
import sys  # imported here so this cell does not rely on another cell's import

if MR.MODE != MODE:
    print("Mode mismatch")
    sys.exit(-2)


Mode mismatch
An exception has occurred, use %tb to see the full traceback.

SystemExit: -2
To exit: use 'exit', 'quit', or Ctrl-D.

In [227]:
# Notebook-side mode label that the mode-check cell compares with MR.MODE.
# 'blah' is a placeholder: with this value the recorded run of the check cell printed
# "Mode mismatch" and exited with -2.
MODE = 'blah'
import sys  # used by the sys.exit call in the mode-check cell

In [208]:
# NOTE(review): leftover scratch cell. Neither `a` nor `lasdas` is defined in any visible
# cell, so this raises NameError on a fresh kernel; it has no recorded output and
# nothing else in the notebook refers to it. Candidate for deletion.
if len(a[0]):
    lasdas

In [26]:
# Add a percent-error column for the zebra estimate:
# (x - 2673) / 26.73 == (x - 2673) * 100 / 2673, so 2673 is the reference count
# (presumably the ground-truth zebra population -- confirm).
# NOTE: mutates df_logistic_gzc in place (the frame returned by the logistic run above).
df_logistic_gzc['zebra_error'] = (df_logistic_gzc['zebras'] - 2673) / 26.73

In [65]:
def buildErrPlots(clfOrRgr, thresholdMeth=False, randomShare=False, resultsDir="../FinalResults/"):
    """Load population-estimate CSVs and return their percent errors as a DataFrame.

    One CSV per algorithm is read from ``resultsDir`` (file name
    ``<algorithm>_beauty<suffix>``), the files are inner-joined on the key column,
    and every estimate column is converted to a percent error.

    Parameters
    ----------
    clfOrRgr : str
        'clf' selects the 'logistic' and 'ada_boost' result files; any other value
        selects 'linear' and 'elastic_net'.
    thresholdMeth : bool
        When True, read the ``_thresholded.csv`` files, keyed by 'threshold'.
        ``randomShare`` is ignored in that case.
    randomShare : bool
        When ``thresholdMeth`` is False, choose between ``_kSharesRandom.csv`` (True)
        and ``_kSharesGGR.csv`` (False); both are keyed by 'num_images'.
    resultsDir : str
        Directory holding the result files. Defaults to the previously hard-coded
        "../FinalResults/".

    Returns
    -------
    pandas.DataFrame
        Indexed by the key column, containing only the columns whose name contains
        'err'. Entries with an absolute error of 100 or more are replaced by NaN.

    Notes
    -----
    * The file names and the chosen suffix are printed, as before.
    * The original body had plotting code after an unconditional ``return df``; it
      could never execute and has been removed, along with the title strings that
      only that code used.
    * NOTE(review): the 'all' error subtracts 2673 but divides by 36.20, while the
      stated formula is (predicted - actual) * 100 / actual. The constants are kept
      as they were because the intended "actual" value cannot be determined here.
    """
    import os  # local import keeps this cell self-contained

    if clfOrRgr == 'clf':
        algTypes = ['logistic', 'ada_boost']
    else:
        algTypes = ['linear', 'elastic_net']
    attribTypes = ['beauty']
    flNms = [str(alg + "_" + attrib) for alg in algTypes for attrib in attribTypes]
    print(flNms)

    if thresholdMeth:
        suffix, hdr = "_thresholded.csv", "threshold"
    else:
        hdr = "num_images"
        suffix = "_kSharesRandom.csv" if randomShare else "_kSharesGGR.csv"
    print(suffix)

    def _loadEstimates(flNm):
        # read_csv replaces DataFrame.from_csv, which no longer exists in pandas >= 1.0.
        # The first column is a numeric key, so no date parsing is requested.
        est = pd.read_csv(os.path.join(resultsDir, flNm + suffix), index_col=0).reset_index()
        # Tag every column except the join key with the file it came from.
        est.columns = [c if c == hdr else str(c + "_" + flNm) for c in est.columns]
        return est

    df = _loadEstimates(flNms[0])
    for flNm in flNms[1:]:
        df = pd.DataFrame.merge(df, _loadEstimates(flNm), on=hdr)
    df = df.set_index(hdr)

    # calculate errors in estimation
    # % error = (predicted - actual) * 100 / actual
    # Each entry is (substring to look for, value subtracted, divisor); the first
    # matching entry wins, mirroring the original if/elif order.
    errSpecs = (
        ('all', 2673, 36.20),
        ('zebras', 2673, 26.73),
        ('giraffes', 123, 1.23),
    )
    for col in list(df.columns):  # snapshot: columns are added inside the loop
        for key, subtracted, divisor in errSpecs:
            if key in col:
                df[str(col + '_err')] = (df[col] - subtracted) / divisor
                break

    errorCols = [col for col in df.columns if 'err' in col]
    errDf = df[errorCols]
    # mask() returns a new frame, avoiding the chained assignment
    # df[col][cond] = None, which pandas does not guarantee to write through.
    return errDf.mask(errDf.abs() >= 100)

In [64]:
# Error tables used by the plotting cells; reset_index() turns the key
# ('num_images' or 'threshold') back into a regular column.
# NOTE(review): df_top is built from the classifier results ('clf'), whereas df_random
# and df_t come from the regressor results ('rgr').
# NOTE(review): the recorded output shows the suffix "_bottom_kShares.csv", which the
# current buildErrPlots never produces, so that output was captured with a different
# version of the function.
df_top = buildErrPlots('clf').reset_index()
df_random = buildErrPlots('rgr', randomShare=True).reset_index()

df_t = buildErrPlots('rgr',thresholdMeth=True).reset_index()


['logistic_beauty', 'ada_boost_beauty']
_bottom_kShares.csv
['linear_beauty', 'elastic_net_beauty']
_kSharesRandom.csv
['linear_beauty', 'elastic_net_beauty']
_thresholded.csv

In [77]:
# NOTE(review): df_bottom is only ever assigned in the commented-out line below, yet the
# top/bottom/random plotting cells read it.
# df_bottom = buildErrPlots('clf').reset_index()
# Round-trip df_t through a list of row dicts so that single values can be replaced.
d = df_t.to_dict(orient='records')

# NOTE(review): the values below are typed in by hand and overwrite the errors computed
# by buildErrPlots for the first rows of df_t. Their source is not recorded anywhere in
# the visible notebook -- document where they come from or remove the overrides.
d[0]['zebras_elastic_net_beauty_err'] = -44.5
d[1]['zebras_elastic_net_beauty_err'] = -68
d[2]['zebras_elastic_net_beauty_err'] = -52.9


d[0]['zebras_linear_beauty_err'] = 22
d[1]['zebras_linear_beauty_err'] = 34
d[2]['zebras_linear_beauty_err'] = 62
d[3]['zebras_linear_beauty_err'] = 53

# Rebinds df_t to the edited copy, so later cells see the overridden values.
df_t = pd.DataFrame(d)

df_t


Out[77]:
all_elastic_net_beauty_err all_linear_beauty_err giraffes_elastic_net_beauty_err giraffes_linear_beauty_err threshold zebras_elastic_net_beauty_err zebras_linear_beauty_err
0 24.889071 24.243094 40.975610 55.103884 40.0 -44.500000 22.000000
1 24.828438 27.717474 36.856369 76.538908 45.0 -68.000000 34.000000
2 22.517212 22.719739 NaN 73.333333 50.0 -52.900000 62.000000
3 66.998158 43.856779 NaN 42.276423 55.0 79.933595 53.000000
4 NaN -4.963168 NaN -59.349593 60.0 NaN 18.518519
5 NaN NaN NaN NaN 65.0 NaN NaN
6 NaN NaN NaN NaN 70.0 NaN NaN
7 NaN NaN NaN NaN 75.0 NaN NaN
8 NaN NaN NaN NaN 80.0 NaN NaN
9 NaN NaN NaN NaN 85.0 NaN NaN

In [72]:
# Percent error vs. K for the top-K, bottom-K and random-K selections.
# NOTE(review): the legend labels say "Linear Regression" / "Elastic Net", but the top and
# bottom traces read logistic / ada_boost columns, and the first and last traces plot
# giraffe errors while the others plot zebra errors. Labels and columns are reproduced
# exactly as they were; confirm what this figure is meant to show.
# NOTE(review): df_bottom is not assigned by any live code in the visible notebook.
bigFont = dict(size=20)

layout = go.Layout(
    showlegend=True,
    legend=dict(x=1, y=1, font=bigFont),
    title='Images scored using regression',
    titlefont=bigFont,
    xaxis=dict(
        title='K (number of images)',
        ticklen=5,
        zeroline=True,
        titlefont=bigFont,
        tickfont=bigFont,
    ),
    yaxis=dict(
        title='Percent Error',
        ticklen=5,
        titlefont=bigFont,
        tickfont=bigFont,
    ),
)

# One row per line on the chart, in drawing order:
# (source frame, legend label, error column, opacity)
traceSpecs = [
    (df_top, "Linear Regression - Top K", "giraffes_logistic_beauty_err", 0.5),
    (df_top, "Elastic Net - Top K", "zebras_ada_boost_beauty_err", 1),
    (df_bottom, "Linear Regression - Bottom K", "zebras_logistic_beauty_err", 0.5),
    (df_bottom, "Elastic Net - Bottom K", "zebras_ada_boost_beauty_err", 1),
    (df_random, "Linear Regression - Random K", "zebras_linear_beauty_err", 0.5),
    (df_random, "Elastic Net - Random K", "giraffes_elastic_net_beauty_err", 1),
]

data = [
    go.Scatter(
        x=list(frame.num_images),
        y=list(getattr(frame, errCol)),
        name=label,
        opacity=alpha,
    )
    for frame, label, errCol, alpha in traceSpecs
]
# Keep the individual trace names available, as the original cell defined them.
trace1, trace2, trace3, trace4, trace5, trace6 = data

fig = dict(data=data, layout=layout)
iplot(fig)



In [57]:
# Zebra percent error vs. K for linear regression and elastic net, using the
# top-K, bottom-K and random-K error tables.
# NOTE(review): this cell reads linear / elastic_net columns from df_top and df_bottom, so
# both frames must have been built from the regressor results before it is run;
# df_bottom is not assigned by any live code in the visible notebook.
bigFont = dict(size=20)

layout = go.Layout(
    showlegend=True,
    legend=dict(x=1, y=1, font=bigFont),
    title='Images scored using regressor',
    titlefont=bigFont,
    xaxis=dict(
        title='K (number of images)',
        ticklen=5,
        zeroline=True,
        titlefont=bigFont,
        tickfont=bigFont,
    ),
    yaxis=dict(
        title='Percent Error',
        ticklen=5,
        titlefont=bigFont,
        tickfont=bigFont,
    ),
)

# One row per line on the chart, in drawing order:
# (source frame, legend label, error column, opacity)
traceSpecs = [
    (df_top, "Linear Regression - Top K", "zebras_linear_beauty_err", 0.5),
    (df_top, "Elastic Net - Top K", "zebras_elastic_net_beauty_err", 1),
    (df_bottom, "Linear Regression - Bottom K", "zebras_linear_beauty_err", 0.5),
    (df_bottom, "Elastic Net - Bottom K", "zebras_elastic_net_beauty_err", 1),
    (df_random, "Linear Regression - Random K", "zebras_linear_beauty_err", 0.5),
    (df_random, "Elastic Net - Random K", "zebras_elastic_net_beauty_err", 1),
]

data = [
    go.Scatter(
        x=list(frame.num_images),
        y=list(getattr(frame, errCol)),
        name=label,
        opacity=alpha,
    )
    for frame, label, errCol, alpha in traceSpecs
]
# Keep the individual trace names available, as the original cell defined them.
trace1, trace2, trace3, trace4, trace5, trace6 = data

fig = dict(data=data, layout=layout)
iplot(fig)



In [54]:
# Load ggr_uuid_annot_uuid_map.json into `j` (presumably the GGR image-uuid ->
# annotation-uuid map, going by the file name -- confirm).
# NOTE(review): `j` is not read by any other visible cell.
with open("../data/ggr_uuid_annot_uuid_map.json") as fl:
    j = json.load(fl)

In [78]:
# Zebra percent error vs. shareability-score threshold for the two regressors.
# NOTE(review): df_t is modified by hand in another cell (individual error values are
# overwritten), so this figure shows those edited values when that cell has been run.
bigFont = dict(size=20)

layout = go.Layout(
    showlegend=True,
    legend=dict(x=1, y=1, font=bigFont),
    title='Images scored using regressor',
    titlefont=bigFont,
    xaxis=dict(
        title='T (image shareability score threshold)',
        ticklen=5,
        zeroline=True,
        titlefont=bigFont,
        tickfont=bigFont,
    ),
    yaxis=dict(
        title='Percent Error',
        ticklen=5,
        titlefont=bigFont,
        tickfont=bigFont,
    ),
)

# One row per line on the chart, in drawing order: (legend label, error column, opacity)
traceSpecs = [
    ("Linear Regression", "zebras_linear_beauty_err", 0.5),
    ("Elastic Net", "zebras_elastic_net_beauty_err", 1),
]

data = [
    go.Scatter(
        x=list(df_t.threshold),
        y=list(getattr(df_t, errCol)),
        name=label,
        opacity=alpha,
    )
    for label, errCol, alpha in traceSpecs
]
# Keep the individual trace names available, as the original cell defined them.
trace1, trace2 = data

fig = dict(data=data, layout=layout)
iplot(fig)



In [75]:
# Display the thresholded error table. The recorded output already contains the
# hand-entered zebra error values (e.g. -44.5, -68, -52.9), not only computed ones.
df_t


Out[75]:
all_elastic_net_beauty_err all_linear_beauty_err giraffes_elastic_net_beauty_err giraffes_linear_beauty_err threshold zebras_elastic_net_beauty_err zebras_linear_beauty_err
0 24.889071 24.243094 40.975610 55.103884 40.0 -44.500000 22.000000
1 24.828438 27.717474 36.856369 76.538908 45.0 -68.000000 34.000000
2 22.517212 22.719739 NaN 73.333333 50.0 -52.900000 62.000000
3 66.998158 43.856779 NaN 42.276423 55.0 79.933595 53.000000
4 NaN -4.963168 NaN -59.349593 60.0 NaN 18.518519
5 NaN NaN NaN NaN 65.0 NaN NaN
6 NaN NaN NaN NaN 70.0 NaN NaN
7 NaN NaN NaN NaN 75.0 NaN NaN
8 NaN NaN NaN NaN 80.0 NaN NaN
9 NaN NaN NaN NaN 85.0 NaN NaN

In [ ]: