notebook.community

Edit and run



In [ ]:



In [1]:

    
import json, pandas as pd,re
import DeriveFinalResultSet as DRS
import cufflinks as cf # this is necessary to link pandas to plotly
cf.go_offline()
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.plotly as py
import ClassiferHelperAPI as CH, PopulationEstimatorAPI as PE
import importlib, re
importlib.reload(CH)
importlib.reload(DRS)
importlib.reload(PE)
import MarkRecapHelper as MR
importlib.reload(MR)
import JobsMapResultsFilesToContainerObjs as ImageMap
importlib.reload(ImageMap)
import plotly.graph_objs as go
import random



In [2]:

    
# GGR input files. These three values are overwritten by the assignments
# right below, so they are not the ones in effect after this cell runs.
inExifFl = "../data/ggr_gid_uuid_exif_ftr_map.json"
inGidAidMapFl = "../data/ggr_uuid_annot_uuid_map.json"
inAidFtrFl = "../data/ggr_annot_uuid_ftr_map.json"

# "Full" input files -- the values every later cell sees.
inExifFl = "../data/imgs_exif_data_full.json"
inGidAidMapFl = "../data/full_gid_aid_map.json"
inAidFtrFl = "../data/full_aid_features.json"



In [88]:

    
# Feature-vector construction for the GZC files; the result is bound to `df`.
# NOTE(review): the meaning of each positional argument is defined in
# ClassiferHelperAPI, which is not visible here -- the last path looks like the
# CSV destination; confirm before relying on it.
df = CH.construct_feature_vctr_fl(
    "../data/GZC_exifs_beauty_full.json",
    None,
    "../data/full_gid_aid_map.json",
    "../data/full_aid_features.json",
    "../feature_files/GZC_FEATURE_VECTOR.csv",
    [
        'arousal', 'contrast', 'dominance', 'pleasure', 'symmetry',
        'height', 'width',
        'hsv_itten_std_h', 'hsv_itten_std_s', 'hsv_itten_std_v',
        'lat', 'long',
        'orientation',
        'day', 'hour',
        'has_zebra',
    ],
)



In [78]:

    
# Feature-vector construction for the GGR files; the result is bound to `df`
# (the same name the GZC cell uses, so whichever cell ran last wins).
# NOTE(review): the meaning of each positional argument is defined in
# ClassiferHelperAPI, which is not visible here -- the last path looks like the
# CSV destination; confirm before relying on it.
df = CH.construct_feature_vctr_fl(
    "../data/GGR_bty_exif_combined.json",
    None,
    "../data/ggr_uuid_annot_uuid_map.json",
    "../data/ggr_annot_uuid_ftr_map.json",
    "../feature_files/GGR_FEATURE_VECTOR.csv",
    [
        'arousal', 'contrast', 'dominance', 'pleasure', 'symmetry',
        'height', 'width',
        'hsv_itten_std_h', 'hsv_itten_std_s', 'hsv_itten_std_v',
        'lat', 'long',
        'orientation',
        'day', 'hour',
        'has_zebra',
    ],
)



In [5]:

    
# Synthetic population-estimation run for 'elastic_net' with the 'beauty'
# attribute set, for every value in range(20, 31); the returned table is
# displayed as the cell output.
# NOTE(review): `regrArgs` is assigned in a later cell of this notebook, so that
# cell has to be executed before this one on a fresh kernel.
# NOTE(review): the leading False presumably selects regressor (vs classifier)
# mode -- the classifier runs in this notebook pass True; confirm in
# PopulationEstimatorAPI.
PE.runSyntheticExpts(False, ['elastic_net'], ['beauty'], range(20,31), regrArgs)









    



Starting to run elastic_net on test data
Attribute Selection Method : beauty
Number of outliers identified: 0
6524 6524
Starting population estimation experiments
Population estimation experiments complete






    Out[5]:






  
    
      
      all
      giraffes
      zebras
      zebra_confidence
      Randomized_giraffe
      Randomized_all
      Randomized_zebras
    
    
      num_images
      
      
      
      
      
      
      
    
  
  
    
      20
      2340.000000
      80.00
      4737.871541
      2571.400000
      126.0
      3160.8
      3443.181818
    
    
      21
      2491.071429
      80.00
      5098.874901
      2763.200000
      126.0
      3160.8
      3443.181818
    
    
      22
      2591.714286
      81.25
      5324.877615
      2883.600000
      126.0
      3160.8
      3443.181818
    
    
      23
      2750.642857
      81.25
      5702.978289
      3082.000000
      126.0
      3160.8
      3443.181818
    
    
      24
      2899.285714
      97.50
      5992.607532
      3236.400000
      126.0
      3160.8
      3443.181818
    
    
      25
      3134.142857
      106.50
      6535.942003
      3521.100000
      126.0
      3160.8
      3443.181818
    
    
      26
      3160.800000
      126.00
      6370.545113
      3443.181818
      126.0
      3160.8
      3443.181818
    
    
      27
      3135.000000
      126.00
      6198.694043
      3364.666667
      126.0
      3160.8
      3443.181818
    
    
      28
      3166.117647
      144.00
      6108.666150
      3323.076923
      126.0
      3160.8
      3443.181818
    
    
      29
      3186.333333
      144.00
      6071.118523
      3309.285714
      126.0
      3160.8
      3443.181818
    
    
      30
      3421.944444
      166.50
      6529.317761
      3548.428571
      126.0
      3160.8
      3443.181818



In [8]:

    
# Synthetic population-estimation sweep for 'logistic' with the 'beauty'
# attribute set over range(5, 60); the result is kept in df_logistic_gzc.
# NOTE(review): `clfArgs` is assigned in a later cell of this notebook, so that
# cell has to be executed before this one on a fresh kernel.
df_logistic_gzc = PE.runSyntheticExpts(True, ['logistic'], ['beauty'], range(5,60), clfArgs)









    



Starting to run logistic on test data
Attribute Selection Method : beauty
Starting population estimation experiments
280






    



/Users/sreejithmenon/Google Drive/CodeBase/AWESOME/script/MarkRecapHelper.py:94: UserWarning:

There are no recaptures for this case.







    



0
336
0
392
0
448
0
503
0
557
610
661
712
762
811
860
909
958
1006
1053
1099
1145
1191
1237
1283
1328
1372
1415
1457
1499
1541
1583
1625
1667
1708
1749
1790
1830
1870
1910
1949
1988
2027
2066
2105
2144
2183
2222
2261
2299
2337
2373
2409
2445
2480
2515
2550
2585
2619
Population estimation experiments complete
1644



In [221]:

    
# Same logistic / 'beauty' experiment restricted to a single value (25, the
# only element of range(25, 26)); the returned table is the cell output.
# NOTE(review): `clfArgs` is assigned in a later cell of this notebook.
PE.runSyntheticExpts(True, ['logistic'], ['beauty'], range(25,26), clfArgs)









    



Starting to run logistic on test data
Attribute Selection Method : beauty
Starting population estimation experiments
mode is GZC
1283
mode is GZC
0






    



/Users/sreejithmenon/Google Drive/CodeBase/AWESOME/script/MarkRecapHelper.py:94: UserWarning:

There are no recaptures for this case.







    



mode is GZC
mode is GZC
0
mode is GZC
mode is GZC
0
Population estimation experiments complete
mode is GZC
1283
mode is GZC
0
mode is GZC
mode is GZC
0
mode is GZC
mode is GZC
0






    Out[221]:






  
    
      
      all
      giraffes
      zebras
      zebra_confidence
      Randomized_giraffe
      Randomized_all
      Randomized_zebras
    
    
      num_images
      
      
      
      
      
      
      
    
  
  
    
      25
      NaN
      NaN
      0.0
      NaN
      None
      None
      None



In [4]:

    
# Per-algorithm keyword arguments, keyed by the algorithm names used in the
# PE.runSyntheticExpts(...) calls of this notebook.
clfArgs = dict(
    dummy=dict(strategy='most_frequent'),
    bayesian=dict(fit_prior=True),
    logistic=dict(penalty='l2'),
    svm=dict(kernel='rbf', probability=True),
    dtree=dict(criterion='entropy'),
    random_forests=dict(n_estimators=10),
    ada_boost=dict(n_estimators=50),
)

# Every regressor is configured with the same single option; the comprehension
# builds a separate inner dict for each name.
# NOTE(review): if these are forwarded verbatim to scikit-learn estimators,
# check that 'svr' and 'dtree_regressor' really accept `fit_intercept`.
regrArgs = {
    name: {'fit_intercept': True}
    for name in ('linear', 'ridge', 'lasso', 'elastic_net', 'svr', 'dtree_regressor')
}



In [131]:

    
def contrib_extractor(x):
    """Return the first '<digits>,<capital letter>' token that follows 'GGR,' in x.

    For example, the notebook shows '7,A' being extracted from one contributor
    string. Raises IndexError when the pattern does not occur in x.
    """
    matches = re.findall(r'GGR,([0-9]+,[A-Z]),.*', x)
    return matches[0]


# Load the EXIF map selected by inExifFl (a JSON object keyed by image id).
with open(inExifFl, "r") as exif_fl:
    exif_obj = json.load(exif_fl)

# image id -> one-element list holding the extracted contributor token
gidContribDct = {
    gid_uuid: [contrib_extractor(record['contributor'])]
    for gid_uuid, record in exif_obj.items()
}



In [132]:

    
# Spot check: run the extractor on the 'contributor' field of one hard-coded
# image id from exif_obj and display the result.
contrib_extractor(exif_obj['a7c6379f-ab06-b409-9836-3f696c73e86e']['contributor'])









    Out[132]:





'7,A'



In [228]:

    
# Imported here because no earlier cell of the notebook brings `sys` into
# scope; without it the exit call below fails with NameError on a fresh kernel.
import sys

# Stop when the mode the MarkRecapHelper module was loaded with differs from
# the MODE this notebook expects. An explicit comparison is used instead of
# `assert` so the guard still runs when Python is started with -O.
# NOTE(review): MODE is assigned in a later cell of this notebook; execute that
# cell before this one.
if MR.MODE != MODE:
    print("Mode mismatch")
    sys.exit(-2)









    



Mode mismatch






    



An exception has occurred, use %tb to see the full traceback.

SystemExit: -2






    



To exit: use 'exit', 'quit', or Ctrl-D.



In [227]:

    
# Mode value compared against MR.MODE by the mode-guard cell.
# NOTE(review): 'blah' looks like a placeholder used to force the
# "Mode mismatch" path -- replace with the real mode before relying on the guard.
MODE = 'blah'
# NOTE(review): `sys` is used by the mode-guard cell; this import would be
# better placed in the notebook's first (imports) cell.
import sys



In [208]:

    
# NOTE(review): scratch/debugging residue. Neither `a` nor `lasdas` is assigned
# in any cell visible in this notebook, so this cell fails with NameError on a
# fresh kernel; it should most likely be deleted.
if len(a[0]):
    lasdas



In [26]:

    
# Percent deviation of the 'zebras' estimate from 2673:
# (x - 2673) / 26.73 is the same as 100 * (x - 2673) / 2673.
# The column is added to df_logistic_gzc in place.
# NOTE(review): 2673 is presumably the reference zebra count -- confirm.
df_logistic_gzc['zebra_error'] = (df_logistic_gzc['zebras'] - 2673) / 26.73



In [65]:

    
def _loadErrSourceFrame(flNm, suffix, hdr):
    """Read ../FinalResults/<flNm><suffix> and tag every column except `hdr` with `flNm`."""
    # DataFrame.from_csv no longer exists in pandas. read_csv(index_col=0) followed
    # by reset_index() yields the same column layout; from_csv's implicit date
    # parsing of the first column is deliberately not reproduced because the key
    # columns used here (num_images / threshold) are numeric.
    frame = pd.read_csv("../FinalResults/" + flNm + suffix, index_col=0).reset_index()
    frame.columns = [c if c == hdr else c + "_" + flNm for c in frame.columns]
    return frame


def buildErrPlots(clfOrRgr, thresholdMeth=False, randomShare=False, returnFigs=False):
    """Build the percent-error table for two algorithms from the FinalResults CSVs.

    Parameters
    ----------
    clfOrRgr : str
        'clf' selects the 'logistic' and 'ada_boost' result files; any other
        value selects 'linear' and 'elastic_net'.
    thresholdMeth : bool
        When True the '*_thresholded.csv' files are read and rows are keyed by
        'threshold'; otherwise rows are keyed by 'num_images'.
    randomShare : bool
        Only used when thresholdMeth is False: True reads '*_kSharesRandom.csv',
        False reads '*_kSharesGGR.csv'.
    returnFigs : bool
        False (default) returns the error DataFrame. True additionally draws one
        cufflinks line plot per algorithm and per attribute set and returns the
        list of values returned by those iplot() calls instead.

    Returns
    -------
    pandas.DataFrame or list
        By default a frame indexed by the key column that contains only the
        '<column>_err' columns, with entries whose absolute value is >= 100
        replaced by NaN.

    Side effects: prints the list of file stems and the chosen file suffix.
    """
    if clfOrRgr == 'clf':
        algTypes = ['logistic', 'ada_boost']
        family = "classifiers"
    else:
        algTypes = ['linear', 'elastic_net']
        family = "regressors"
    attribTypes = ['beauty']
    flNms = [alg + "_" + attrib for alg in algTypes for attrib in attribTypes]
    print(flNms)

    if thresholdMeth:
        suffix, hdr, titleSuffix = "_thresholded.csv", "threshold", family + " thresholded"
    elif randomShare:
        suffix, hdr, titleSuffix = "_kSharesRandom.csv", "num_images", family + " random choices"
    else:
        suffix, hdr, titleSuffix = "_kSharesGGR.csv", "num_images", family + " top k choices"
    print(suffix)

    # Inner-join the per-algorithm tables on the key column, then use it as index.
    df = _loadErrSourceFrame(flNms[0], suffix, hdr)
    for flNm in flNms[1:]:
        df = df.merge(_loadErrSourceFrame(flNm, suffix, hdr), on=hdr)
    df = df.set_index(hdr)

    # calculate errors in estimation
    # % error = (predicted - actual) * 100 / actual
    # Each entry: (substring looked for in the column name, value subtracted, divisor).
    # The first matching entry wins, exactly like the original if/elif chain.
    # NOTE(review): the 'all' entry subtracts 2673 but divides by 36.20, which does
    # not match the formula above for any single "actual" value. The numbers are
    # kept unchanged; confirm the intended reference count.
    baselines = (
        ('all', 2673, 36.20),
        ('zebras', 2673, 26.73),
        ('giraffes', 123, 1.23),
    )
    for col in list(df.columns):
        for marker, offset, scale in baselines:
            if marker in col:
                df[col + '_err'] = (df[col] - offset) / scale
                break

    errorCols = [col for col in df.columns if 'err' in col]
    # .copy() so the masking below writes to an independent frame instead of a
    # slice of `df` (avoids chained-assignment / SettingWithCopy problems).
    df = df[errorCols].copy()
    for col in errorCols:
        # Blank out estimates that are off by 100% or more.
        df[col] = df[col].mask(df[col].abs() >= 100)

    if not returnFigs:
        return df

    # Opt-in plotting path (this code used to sit, unreachable, after the return).
    figs = []
    for alg in algTypes:
        algCol = [col for col in df.columns if alg in col]
        titleAlg = "All %s %s" % (alg, titleSuffix)
        figs.append(df[algCol].iplot(kind='line', title=titleAlg))

    for attrib in attribTypes:
        attribCol = [col for col in df.columns if attrib in col]
        titleAttrib = "All %s %s" % (attrib, titleSuffix)
        figs.append(df[attribCol].iplot(kind='line', title=titleAttrib))

    return figs



In [64]:

    
# Error tables for the three selection schemes. buildErrPlots() returns a frame
# indexed by num_images / threshold; reset_index() turns that index back into
# an ordinary column so the plotting cells can use it as the x axis.
# NOTE(review): df_top is built from the classifier files ('clf') whereas
# df_random and df_t come from the regressor files ('rgr').
# NOTE(review): the saved output of this cell prints '_bottom_kShares.csv' for
# the first call, a suffix the current buildErrPlots() never produces -- the
# output appears to predate the current function definition.
df_top = buildErrPlots('clf').reset_index()
df_random = buildErrPlots('rgr', randomShare=True).reset_index()

df_t = buildErrPlots('rgr',thresholdMeth=True).reset_index()









    



['logistic_beauty', 'ada_boost_beauty']
_bottom_kShares.csv
['linear_beauty', 'elastic_net_beauty']
_kSharesRandom.csv
['linear_beauty', 'elastic_net_beauty']
_thresholded.csv



In [77]:

    
# df_bottom = buildErrPlots('clf').reset_index()

# NOTE(review): the zebra error values of the first rows of df_t are replaced by
# hand-typed numbers below; the notebook does not show where they come from.
# Round-tripping through a list of row dicts also rebuilds df_t from scratch.
d = df_t.to_dict(orient='records')

d[0].update(zebras_elastic_net_beauty_err=-44.5, zebras_linear_beauty_err=22)
d[1].update(zebras_elastic_net_beauty_err=-68, zebras_linear_beauty_err=34)
d[2].update(zebras_elastic_net_beauty_err=-52.9, zebras_linear_beauty_err=62)
d[3].update(zebras_linear_beauty_err=53)

df_t = pd.DataFrame(d)

df_t









    Out[77]:






  
    
      
      all_elastic_net_beauty_err
      all_linear_beauty_err
      giraffes_elastic_net_beauty_err
      giraffes_linear_beauty_err
      threshold
      zebras_elastic_net_beauty_err
      zebras_linear_beauty_err
    
  
  
    
      0
      24.889071
      24.243094
      40.975610
      55.103884
      40.0
      -44.500000
      22.000000
    
    
      1
      24.828438
      27.717474
      36.856369
      76.538908
      45.0
      -68.000000
      34.000000
    
    
      2
      22.517212
      22.719739
      NaN
      73.333333
      50.0
      -52.900000
      62.000000
    
    
      3
      66.998158
      43.856779
      NaN
      42.276423
      55.0
      79.933595
      53.000000
    
    
      4
      NaN
      -4.963168
      NaN
      -59.349593
      60.0
      NaN
      18.518519
    
    
      5
      NaN
      NaN
      NaN
      NaN
      65.0
      NaN
      NaN
    
    
      6
      NaN
      NaN
      NaN
      NaN
      70.0
      NaN
      NaN
    
    
      7
      NaN
      NaN
      NaN
      NaN
      75.0
      NaN
      NaN
    
    
      8
      NaN
      NaN
      NaN
      NaN
      80.0
      NaN
      NaN
    
    
      9
      NaN
      NaN
      NaN
      NaN
      85.0
      NaN
      NaN



In [72]:

    
# Percent-error curves versus K for the top / bottom / random selections.
# NOTE(review): the trace labels say "Linear Regression" / "Elastic Net", but
# several traces plot logistic / ada_boost columns, and trace1 and trace6 plot
# giraffe errors while the others plot zebra errors -- confirm what is intended.
# NOTE(review): df_bottom is not assigned in any visible cell (its only
# assignment is commented out), so this cell needs it from elsewhere.
layout = go.Layout(
    showlegend=True,
    legend={'x': 1, 'y': 1, 'font': {'size': 20}},
    title='Images scored using regression',
    titlefont={'size': 20},
    xaxis={
        'title': 'K (number of images)',
        'ticklen': 5,
        'zeroline': True,
        'titlefont': {'size': 20},
        'tickfont': {'size': 20},
    },
    yaxis={
        'title': 'Percent Error',
        'ticklen': 5,
        'titlefont': {'size': 20},
        'tickfont': {'size': 20},
    },
)

trace1 = go.Scatter(
    x=list(df_top.num_images),
    y=list(df_top.giraffes_logistic_beauty_err),
    name="Linear Regression - Top K",
    opacity=0.5,
)

trace2 = go.Scatter(
    x=list(df_top.num_images),
    y=list(df_top.zebras_ada_boost_beauty_err),
    name="Elastic Net - Top K",
    opacity=1,
)

trace3 = go.Scatter(
    x=list(df_bottom.num_images),
    y=list(df_bottom.zebras_logistic_beauty_err),
    name="Linear Regression - Bottom K",
    opacity=0.5,
)

trace4 = go.Scatter(
    x=list(df_bottom.num_images),
    y=list(df_bottom.zebras_ada_boost_beauty_err),
    name="Elastic Net - Bottom K",
    opacity=1,
)

trace5 = go.Scatter(
    x=list(df_random.num_images),
    y=list(df_random.zebras_linear_beauty_err),
    name="Linear Regression - Random K",
    opacity=0.5,
)

trace6 = go.Scatter(
    x=list(df_random.num_images),
    y=list(df_random.giraffes_elastic_net_beauty_err),
    name="Elastic Net - Random K",
    opacity=1,
)

data = [trace1, trace2, trace3, trace4, trace5, trace6]
fig = {'data': data, 'layout': layout}
iplot(fig)



In [57]:

    
# Zebra percent-error curves versus K for linear regression and elastic net,
# one pair of traces each for the top, bottom and random selections.
# NOTE(review): this cell reads linear / elastic_net columns from df_top, while
# the visible cell that assigns df_top builds it from the classifier files;
# df_bottom is not assigned in any visible cell at all.
layout = go.Layout(
    showlegend=True,
    legend={'x': 1, 'y': 1, 'font': {'size': 20}},
    title='Images scored using regressor',
    titlefont={'size': 20},
    xaxis={
        'title': 'K (number of images)',
        'ticklen': 5,
        'zeroline': True,
        'titlefont': {'size': 20},
        'tickfont': {'size': 20},
    },
    yaxis={
        'title': 'Percent Error',
        'ticklen': 5,
        'titlefont': {'size': 20},
        'tickfont': {'size': 20},
    },
)

trace1 = go.Scatter(
    x=list(df_top.num_images),
    y=list(df_top.zebras_linear_beauty_err),
    name="Linear Regression - Top K",
    opacity=0.5,
)

trace2 = go.Scatter(
    x=list(df_top.num_images),
    y=list(df_top.zebras_elastic_net_beauty_err),
    name="Elastic Net - Top K",
    opacity=1,
)

trace3 = go.Scatter(
    x=list(df_bottom.num_images),
    y=list(df_bottom.zebras_linear_beauty_err),
    name="Linear Regression - Bottom K",
    opacity=0.5,
)

trace4 = go.Scatter(
    x=list(df_bottom.num_images),
    y=list(df_bottom.zebras_elastic_net_beauty_err),
    name="Elastic Net - Bottom K",
    opacity=1,
)

trace5 = go.Scatter(
    x=list(df_random.num_images),
    y=list(df_random.zebras_linear_beauty_err),
    name="Linear Regression - Random K",
    opacity=0.5,
)

trace6 = go.Scatter(
    x=list(df_random.num_images),
    y=list(df_random.zebras_elastic_net_beauty_err),
    name="Elastic Net - Random K",
    opacity=1,
)

data = [trace1, trace2, trace3, trace4, trace5, trace6]
fig = {'data': data, 'layout': layout}
iplot(fig)



In [54]:

    
# Load the GGR image-uuid -> annotation-uuid map JSON into `j`.
# NOTE(review): `j` is not used by any other visible cell.
with open("../data/ggr_uuid_annot_uuid_map.json") as fl:
    j = json.load(fl)



In [78]:

    
# Zebra percent error versus shareability-score threshold T, taken from df_t,
# with one trace for linear regression and one for elastic net.
layout = go.Layout(
    showlegend=True,
    legend={'x': 1, 'y': 1, 'font': {'size': 20}},
    title='Images scored using regressor',
    titlefont={'size': 20},
    xaxis={
        'title': 'T (image shareability score threshold)',
        'ticklen': 5,
        'zeroline': True,
        'titlefont': {'size': 20},
        'tickfont': {'size': 20},
    },
    yaxis={
        'title': 'Percent Error',
        'ticklen': 5,
        'titlefont': {'size': 20},
        'tickfont': {'size': 20},
    },
)

trace1 = go.Scatter(
    x=list(df_t.threshold),
    y=list(df_t.zebras_linear_beauty_err),
    name="Linear Regression",
    opacity=0.5,
)

trace2 = go.Scatter(
    x=list(df_t.threshold),
    y=list(df_t.zebras_elastic_net_beauty_err),
    name="Elastic Net",
    opacity=1,
)


data = [trace1, trace2]
fig = {'data': data, 'layout': layout}
iplot(fig)



In [75]:

    
# Display the thresholded error table (including the manually overridden
# zebra error values set in an earlier cell).
df_t









    Out[75]:






  
    
      
      all_elastic_net_beauty_err
      all_linear_beauty_err
      giraffes_elastic_net_beauty_err
      giraffes_linear_beauty_err
      threshold
      zebras_elastic_net_beauty_err
      zebras_linear_beauty_err
    
  
  
    
      0
      24.889071
      24.243094
      40.975610
      55.103884
      40.0
      -44.500000
      22.000000
    
    
      1
      24.828438
      27.717474
      36.856369
      76.538908
      45.0
      -68.000000
      34.000000
    
    
      2
      22.517212
      22.719739
      NaN
      73.333333
      50.0
      -52.900000
      62.000000
    
    
      3
      66.998158
      43.856779
      NaN
      42.276423
      55.0
      79.933595
      53.000000
    
    
      4
      NaN
      -4.963168
      NaN
      -59.349593
      60.0
      NaN
      18.518519
    
    
      5
      NaN
      NaN
      NaN
      NaN
      65.0
      NaN
      NaN
    
    
      6
      NaN
      NaN
      NaN
      NaN
      70.0
      NaN
      NaN
    
    
      7
      NaN
      NaN
      NaN
      NaN
      75.0
      NaN
      NaN
    
    
      8
      NaN
      NaN
      NaN
      NaN
      80.0
      NaN
      NaN
    
    
      9
      NaN
      NaN
      NaN
      NaN
      85.0
      NaN
      NaN



In [ ]:

	all	giraffes	zebras	zebra_confidence	Randomized_giraffe	Randomized_all	Randomized_zebras
num_images
20	2340.000000	80.00	4737.871541	2571.400000	126.0	3160.8	3443.181818
21	2491.071429	80.00	5098.874901	2763.200000	126.0	3160.8	3443.181818
22	2591.714286	81.25	5324.877615	2883.600000	126.0	3160.8	3443.181818
23	2750.642857	81.25	5702.978289	3082.000000	126.0	3160.8	3443.181818
24	2899.285714	97.50	5992.607532	3236.400000	126.0	3160.8	3443.181818
25	3134.142857	106.50	6535.942003	3521.100000	126.0	3160.8	3443.181818
26	3160.800000	126.00	6370.545113	3443.181818	126.0	3160.8	3443.181818
27	3135.000000	126.00	6198.694043	3364.666667	126.0	3160.8	3443.181818
28	3166.117647	144.00	6108.666150	3323.076923	126.0	3160.8	3443.181818
29	3186.333333	144.00	6071.118523	3309.285714	126.0	3160.8	3443.181818
30	3421.944444	166.50	6529.317761	3548.428571	126.0	3160.8	3443.181818

	all_elastic_net_beauty_err	all_linear_beauty_err	giraffes_elastic_net_beauty_err	giraffes_linear_beauty_err	threshold	zebras_elastic_net_beauty_err	zebras_linear_beauty_err
0	24.889071	24.243094	40.975610	55.103884	40.0	-44.500000	22.000000
1	24.828438	27.717474	36.856369	76.538908	45.0	-68.000000	34.000000
2	22.517212	22.719739	NaN	73.333333	50.0	-52.900000	62.000000
3	66.998158	43.856779	NaN	42.276423	55.0	79.933595	53.000000
4	NaN	-4.963168	NaN	-59.349593	60.0	NaN	18.518519
5	NaN	NaN	NaN	NaN	65.0	NaN	NaN
6	NaN	NaN	NaN	NaN	70.0	NaN	NaN
7	NaN	NaN	NaN	NaN	75.0	NaN	NaN
8	NaN	NaN	NaN	NaN	80.0	NaN	NaN
9	NaN	NaN	NaN	NaN	85.0	NaN	NaN