In [127]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import defaultdict
import vislab
import vislab.datasets
import vislab._results
import vislab.results
import sklearn.metrics
# Format floats with two decimal places wherever pandas renders them for display.
pd.options.display.float_format = '{:.2f}'.format


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [128]:
# Name of the collection whose cached prediction results are analysed below.
collection_name = 'flickr_mar23'
# Results directory under the configured shared-data path; the value returned by
# vislab.util.makedirs is used as the directory name from here on.
# NOTE(review): vislab.util is not imported explicitly in the import cell; it is
# presumably loaded as a side effect of the other vislab imports -- TODO confirm.
results_dirname = vislab.util.makedirs(vislab.config['paths']['shared_data'] + '/results')
# HDF5 file from which the 'acc_df' and 'threshold_df' tables are read further down.
cache_filename = '{}/{}_thresholds_and_accs.h5'.format(results_dirname, collection_name)

In [129]:
# Load the MTurk results table (HDF5 key 'df') and print a quick sanity check:
# the first complete rows of the three Bright-related columns, the frame's shape,
# and the number of rows per value of '_split'.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs on
# a machine that has this exact file; consider deriving it from configuration.
turk_df = pd.read_hdf('/Users/sergeyk/Dropbox/mturk-results/mturk-results.h5', 'df')
print turk_df[['conf_Bright', 'tagged_Bright', 'style_Bright']].dropna().head()
print turk_df.shape
print turk_df['_split'].value_counts()


             conf_Bright tagged_Bright style_Bright
10041013705         1.00          True        False
10050082426         0.43         False        False
10065812373         0.80          True        False
10072845645         0.20         False        False
10077053056         0.43         False        False

[5 rows x 3 columns]
(14252, 63)
test    14252
dtype: int64

In [130]:
# Load predictions
#
# preds_panel is sliced on its minor axis to get the frame for the
# 'caffe_fc6 None vw' setting, which is then restricted to rows whose 'split'
# column equals 'test'. `df` is not used again in the cells shown here.
# force=False presumably allows load_pred_results to reuse cached results
# -- TODO confirm against vislab._results.

df, preds_panel = vislab._results.load_pred_results(
    collection_name, results_dirname,
    multiclass=True, force=False)
preds_df = preds_panel.minor_xs('caffe_fc6 None vw').copy()
preds_df = preds_df[preds_df['split'] == 'test']
print preds_df.shape

# Read the previously cached accuracy and threshold tables. `tdf` holds the
# threshold column for the same 'caffe_fc6 None vw' setting used for preds_df,
# indexed by style name.
accs_df = pd.read_hdf(cache_filename, 'acc_df')
threshold_df = pd.read_hdf(cache_filename, 'threshold_df')
tdf = threshold_df['caffe_fc6 None vw'].copy()
print accs_df.head()
print accs_df.mean(0)  # column-wise mean over styles
print threshold_df.head()


Loaded from cache: 3 records
(16000, 43)
                      caffe_fc6 None vw  caffe_fc7 None vw  mc_bit None vw
style_Bokeh                        0.79               0.79            0.77
style_Bright                       0.73               0.73            0.69
style_Depth_of_Field               0.69               0.68            0.70
style_Detailed                     0.75               0.76            0.76
style_Ethereal                     0.81               0.81            0.77

[5 rows x 3 columns]
caffe_fc6 None vw   0.78
caffe_fc7 None vw   0.78
mc_bit None vw      0.77
dtype: float64
                      caffe_fc6 None vw  caffe_fc7 None vw  mc_bit None vw
style_Bokeh                       -0.89              -0.89           -0.93
style_Bright                      -0.89              -0.91           -0.93
style_Depth_of_Field              -0.91              -0.93           -0.89
style_Detailed                    -0.89              -0.89           -0.89
style_Ethereal                    -0.89              -0.89           -0.86

[5 rows x 3 columns]

In [131]:
# Build a per-style table of three accuracies (MTurk vs. Flickr labels, our
# predictions vs. Flickr labels, our predictions vs. MTurk labels).
#
# All underscored style names except 'style_Bokeh' and 'style_Texture' are used.
styles = list(set(vislab.datasets.flickr.underscored_style_names) - set(['style_Bokeh', 'style_Texture']))
# Maps column title -> {style -> accuracy}; becomes the columns of acc_df below.
accuracies = defaultdict(dict)
# 1) MTurk tags scored against the Flickr label column of the same frame.
for style in styles:
    tag_name = style.replace('style_', 'tagged_')
    # Keep only rows that have an MTurk tag for this style.
    df_ = turk_df.dropna(subset=[tag_name])
    # Positional row indices chosen by the helper; presumably a class-balanced
    # subset -- TODO confirm against vislab.results.get_balanced_dataset_ind.
    ind = vislab.results.get_balanced_dataset_ind(df_, style)
    df_ = df_.iloc[ind]
    # accuracy_score(y_true, y_pred): the Flickr label is passed as the reference
    # and the bool-cast MTurk tag as the prediction.
    accuracies['MTurk accuracy, Flickr g.t.'][style] = sklearn.metrics.accuracy_score(
        df_[style], df_[tag_name].astype(bool))

# 2) Our test-split predictions, evaluated at the cached per-style threshold.
for style in styles:
    accuracies['Our accuracy, Flickr g.t.'][style] = vislab.results.pred_accuracy_at_threshold(preds_df, style, tdf[style])
    
# 3) Same evaluation, but with the label column replaced by the MTurk tag.
for style in styles:
    pdf = preds_df.copy()
    # Assignment aligns on the index; rows absent from turk_df become NaN.
    pdf[style] = turk_df[style.replace('style_', 'tagged_')]
    # NOTE(review): dropna() without `subset` drops a row if ANY column is NaN,
    # unlike the dropna(subset=[tag_name]) used in the first loop. If only rows
    # lacking an MTurk tag should go, subset=[style] would say so -- TODO confirm.
    pdf = pdf.dropna()
    accuracies['Our accuracy, MTurk g.t.'][style] = vislab.results.pred_accuracy_at_threshold(pdf, style, tdf[style])

# Scale by 100 so every later cell works with percentages.
acc_df = pd.DataFrame(accuracies) * 100
# Relabel the index: look up each underscored name's position and take the
# entry at the same position in style_names.
acc_df.index = [
    vislab.datasets.flickr.style_names[vislab.datasets.flickr.underscored_style_names.index(_)]
    for _ in acc_df.index
]
print acc_df.mean(0)  # mean of each accuracy column over styles
acc_df


MTurk accuracy, Flickr g.t.   75.59
Our accuracy, Flickr g.t.     78.12
Our accuracy, MTurk g.t.      77.28
dtype: float64
Out[131]:
MTurk accuracy, Flickr g.t. Our accuracy, Flickr g.t. Our accuracy, MTurk g.t.
Bright 69.93 73.38 73.52
Depth of Field 70.60 68.50 81.44
Detailed 63.30 75.25 68.55
Ethereal 78.53 80.62 78.63
Geometric Composition 81.40 77.75 81.64
HDR 72.79 82.00 76.44
Hazy 84.17 80.75 83.59
Horror 90.42 84.25 80.31
Long Exposure 73.83 84.19 74.21
Macro 91.71 86.56 88.64
Melancholy 67.19 70.88 69.06
Minimal 82.04 83.75 80.64
Noir 81.62 85.25 84.56
Pastel 66.87 74.56 77.30
Romantic 61.04 68.00 65.23
Serene 70.14 70.44 79.79
Sunny 84.45 84.56 78.99
Vintage 70.60 75.50 68.48

18 rows × 3 columns


In [132]:
# Summary statistics of each accuracy column across styles.
# NOTE: a later cell adds columns to acc_df; re-running this cell after it will
# include those extra columns in the summary.
acc_df.describe()


Out[132]:
MTurk accuracy, Flickr g.t. Our accuracy, Flickr g.t. Our accuracy, MTurk g.t.
count 18.00 18.00 18.00
mean 75.59 78.12 77.28
std 9.02 6.21 6.33
min 61.04 68.00 65.23
25% 69.98 73.67 73.69
50% 73.31 79.19 78.81
75% 81.93 84.08 81.24
max 91.71 86.56 88.64

8 rows × 3 columns


In [133]:
# Emit the accuracy table as LaTeX.
# NOTE: a later cell adds columns to acc_df; re-running this cell after it will
# include those extra columns in the table.
print acc_df.to_latex()


\begin{tabular}{lrrr}
\toprule
{} &  MTurk accuracy, Flickr g.t. &  Our accuracy, Flickr g.t. &  Our accuracy, MTurk g.t. \\
\midrule
Bright                &                        69.93 &                      73.38 &                     73.52 \\
Depth of Field        &                        70.60 &                      68.50 &                     81.44 \\
Detailed              &                        63.30 &                      75.25 &                     68.55 \\
Ethereal              &                        78.53 &                      80.62 &                     78.63 \\
Geometric Composition &                        81.40 &                      77.75 &                     81.64 \\
HDR                   &                        72.79 &                      82.00 &                     76.44 \\
Hazy                  &                        84.17 &                      80.75 &                     83.59 \\
Horror                &                        90.42 &                      84.25 &                     80.31 \\
Long Exposure         &                        73.83 &                      84.19 &                     74.21 \\
Macro                 &                        91.71 &                      86.56 &                     88.64 \\
Melancholy            &                        67.19 &                      70.88 &                     69.06 \\
Minimal               &                        82.04 &                      83.75 &                     80.64 \\
Noir                  &                        81.62 &                      85.25 &                     84.56 \\
Pastel                &                        66.87 &                      74.56 &                     77.30 \\
Romantic              &                        61.04 &                      68.00 &                     65.23 \\
Serene                &                        70.14 &                      70.44 &                     79.79 \\
Sunny                 &                        84.45 &                      84.56 &                     78.99 \\
Vintage               &                        70.60 &                      75.50 &                     68.48 \\
\bottomrule
\end{tabular}


In [134]:
# Display the styles that experience more than a 5% relative change in accuracy
# when switching from Flickr to MTurk ground truth.
#
# Side effects: adds the `name` column to acc_df, rebinds acc_df to a copy sorted
# by that column, and sets `name` / `columns` for the LaTeX cell that follows.

name = '% change going from Flickr to MTurk g.t.'
# Relative change, in percent of the Flickr-ground-truth accuracy.
acc_df[name] = 100. * (
    acc_df['Our accuracy, MTurk g.t.'] - acc_df['Our accuracy, Flickr g.t.']
) / acc_df['Our accuracy, Flickr g.t.']
# DataFrame.sort was removed in pandas 0.20. Use sort_values where it exists and
# fall back to sort on older pandas releases that predate sort_values.
if hasattr(acc_df, 'sort_values'):
    acc_df = acc_df.sort_values(name)
else:
    acc_df = acc_df.sort(name)
columns = ['Our accuracy, Flickr g.t.', 'Our accuracy, MTurk g.t.', name]
# Keep only rows whose absolute relative change exceeds 5 (percent).
acc_df[acc_df[name].abs() > 5][columns]


Out[134]:
Our accuracy, Flickr g.t. Our accuracy, MTurk g.t. % change going from Flickr to MTurk g.t.
Long Exposure 84.19 74.21 -11.85
Vintage 75.50 68.48 -9.29
Detailed 75.25 68.55 -8.91
HDR 82.00 76.44 -6.78
Sunny 84.56 78.99 -6.58
Geometric Composition 77.75 81.64 5.00
Serene 70.44 79.79 13.27
Depth of Field 68.50 81.44 18.89

8 rows × 3 columns


In [119]:
# Emit as LaTeX the rows of acc_df whose `name` column exceeds 5 in absolute
# value, restricted to `columns`; both are whatever the last executed cell set.
# NOTE(review): this cell's execution count (119) is out of sequence with its
# neighbours (134, 135), and its captured table differs from Out[134] above.
# The output looks stale; re-run it right after the preceding cell.
print acc_df[acc_df[name].abs() > 5][columns].to_latex()


\begin{tabular}{lrrr}
\toprule
{} &  Our accuracy, Flickr g.t. &  Our accuracy, MTurk g.t. &  \% change going from Flickr to MTurk g.t. \\
\midrule
Vintage        &                      75.50 &                     67.80 &                                    -10.19 \\
Detailed       &                      75.25 &                     68.44 &                                     -9.05 \\
Long Exposure  &                      84.19 &                     76.79 &                                     -8.79 \\
Minimal        &                      83.75 &                     78.57 &                                     -6.18 \\
HDR            &                      82.00 &                     76.96 &                                     -6.15 \\
Sunny          &                      84.56 &                     79.94 &                                     -5.46 \\
Serene         &                      70.44 &                     76.80 &                                      9.03 \\
Depth of Field &                      68.50 &                     81.05 &                                     18.32 \\
\bottomrule
\end{tabular}


In [135]:
# Display the styles where our accuracy and the MTurk accuracy (both measured
# against Flickr ground truth) differ by more than 5 percentage points.
# acc_df holds percentages, so the threshold is 5 rather than 0.05.
#
# Side effects: adds the `name` column to acc_df, rebinds acc_df to a copy sorted
# by that column, and rebinds `name` / `columns`, so any cell that reads them
# afterwards sees this table's values.

name = 'Accuracy diff. between us and MTurk'
acc_df[name] = acc_df['Our accuracy, Flickr g.t.'] - acc_df['MTurk accuracy, Flickr g.t.']
# DataFrame.sort was removed in pandas 0.20. Use sort_values where it exists and
# fall back to sort on older pandas releases that predate sort_values.
if hasattr(acc_df, 'sort_values'):
    acc_df = acc_df.sort_values(name)
else:
    acc_df = acc_df.sort(name)
columns = ['MTurk accuracy, Flickr g.t.', 'Our accuracy, Flickr g.t.', name]
# Keep only rows whose absolute difference exceeds 5 percentage points.
acc_df[acc_df[name].abs() > 5][columns]


Out[135]:
MTurk accuracy, Flickr g.t. Our accuracy, Flickr g.t. Accuracy diff. between us and MTurk
Horror 90.42 84.25 -6.17
Macro 91.71 86.56 -5.15
Romantic 61.04 68.00 6.96
Pastel 66.87 74.56 7.69
HDR 72.79 82.00 9.21
Long Exposure 73.83 84.19 10.35
Detailed 63.30 75.25 11.95

7 rows × 3 columns


In [136]:
# Emit as LaTeX the rows of acc_df whose `name` column exceeds 5 in absolute
# value, restricted to `columns`; both are whatever the last executed cell set.
print acc_df[acc_df[name].abs() > 5][columns].to_latex()


\begin{tabular}{lrrr}
\toprule
{} &  MTurk accuracy, Flickr g.t. &  Our accuracy, Flickr g.t. &  Accuracy diff. between us and MTurk \\
\midrule
Horror        &                        90.42 &                      84.25 &                                -6.17 \\
Macro         &                        91.71 &                      86.56 &                                -5.15 \\
Romantic      &                        61.04 &                      68.00 &                                 6.96 \\
Pastel        &                        66.87 &                      74.56 &                                 7.69 \\
HDR           &                        72.79 &                      82.00 &                                 9.21 \\
Long Exposure &                        73.83 &                      84.19 &                                10.35 \\
Detailed      &                        63.30 &                      75.25 &                                11.95 \\
\bottomrule
\end{tabular}