In [127]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import defaultdict
import vislab
import vislab.datasets
import vislab._results
import vislab.results
import sklearn.metrics
pd.options.display.float_format = '{:.2f}'.format
In [128]:
# Name of the prediction-results collection to analyze.
collection_name = 'flickr_mar23'
# Shared results directory (created if missing) and the HDF5 cache of
# per-style decision thresholds and accuracies computed in an earlier run.
results_dirname = vislab.util.makedirs(vislab.config['paths']['shared_data'] + '/results')
cache_filename = '{}/{}_thresholds_and_accs.h5'.format(results_dirname, collection_name)
In [129]:
# Ground-truth style labels collected from Amazon Mechanical Turk workers.
# NOTE(review): hardcoded absolute local path -- this only runs on one
# machine. Consider moving it into vislab.config['paths'] alongside
# 'shared_data' so the notebook is portable.
MTURK_RESULTS_PATH = '/Users/sergeyk/Dropbox/mturk-results/mturk-results.h5'
turk_df = pd.read_hdf(MTURK_RESULTS_PATH, 'df')

# Sanity checks: a sample of one style's columns (confidence, MTurk tag,
# Flickr label), overall shape, and the train/test split counts.
# Single-argument print(...) behaves identically under Python 2 and 3.
print(turk_df[['conf_Bright', 'tagged_Bright', 'style_Bright']].dropna().head())
print(turk_df.shape)
print(turk_df['_split'].value_counts())
In [130]:
# Load per-image style predictions for this collection.
df, preds_panel = vislab._results.load_pred_results(
    collection_name, results_dirname,
    multiclass=True, force=False)
# Keep only the 'caffe_fc6 None vw' predictor's scores, restricted to
# the test split.
preds_df = preds_panel.minor_xs('caffe_fc6 None vw').copy()
preds_df = preds_df[preds_df['split'] == 'test']
print preds_df.shape
# Previously cached per-style accuracies and decision thresholds
# (written to cache_filename by an earlier run).
accs_df = pd.read_hdf(cache_filename, 'acc_df')
threshold_df = pd.read_hdf(cache_filename, 'threshold_df')
# Thresholds for the same predictor selected above.
tdf = threshold_df['caffe_fc6 None vw'].copy()
print accs_df.head()
print accs_df.mean(0)
print threshold_df.head()
In [131]:
styles = list(set(vislab.datasets.flickr.underscored_style_names) - set(['style_Bokeh', 'style_Texture']))
accuracies = defaultdict(dict)
for style in styles:
tag_name = style.replace('style_', 'tagged_')
df_ = turk_df.dropna(subset=[tag_name])
ind = vislab.results.get_balanced_dataset_ind(df_, style)
df_ = df_.iloc[ind]
accuracies['MTurk accuracy, Flickr g.t.'][style] = sklearn.metrics.accuracy_score(
df_[style], df_[tag_name].astype(bool))
for style in styles:
accuracies['Our accuracy, Flickr g.t.'][style] = vislab.results.pred_accuracy_at_threshold(preds_df, style, tdf[style])
for style in styles:
pdf = preds_df.copy()
pdf[style] = turk_df[style.replace('style_', 'tagged_')]
pdf = pdf.dropna()
accuracies['Our accuracy, MTurk g.t.'][style] = vislab.results.pred_accuracy_at_threshold(pdf, style, tdf[style])
acc_df = pd.DataFrame(accuracies) * 100
acc_df.index = [
vislab.datasets.flickr.style_names[vislab.datasets.flickr.underscored_style_names.index(_)]
for _ in acc_df.index
]
print acc_df.mean(0)
acc_df
Out[131]:
In [132]:
# Distribution summary of the three accuracy columns.
acc_df.describe()
Out[132]:
In [133]:
# LaTeX version of the full accuracy table (for the paper).
print acc_df.to_latex()
In [134]:
# Display the styles that experience more than a 5% relative change in
# accuracy when switching from Flickr to MTurk ground truth.
# (acc_df values are already in percent; this column is relative change.)
name = '% change going from Flickr to MTurk g.t.'
acc_df[name] = 100. * (acc_df['Our accuracy, MTurk g.t.'] - acc_df['Our accuracy, Flickr g.t.']) / acc_df['Our accuracy, Flickr g.t.']
# sort_values: DataFrame.sort was deprecated in pandas 0.17 and removed
# in 0.20; sort_values(by=...) is the drop-in replacement.
acc_df = acc_df.sort_values(by=name)
columns = ['Our accuracy, Flickr g.t.', 'Our accuracy, MTurk g.t.', name]
acc_df[acc_df[name].abs() > 5][columns]
Out[134]:
In [119]:
# LaTeX version of the ground-truth-change table above.
# NOTE(review): execution count In[119] is out of order relative to the
# surrounding cells -- re-run the notebook top to bottom before sharing.
print acc_df[acc_df[name].abs() > 5][columns].to_latex()
In [135]:
# Display the styles whose accuracy differs by more than 5 percentage
# points between our classifier and the MTurk workers, both evaluated
# against Flickr ground truth. (acc_df values are in percent, so the
# threshold of 5 below means 5 percentage points.)
name = 'Accuracy diff. between us and MTurk'
acc_df[name] = acc_df['Our accuracy, Flickr g.t.'] - acc_df['MTurk accuracy, Flickr g.t.']
# sort_values: DataFrame.sort was deprecated in pandas 0.17 and removed
# in 0.20; sort_values(by=...) is the drop-in replacement.
acc_df = acc_df.sort_values(by=name)
columns = ['MTurk accuracy, Flickr g.t.', 'Our accuracy, Flickr g.t.', name]
acc_df[acc_df[name].abs() > 5][columns]
Out[135]:
In [136]:
# LaTeX version of the us-vs-MTurk difference table above.
print acc_df[acc_df[name].abs() > 5][columns].to_latex()