In [1]:
%load_ext autoreload
%autoreload 2
import os
import sklearn.metrics
import pandas as pd
import vislab.datasets
import vislab.results
import vislab._results
In [2]:
# Load the 80k-pin Pinterest dataset; label_df serves as ground truth for
# every evaluation below. Display its columns as a sanity check.
label_df = vislab.datasets.pinterest.get_pins_80k_df()
label_df.columns
Out[2]:
In [3]:
# Inspect the prediction-run records for this dataset in MongoDB, after
# clearing out any leftover runs that used the 'noise' feature.
collection = vislab.util.get_mongodb_client()['predict']['pinterest_80k_mar23']
if collection.find({'features': 'noise'}).count() > 0:
    collection.remove({'features': 'noise'})
pd.DataFrame(list(collection.find()))
Out[3]:
In [241]:
# Load the Pinterest-trained multiclass prediction results (force a fresh
# load) and slice out the Caffe FC6 predictions, attaching image URLs so
# the frame can be rendered as an image grid later.
results_dirname = vislab.util.makedirs(vislab.config['paths']['shared_data'] + '/results_mar23')
df, preds_panel = vislab._results.load_pred_results(
    'pinterest_80k_mar23', results_dirname,
    multiclass=True, force=True)
pred_prefix = 'pred'
# minor_axis holds the "feature setting classifier" identifiers.
print preds_panel.minor_axis
pred_df = preds_panel.minor_xs('caffe_fc6 None vw')
pred_df['image_url'] = label_df['image_url']
In [242]:
# Human-readable display names for the raw "feature setting classifier"
# identifiers; the 'None' and 'False' variants map to the same name.
nice_feat_names = {
    'caffe_fc6 None vw': 'Caffe FC6',
    'caffe_fc6 False vw': 'Caffe FC6',
    'caffe_fc7 None vw': 'Caffe FC7',
    'caffe_fc7 False vw': 'Caffe FC7',
    'mc_bit None vw': 'MC binary',
    'random': 'Random'
}
# Balanced multiclass evaluation of every feature in the panel plus a
# random baseline.
mc_metrics = vislab.results.multiclass_metrics_feat_comparison(
    preds_panel, label_df, pred_prefix,
    features=preds_panel.minor_axis.tolist() + ['random'],
    balanced=True, with_plot=False, with_print=False,
    nice_feat_names=nice_feat_names)
# Copy so that the comparison columns added later don't mutate mc_metrics.
ap_df = mc_metrics['ap_df'].copy()
In [4]:
# Load predictions from the Flickr-trained model applied to the same
# Pinterest data (cached results are acceptable here, hence force=False).
results_dirname = vislab.util.makedirs(vislab.config['paths']['shared_data'] + '/results_mar23')
df, flickr_preds_panel = vislab._results.load_pred_results(
    'flickr_on_pinterest_80k_mar23', results_dirname,
    multiclass=True, force=False)
print flickr_preds_panel.minor_axis
In [12]:
# Sanity check: (items, major_axis, minor_axis) dimensions of the panel.
flickr_preds_panel.shape
Out[12]:
In [5]:
# The Flickr-trained panel was scored on Pinterest images, so its slices
# lack the Pinterest ground-truth columns. Graft every label_df column
# onto each per-feature slice, then rebuild the panel in the original
# (items x major x minor) orientation.
labeled_slices = {}
for feat in flickr_preds_panel.minor_axis:
    slice_df = flickr_preds_panel.minor_xs(feat)
    for col in label_df.columns:
        slice_df[col] = label_df[col]
    labeled_slices[feat] = slice_df
flickr_preds_panel = pd.Panel(labeled_slices).swapaxes('minor', 'items')
In [246]:
# Same balanced multiclass evaluation as for the Pinterest-trained model,
# now applied to the Flickr-trained predictions on Pinterest data.
flickr_on_pinterest_mc_metrics = vislab.results.multiclass_metrics_feat_comparison(
    flickr_preds_panel, label_df, pred_prefix, features=flickr_preds_panel.minor_axis.tolist() + ['random'],
    balanced=True, with_plot=False, with_print=False, nice_feat_names=nice_feat_names)
In [14]:
# Slice out the Flickr-trained Caffe FC6 predictions (note the 'False'
# setting here, vs 'None' for the Pinterest-trained panel) and attach
# image URLs for rendering. Shape shown as a sanity check.
flickr_pred_df = flickr_preds_panel.minor_xs('caffe_fc6 False vw')
flickr_pred_df['image_url'] = label_df['image_url']
flickr_pred_df.shape
Out[14]:
In [248]:
# Compare per-style AP of the Pinterest-trained vs Flickr-trained FC6
# models. Note the assignment order: 'Difference %' depends on the
# 'Difference' column created just above it.
ap_df['Caffe FC6, Flickr-trained'] = flickr_on_pinterest_mc_metrics['ap_df']['Caffe FC6']
ap_df['Difference'] = ap_df['Caffe FC6'] - ap_df['Caffe FC6, Flickr-trained']
# Relative AP drop, and relative AP retention, when training on Flickr.
ap_df['Difference %'] = ap_df['Difference'] / ap_df['Caffe FC6']
ap_df['%'] = ap_df['Caffe FC6, Flickr-trained'] / ap_df['Caffe FC6']
In [249]:
# Bug fix: np.percentile is used below, but numpy was never imported in
# this notebook (the import cell only brings in os/sklearn/pandas/vislab),
# so this cell raised NameError on a fresh kernel.
import numpy as np

# Rank styles by how well the Flickr-trained AP holds up on Pinterest
# ('%' = Flickr-trained AP / Pinterest-trained AP), then report the top
# and bottom quartiles.
diff = ap_df.sort('%')['%']
diff_top = diff[diff >= np.percentile(diff, 75)]
print('Styles that are most transferrable from Flickr to Pinterest (mean Flickr-trained AP: {:.1f}% of Pinterest-trained AP): {}.'.format(
    diff_top.values.mean() * 100,
    # _[6:] strips a fixed 6-character prefix from the index labels --
    # presumably a 'pred'-style prefix; confirm against ap_df.index.
    ', '.join([_[6:] for _ in sorted(diff_top.index.tolist())]),
))
diff_bottom = diff[diff <= np.percentile(diff, 25)]
print('Styles that are least transferrable from Flickr to Pinterest (mean {:.1f}% of Pinterest-trained AP): {}.'.format(
    diff_bottom.values.mean() * 100,
    ', '.join([_[6:] for _ in sorted(diff_bottom.index.tolist())]),
))
In [8]:
# A few pins have images that render badly in the HTML grids below; drop
# them from the Flickr-trained frame. (The equivalent drop for pred_df
# was left disabled in the original analysis.)
bad_pin_ids = ['389913280210700272', '288230444872088394', '287034176223604657']
flickr_pred_df = flickr_pred_df.drop(bad_pin_ids)
In [9]:
# Attach caption text, needed by the caption-regexp queries below.
# (The matching line for pred_df was left disabled.)
# pred_df['caption'] = label_df['caption']
flickr_pred_df['caption'] = label_df['caption']
In [43]:
from IPython.display import HTML
def top_k_images(df, k=10):
    """Render the first `k` rows of `df` as an HTML grid of inline images.

    Each cell shows the row's 'image_url' as an <img>, captioned with the
    row's index value. Returns the cells joined by single spaces.
    """
    cells = []
    for _, row in df[:k].iterrows():
        cells.append(
            '<div style="display: inline-block;"><img src="{}" width="210px" /><br />{}</div>'.format(
                row['image_url'], row.name))
    return ' '.join(cells)
import subprocess
import shlex
import re
def top_images_for_caption_and_style(df, caption_regexp, style, split=None):
    """Return an HTML section with the top-5 `style`-scored images whose
    caption matches `caption_regexp`; as a side effect, download those
    images and write resized crops into the writeup figures directory.

    Args:
        df: DataFrame with 'caption', 'image_url', a score column named
            `style`, and -- when `split` is given -- a 'split' column.
        caption_regexp: pattern applied with re.search to every caption;
            '' therefore matches all rows.
        style: score column to rank by, descending (e.g. 'pred_Bright').
        split: optional split value to filter on before ranking.

    Returns:
        str: an <h4> header followed by the top-5 image grid HTML.
    """
    r = re.compile(caption_regexp)
    # Rows whose caption contains a match anywhere (search, not match).
    ix = df.index[[r.search(_) is not None for _ in df['caption']]]
    df_ = df.loc[ix]
    if split is not None:
        df_ = df_[df_['split'] == split]
    title = '<h4>{}, query: {}, results: {}</h4>'.format(
        style, caption_regexp, df_.shape[0])
    df_ = df_.sort(style, ascending=False)  # old pandas API (now sort_values)
    # download and resize to folder
    d = os.path.expanduser('~/work/aphrodite-writeup/figures/flickr_on_pinterest/')
    dirname = vislab.util.makedirs(d + '{}/{}/'.format(caption_regexp, style))
    # w_dirname/h_dirname exist only to ensure the w/ and h/ output
    # directories exist before the convert commands write into them.
    w_dirname = vislab.util.makedirs(d + '{}/{}/w/'.format(caption_regexp, style))
    h_dirname = vislab.util.makedirs(d + '{}/{}/h/'.format(caption_regexp, style))
    counter = 0
    for i, row in df_[:5].iterrows():
        # Files are named 0.jpg..4.jpg in rank order.
        cmd = 'wget {} -O {}.jpg'.format(row['image_url'], counter)
        subprocess.call(shlex.split(cmd), cwd=dirname)
        # NOTE(review): the '{}' below are find's -exec placeholders, not
        # str.format slots, and each pass re-converts every jpg downloaded
        # so far. '-depth 1' is BSD find syntax (GNU find: '-maxdepth 1');
        # confirm this runs on the intended platform.
        cmd = 'find . -name "*.jpg" -depth 1 -exec convert {} -resize x310 -gravity Center -crop 192x310+0+0 -density 300 -units PixelsPerInch h/{} \;'
        subprocess.call(shlex.split(cmd), cwd=dirname)
        cmd = 'find . -name "*.jpg" -depth 1 -exec convert {} -resize 500 -gravity Center -crop 500x310+0+0 -density 300 -units PixelsPerInch w/{} \;'
        subprocess.call(shlex.split(cmd), cwd=dirname)
        counter += 1
    return title + top_k_images(df_, k=5)
In [38]:
# Flickr-trained scores on (the first 100 rows of) the Pinterest data;
# only the first three styles, to keep the image downloads short.
HTML('<h2>Flickr-learned style on all Pinterest data</h2>' + ' '.join([
    top_images_for_caption_and_style(flickr_pred_df.iloc[:100], '', 'pred_' + style)
    for style in vislab.datasets.flickr.underscored_style_names[:3]
]))
Out[38]:
In [260]:
# Pinterest-trained scores on all Pinterest data, one section per style.
HTML('<h2>Pinterest-learned style on all Pinterest data</h2>' + ' '.join([
    top_images_for_caption_and_style(pred_df, '', 'pred_' + style)
    for style in vislab.datasets.flickr.underscored_style_names
]))
Out[260]:
In [42]:
# Flickr-trained scores, restricted to pins whose caption matches 'dress'.
HTML('<h2>Flickr-learned style on Pinterest test data that matches caption query: "dress"</h2>' + ' '.join([
    top_images_for_caption_and_style(flickr_pred_df, 'dress', 'pred_' + style)
    for style in vislab.datasets.flickr.underscored_style_names
]))
Out[42]:
In [262]:
# Pinterest-trained counterpart of the 'dress' comparison above.
HTML('<h2>Pinterest-learned style on Pinterest test data that matches caption query: "dress"</h2>' + ' '.join([
    top_images_for_caption_and_style(pred_df, 'dress', 'pred_' + style)
    for style in vislab.datasets.flickr.underscored_style_names
]))
Out[262]:
In [44]:
# Flickr-trained scores, restricted to pins whose caption matches 'flower'.
HTML('<h2>Flickr-learned style on Pinterest test data that matches caption query: "flower"</h2>' + ' '.join([
    top_images_for_caption_and_style(flickr_pred_df, 'flower', 'pred_' + style)
    for style in vislab.datasets.flickr.underscored_style_names
]))
Out[44]:
In [264]:
# Pinterest-trained counterpart of the 'flower' comparison above.
HTML('<h2>Pinterest-learned style on Pinterest test data that matches caption query: "flower"</h2>' + ' '.join([
    top_images_for_caption_and_style(pred_df, 'flower', 'pred_' + style)
    for style in vislab.datasets.flickr.underscored_style_names
]))
Out[264]: