In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline
import pandas as pd
import scipy
In [2]:
import sklearn.metrics
import mir_eval
In [3]:
# Python 2: use the C-accelerated pickle implementation
import cPickle as pickle
In [4]:
from glob import glob
import re
In [28]:
pd.set_option('precision', 4)
pd.set_option('max_rows', 2000)
In [6]:
np.set_printoptions(precision=3)
seaborn.set(style='darkgrid')
In [7]:
def plot_curve(file='', intervals=None, labels=None, scores=None, norm=None, min_score=0.0, **kwargs):
    file_name = file

    # Pairwise label agreement: True where segments i and j share a label
    label_agreement = np.zeros((len(labels), len(labels)), dtype=bool)

    for i in range(len(labels)):
        for j in range(i, len(labels)):
            label_agreement[i, j] = (labels[i] == labels[j])
            label_agreement[j, i] = label_agreement[i, j]

    # Duration-based normalization of the score matrix
    time_norm = 1
    durations = np.diff(intervals, axis=1).ravel()

    if norm == 'min':
        time_norm = np.minimum.outer(durations, durations)
    elif norm == 'max':
        time_norm = np.maximum.outer(durations, durations)
    elif norm == 'hmean':
        # Harmonic mean: 2 * d_i * d_j / (d_i + d_j)
        time_norm = 2. / np.add.outer(durations, durations)
        time_norm *= np.multiply.outer(durations, durations)

    # TODO: have the label agreement index out nan-valued scores
    scores = scores / time_norm

    # Keep only the strict upper triangle, and drop non-finite scores
    label_agreement[np.tril_indices_from(label_agreement, k=0)] = False
    label_agreement[~np.isfinite(scores)] = False

    label_disagreement = ~label_agreement
    label_disagreement[np.tril_indices_from(label_disagreement, k=0)] = False
    label_disagreement[~np.isfinite(scores)] = False

    tp_scores = scores[label_agreement]
    fp_scores = scores[label_disagreement]

    num_pos = np.sum(label_agreement)
    num_neg = np.sum(label_disagreement)

    # Positive class = same-label pairs, consistent with num_pos above
    y_true = np.concatenate([np.ones(len(tp_scores)), np.zeros(len(fp_scores))])
    y_score = np.concatenate([tp_scores, fp_scores])

    fpr, tpr, thr = sklearn.metrics.roc_curve(y_true, y_score)

    tp = num_pos * tpr
    fp = num_neg * fpr

    # Define precision as 0 at the extreme threshold where tp + fp == 0
    with np.errstate(invalid='ignore'):
        precision = np.nan_to_num(tp / (tp + fp))
    recall = tpr

    fmeasure = np.asarray([mir_eval.util.f_measure(p, r) for p, r in zip(precision, recall)])

    k = np.argmax(fmeasure)

    plt.figure(figsize=(12, 4))

    plt.subplot(1, 3, 1)
    plt.plot([0, 1], [0, 1], linestyle='--', alpha=0.5)
    plt.plot(fpr, tpr)
    plt.plot(fpr[k], tpr[k], color='r', marker='*', markersize=10, alpha=0.5)
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title(file_name)

    plt.subplot(1, 3, 2)
    plt.plot(recall, precision)
    plt.plot(recall[k], precision[k], marker='*', markersize=10, alpha=0.5, color='r')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('norm={}'.format(norm))

    plt.subplot(1, 3, 3)
    plt.plot(thr, fmeasure)
    plt.plot(thr[k], fmeasure[k], marker='*', markersize=10, alpha=0.5, color='r')
    plt.xlabel(r'$\theta$')
    plt.ylabel('$F_1$')
    plt.title(r'({:.3f}, {:.3f})'.format(thr[k], fmeasure[k]))

    plt.tight_layout()

    return thr[k], fmeasure[k]
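As a quick sanity check of the thresholding logic, `plot_curve` can be exercised on a tiny synthetic input (hypothetical data, not from the experiments); it only assumes the `file`, `intervals`, `labels`, and `scores` keys used above, with scores treated as similarities:
In [ ]:
# Synthetic example: three segments where the first and last share a label
toy_intervals = np.array([[0.0, 10.0], [10.0, 25.0], [25.0, 40.0]])
toy_labels = ['A', 'B', 'A']
rng = np.random.RandomState(0)
toy_scores = rng.rand(3, 3)
toy_scores = (toy_scores + toy_scores.T) / 2  # symmetrize
print(plot_curve(file='toy', intervals=toy_intervals, labels=toy_labels,
                 scores=toy_scores, norm='min'))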
In [15]:
def summarize_results(files):
    data = {}

    for fname in files:
        results = pickle.load(open(fname, 'rb'))

        fscores = pd.DataFrame.from_dict([f['fmeasures'] for f in results['file_scores']
                                          if f['fmeasures'] is not None])

        # Parse dataset, scale, and metric out of the file name
        match = re.match(r'.*scores_datasetE(?P<dataset>.*?)E(?P<scale>.*?)_distE(?P<metric>.*)\.pk', fname)

        fscores['dataset'] = pd.Series(match.group('dataset'), index=fscores.index)
        fscores['scale'] = pd.Series(match.group('dataset') + '_' + match.group('scale'), index=fscores.index)
        fscores['metric'] = pd.Series(match.group('metric'), index=fscores.index)

        data[fname] = fscores

    return data
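For reference, the regular expression above assumes result files are named `scores_datasetE<dataset>E<scale>_distE<metric>.pk`; the parse can be checked against the SALAMI file loaded further down:
In [ ]:
# Verify the file-name parsing on one of the actual result files
m = re.match(r'.*scores_datasetE(?P<dataset>.*?)E(?P<scale>.*?)_distE(?P<metric>.*)\.pk',
             '../data/scores_datasetESALAMI_levelEsmall_scale_distEcorrelation.pk')
print(m.groupdict())
# e.g. {'dataset': 'SALAMI_level', 'scale': 'small_scale', 'metric': 'correlation'}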
In [16]:
output = summarize_results(sorted(glob('../data/*.pk')))
In [17]:
all_results = pd.concat(output.values())
In [31]:
# Keep a copy of the unfiltered results
all_results_ = all_results
In [32]:
# Restrict attention to tracks whose unnormalized ('none') f-measure exceeds 0.5
all_results = all_results_[all_results_['none'] > 0.5]
In [33]:
all_results.groupby(['scale', 'metric']).hist(layout=(1,4), figsize=(12,2))
Out[33]:
In [34]:
print(all_results.groupby(['scale', 'metric']).mean())
In [35]:
all_results.groupby(['scale', 'metric']).describe()
Out[35]:
In [14]:
# Load one results file and pull its contents (e.g., file_scores) into the notebook namespace
vars().update(pickle.load(open('../data/scores_datasetESALAMI_levelEsmall_scale_distEcorrelation.pk', 'rb')))
In [15]:
fscores = pd.DataFrame.from_dict([f['fmeasures'] for f in file_scores
                                  if f['fmeasures'] is not None])
In [16]:
fscores.describe()
Out[16]:
In [17]:
# Compare all four duration normalizations on a single track
k = 382
for norm in [None, 'min', 'max', 'hmean']:
    print(plot_curve(norm=norm, **file_scores[k]))
In [19]:
# Heatmap of the raw pairwise score matrix for the same track
plt.figure(figsize=(12, 5))
seaborn.heatmap(file_scores[k]['scores'],
                yticklabels=file_scores[k]['labels'],
                xticklabels=file_scores[k]['labels'])
plt.tight_layout()