In [1]:
from sklearn.datasets import fetch_20newsgroups_vectorized
In [3]:
# Fetch the train and test splits of the vectorized 20 newsgroups data with
# headers, footers and quotes removed, using data/news20 as the data home.
news20_trn, news20_tst = (
    fetch_20newsgroups_vectorized(
        subset=split,
        remove=('headers', 'footers', 'quotes'),
        data_home="data/news20",
    )
    for split in ('train', 'test')
)
In [5]:
import pickle
In [6]:
# Store both splits back to back in one pickle file. The train split is written
# first, so readers must load the two objects in the same order.
with open('data/news20/news20.pkl', 'wb') as pkl_out:
    pickle.dump(news20_trn, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(news20_tst, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)
In [98]:
# Reload the two splits in the order they were dumped (train, then test).
with open('data/news20/news20.pkl', 'rb') as pkl_in:
    news20_trn = pickle.load(pkl_in)
    news20_tst = pickle.load(pkl_in)

# Feature matrices and label vectors used by every later cell.
X_trn = news20_trn.data
y_trn = news20_trn.target
X_tst = news20_tst.data
y_tst = news20_tst.target
In [2]:
# Class names, number of classes, and the number of test documents per class.
target_names = news20_tst.target_names
target_num = len(target_names)
target_supports = [(y_tst == label).sum() for label in range(target_num)]
In [3]:
# One tab-separated row per class: index, class name, test-set support.
for idx, (name, support) in enumerate(zip(target_names, target_supports)):
    print('{}\t{}\t{}'.format(idx, name, support))
In [4]:
from sklearn import naive_bayes as nb
from sklearn import svm

# The four classifiers under comparison; later cells address them by position.
clfs = [
    nb.BernoulliNB(alpha=1e-14),
    nb.MultinomialNB(alpha=1e-3),
    svm.LinearSVC(C=4, penalty='l1', dual=False),
    svm.LinearSVC(C=2, penalty='l2', dual=False),
]
In [5]:
# Short labels for the classifiers in `clfs`, in the same order.
# Descriptive alternative: ['NB_Bern', 'NB_Mult', 'SVM_L1', 'SVM_L2']
clf_names = list('ABCD')
In [6]:
# Train every classifier on the training split and keep its test-set
# predictions, in the same order as `clfs`.
clf_y_hat = []
for model in clfs:
    model.fit(X_trn, y_trn)
    clf_y_hat.append(model.predict(X_tst))
In [7]:
from sklearn import metrics

# For each classifier: the per-class F1 vector (average=None) and the
# (micro-averaged, macro-averaged) F1 pair.
clf_target_scores = []
clf_target_f1 = []
for j in range(len(clfs)):
    predicted = clf_y_hat[j]
    clf_target_scores.append(metrics.f1_score(y_tst, predicted, average=None))
    clf_target_f1.append(tuple(
        metrics.f1_score(y_tst, predicted, average=mode)
        for mode in ('micro', 'macro')
    ))
# For a full per-class report use:
# metrics.classification_report(y_tst, predicted, target_names=target_names, digits=3)
In [8]:
# Tab-separated score table: one row per class (per-class F1 of each
# classifier), followed by the micro-F1 and macro-F1 summary rows.
for i in range(target_num):
    cells = ['\t{:.3f}'.format(clf_target_scores[j][i]) for j in range(len(clfs))]
    print(str(i) + ''.join(cells))
for row_label, slot in (('miF1', 0), ('maF1', 1)):
    cells = ['\t{:.3f}'.format(clf_target_f1[j][slot]) for j in range(len(clfs))]
    print(row_label + ''.join(cells))
In [9]:
import numpy as np
import pymc3 as pm
In [10]:
from importlib import reload
In [78]:
import bperf_model as bpm
reload(bpm)
Out[78]:
In [12]:
def compare_classifiers(y_hat_a, y_hat_b, model_type=3):
    """Build and sample one paired comparison model per target class.

    For every class index ``i`` in ``range(target_num)`` the multi-class labels
    are reduced to one-vs-rest boolean vectors (``y_tst == i``, ``y_hat_a == i``,
    ``y_hat_b == i``). These are handed to ``bpm.build_model_paired`` and the
    resulting model is sampled with ``bpm.learn_model``.

    Parameters
    ----------
    y_hat_a, y_hat_b
        Predicted labels of the two classifiers on the test split. They are
        compared element-wise against an integer, so they are assumed to be
        NumPy arrays aligned with the module-level ``y_tst``.
    model_type : int, default 3
        Forwarded unchanged as the fourth argument of
        ``bpm.build_model_paired``. This notebook uses 3 here and 0 for the
        prior traces; what each value means is defined in ``bperf_model``.

    Returns
    -------
    list
        One trace (whatever ``bpm.learn_model`` returns) per target class,
        ordered by class index.
    """
    target_traces = []
    for i in range(target_num):
        # Progress indicator: class indices are printed on one line, unseparated.
        print(i, end="")
        y_true = np.array(y_tst == i)
        y_pred_a = np.array(y_hat_a == i)
        y_pred_b = np.array(y_hat_b == i)
        model = bpm.build_model_paired(y_true, y_pred_a, y_pred_b, model_type)
        target_traces.append(bpm.learn_model(model))
    print()
    return target_traces
In [13]:
# Compare classifier A (clfs[0], BernoulliNB) against classifier B
# (clfs[1], MultinomialNB); the result holds one trace per target class.
comparison_AB = compare_classifiers(clf_y_hat[0], clf_y_hat[1])
In [14]:
# Compare classifier C (clfs[2], LinearSVC with L1 penalty) against classifier D
# (clfs[3], LinearSVC with L2 penalty); the result holds one trace per target class.
comparison_CD = compare_classifiers(clf_y_hat[2], clf_y_hat[3])
In [15]:
# Compare classifier B (clfs[1], MultinomialNB) against classifier D
# (clfs[3], LinearSVC with L2 penalty); the result holds one trace per target class.
comparison_BD = compare_classifiers(clf_y_hat[1], clf_y_hat[3])
In [16]:
import bperf_stats as bps
reload(bps)
Out[16]:
In [17]:
def show_comparison_results(target_traces):
    """Print one line per target class with the analysis of its 'delta' trace.

    Each line is the class index, a tab, and whatever
    ``bps.post_analysis(trace, 'delta', cmp_val=0, rope=[-0.05, +0.05])``
    returns for that class's trace.

    Parameters
    ----------
    target_traces : sequence
        One trace per target class, ordered by class index.
    """
    for idx, trace in enumerate(target_traces):
        # The index is printed first so that anything post_analysis prints
        # itself ends up on the same line.
        print("{:d}\t".format(idx), end="")
        summary = bps.post_analysis(trace, 'delta', cmp_val=0, rope=[-0.05, +0.05])
        print(summary)
In [18]:
# Per-class bps.post_analysis output for 'delta', classifier A vs. B.
show_comparison_results(comparison_AB)
In [19]:
# Per-class bps.post_analysis output for 'delta', classifier C vs. D.
show_comparison_results(comparison_CD)
In [20]:
# Per-class bps.post_analysis output for 'delta', classifier B vs. D.
show_comparison_results(comparison_BD)
In [27]:
# NOTE: plot_results is defined in a later cell (In [93]); on a fresh kernel this
# cell raises NameError unless that cell has been run first. The same call is
# repeated after the definition (In [97]).
plot_results(comparison_BD, [10], val_lim=[-0.25,0.05], freq_lim=[0,25])
In [82]:
def compute_prior_traces(y_hat_a, y_hat_b):
    """Sample the variant-0 paired model once per target class.

    Mirrors the per-class one-vs-rest reduction (``y_tst == i``,
    ``y_hat_a == i``, ``y_hat_b == i``) but always passes ``0`` as the fourth
    argument of ``bpm.build_model_paired``. Judging by how the result is used
    in this notebook, variant 0 serves as the prior reference; its exact
    meaning is defined in ``bperf_model``.

    Parameters
    ----------
    y_hat_a, y_hat_b
        Predicted labels of the two classifiers on the test split; assumed to
        be NumPy arrays aligned with the module-level ``y_tst``.

    Returns
    -------
    list
        One trace (whatever ``bpm.learn_model`` returns) per target class.
    """
    traces = []
    for label in range(target_num):
        # Progress indicator: class indices on one line, unseparated.
        print(label, end="")
        truth = np.array(y_tst == label)
        hits_a = np.array(y_hat_a == label)
        hits_b = np.array(y_hat_b == label)
        traces.append(bpm.learn_model(bpm.build_model_paired(truth, hits_a, hits_b, 0)))
    print()
    return traces
In [83]:
# Variant-0 traces for classifiers B (clfs[1]) and D (clfs[3]). This module-level
# list is read by show_bfsd_results and plot_results as the prior reference.
comparison_00 = compute_prior_traces(clf_y_hat[1], clf_y_hat[3])
In [84]:
def show_bfsd_results(target_traces):
    """Print, per target class, the value of ``bps.bfsd`` for 'delta'.

    For class ``i`` the 'delta' samples of the module-level ``comparison_00[i]``
    (prior) and of ``target_traces[i]`` (posterior) are passed to
    ``bps.bfsd(prior, posterior, cmp_val=0)``. Each output line is the class
    index, the variable name and the result with three decimals, separated by
    tabs.

    NOTE(review): ``comparison_00`` is built from the B/D predictions only;
    confirm the variant-0 model does not depend on the predictions before
    using this function for other classifier pairs.

    Parameters
    ----------
    target_traces : sequence
        One posterior trace per target class, ordered by class index.
    """
    name = 'delta'
    for idx, posterior in enumerate(target_traces):
        prior_draws = comparison_00[idx][name]
        posterior_draws = posterior[name]
        print("{:d}\t".format(idx), end="")
        ratio = bps.bfsd(prior_draws, posterior_draws, cmp_val=0)
        print("{:s}\t{:.3f}".format(name, ratio))
In [85]:
# bps.bfsd values for 'delta', classifier A vs. B.
# NOTE(review): the prior traces (comparison_00) were computed from the B/D
# predictions; confirm they are valid as the prior for this pair.
show_bfsd_results(comparison_AB)
In [87]:
# bps.bfsd values for 'delta', classifier C vs. D.
# NOTE(review): the prior traces (comparison_00) were computed from the B/D
# predictions; confirm they are valid as the prior for this pair.
show_bfsd_results(comparison_CD)
In [88]:
# bps.bfsd values for 'delta', classifier B vs. D (the pair comparison_00 was built from).
show_bfsd_results(comparison_BD)
In [89]:
import bperf_plot as bpp
reload(bpp)
Out[89]:
In [90]:
%matplotlib inline
In [93]:
import matplotlib.pyplot as plt
def plot_results(target_traces, targets, burn=0, cmp_val=0, rope=[-0.05,0.05], val_lim=[-0.15,0.15], freq_lim=[0,30], bins=50):
    """Draw three figures (trace, posterior, prior-vs-posterior) per selected class.

    For every class index in ``targets`` the 'delta' samples of
    ``target_traces[i]`` (after dropping the first ``burn`` draws) are passed
    to ``bpp.plot_trace``, ``bpp.plot_post`` and ``bpp.plot_bfsd``, each in a
    new matplotlib figure. The prior samples for ``plot_bfsd`` come from the
    module-level ``comparison_00[i]``, sliced with the same ``burn``.

    Parameters
    ----------
    target_traces : sequence
        One trace per target class; each must support ``trace[burn:]['delta']``.
    targets : iterable of int
        Class indices to plot.
    burn : int, default 0
        Number of leading draws to discard from both prior and posterior traces.
    cmp_val : default 0
        Forwarded as ``cmp_val`` to all three plotting helpers.
    rope : default [-0.05, 0.05]
        Forwarded as ``rope`` to ``bpp.plot_post``.
    val_lim : default [-0.15, 0.15]
        Axis limits for delta (``ylim`` of the trace plot, ``xlim`` of the others).
    freq_lim : default [0, 30]
        Forwarded as ``ylim`` to ``bpp.plot_post`` and ``bpp.plot_bfsd``.
    bins : int, default 50
        Forwarded as ``bins`` to ``bpp.plot_post``.

    Returns
    -------
    None

    Notes
    -----
    The list defaults are only read and forwarded here, never mutated.
    """
    # Raw string: the LaTeX backslash must not be parsed as a Python escape
    # (a non-raw '\d' triggers an invalid-escape warning on modern Python).
    delta_label = r'$\delta$'
    for i in targets:
        sample = target_traces[i][burn:]['delta']

        # Sampling trace of delta.
        plt.figure()
        bpp.plot_trace(sample, cmp_val=cmp_val, ylab=delta_label, ylim=val_lim)

        # Posterior of delta with the comparison value and ROPE.
        plt.figure()
        bpp.plot_post(sample, cmp_val=cmp_val, rope=rope, xlab=delta_label,
                      xlim=val_lim, ylim=freq_lim, bins=bins)

        # Prior vs. posterior of delta at the comparison value.
        plt.figure()
        prior_sample = comparison_00[i][burn:]['delta']
        bpp.plot_bfsd(prior_sample, sample, cmp_val=cmp_val, xlab=delta_label,
                      xlim=val_lim, ylim=freq_lim)
        # To export a figure, call plt.savefig(<path>) right after the
        # corresponding bpp.plot_* call.
In [94]:
# Trace, posterior and prior-vs-posterior figures for class index 7 (B vs. D).
plot_results(comparison_BD, [7], val_lim=[-0.15,0.15], freq_lim=[0,25])
In [95]:
# Trace, posterior and prior-vs-posterior figures for class index 8 (B vs. D).
plot_results(comparison_BD, [8], val_lim=[-0.15,0.15], freq_lim=[0,25])
In [96]:
# Class index 9 (B vs. D); the delta axis is shifted to [-0.05, 0.25].
plot_results(comparison_BD, [9], val_lim=[-0.05,0.25], freq_lim=[0,25])
In [97]:
# Class index 10 (B vs. D); the delta axis is shifted to [-0.25, 0.05].
plot_results(comparison_BD, [10], val_lim=[-0.25,0.05], freq_lim=[0,25])
In [99]:
import timeit
# Time one full B-vs-D comparison and report the mean number of seconds per
# target class. compare_classifiers fits one model per class in
# range(target_num), so the divisor is target_num rather than a hard-coded 20.
# Note that this re-runs the sampling and overwrites comparison_BD.
start_time = timeit.default_timer()
comparison_BD = compare_classifiers(clf_y_hat[1], clf_y_hat[3])
elapsed = timeit.default_timer() - start_time
print(elapsed / target_num)
In [ ]: