In [1]:
# remove after testing
%load_ext autoreload
%autoreload 2
In [2]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cross_validation import StratifiedShuffleSplit  # pre-0.18 scikit-learn API
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from mclearn.active import run_active_learning_expt
from mclearn.heuristics import (random_h, margin_h, entropy_h,
                                qbb_margin_h, qbb_kl_h, pool_entropy_h, pool_variance_h)
from mclearn.tools import results_exist, load_results
from mclearn.preprocessing import balanced_train_test_split
from mclearn.viz import plot_learning_curve_df, plot_final_accuracy, order_learning_curves
%matplotlib inline
sns.set_style('ticks')
sns.set_palette('Dark2', 8)
In [3]:
running_expt = False  # set to False to only load the saved results instead of rerunning the experiments
fig_dir = '../thesis/figures/'
target_col = 'class'
# SDSS photometric features: PSF and Petrosian magnitudes/colours plus the Petrosian radius
sdss_features = ['psfMag_r_w14', 'psf_u_g_w14', 'psf_g_r_w14', 'psf_r_i_w14',
                 'psf_i_z_w14', 'petroMag_r_w14', 'petro_u_g_w14', 'petro_g_r_w14',
                 'petro_r_i_w14', 'petro_i_z_w14', 'petroRad_r']
xrange = np.arange(50, 301)  # x-axis of the learning curves: 50 to 300 labelled examples
colors = {'Random': '#e6ab02',
          'Entropy': '#e7298a',
          'Margin': '#d95f02',
          'QBB Margin': '#1b9e77',
          'QBB KL': '#66a61e',
          'Pool Variance': '#7570b3',
          'Pool Entropy': '#a6761d',
          'Thompson': '#666666'}
linestyles = {'Random': '-',
              'Entropy': '-',
              'Margin': '-',
              'QBB Margin': '-',
              'QBB KL': '-',
              'Pool Variance': '-',
              'Pool Entropy': '-',
              'Thompson': '-'}
labels = ['Random', 'Entropy', 'Margin', 'QBB Margin', 'QBB KL', 'Pool Variance', 'Pool Entropy', 'Thompson']
# Accuracy upper bounds used as reference values in the plots (upper_l with logistic, upper_r with the RBF SVM)
upper_l = 0.90570974
upper_r = 0.93177572
In [4]:
if running_expt:
    # Load the SDSS data and expand the features with degree-2 polynomial terms for the logistic model
    sdss = pd.read_hdf('../data/sdss.h5', 'sdss')
    transformer = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
    X = np.array(sdss[sdss_features])
    X_poly = transformer.fit_transform(X)
    y = np.array(sdss[target_col])
In [5]:
if running_expt:
    # L1-penalised one-vs-rest logistic regression, plus a bagged committee of 11 such models for the QBB heuristics
    logistic = LogisticRegression(multi_class='ovr', penalty='l1', C=1, random_state=2, class_weight='auto')
    logistic_committee = BaggingClassifier(logistic, n_estimators=11, n_jobs=-1, max_samples=300)
    # Draw a class-balanced pool of 900,000 objects and define 10 stratified splits of 10,000 train / 10,000 test
    X_pool, _, y_pool, _ = balanced_train_test_split(X_poly, y, train_size=900000, test_size=0, random_state=11)
    kfold = StratifiedShuffleSplit(y_pool, n_iter=10, test_size=10000, train_size=10000, random_state=19)
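For reference, the pre-0.18 StratifiedShuffleSplit constructed above takes the labels directly and is itself iterable, so the experiment code can loop over it to obtain the ten stratified splits. A minimal sketch of that usage (assuming scikit-learn < 0.18; run_active_learning_expt presumably does something equivalent internally):

for train_index, test_index in kfold:
    X_train, X_test = X_pool[train_index], X_pool[test_index]
    y_train, y_test = y_pool[train_index], y_pool[test_index]
    # each of the 10 iterations yields a stratified 10,000 / 10,000 split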
In [6]:
heuristics = [random_h, entropy_h, margin_h, qbb_margin_h, qbb_kl_h, pool_variance_h, pool_entropy_h]
thompson_path = '../pickle/07_thompson_sampling/sdss_balanced_logistic_thompson.pickle'
pickle_paths = ['../pickle/06_active_learning/sdss_balanced_logistic_random.pickle',
                '../pickle/06_active_learning/sdss_balanced_logistic_entropy.pickle',
                '../pickle/06_active_learning/sdss_balanced_logistic_margin.pickle',
                '../pickle/06_active_learning/sdss_balanced_logistic_qbb_margin.pickle',
                '../pickle/06_active_learning/sdss_balanced_logistic_qbb_kl.pickle',
                '../pickle/06_active_learning/sdss_balanced_logistic_pool_variance.pickle',
                '../pickle/06_active_learning/sdss_balanced_logistic_pool_entropy.pickle']
# Run the active learning experiment only if the cached results are not already on disk
if not results_exist(pickle_paths):
    run_active_learning_expt(X_pool, y_pool, kfold, logistic, logistic_committee, heuristics, pickle_paths)
bl_lcs, bl_sels = zip(*load_results(pickle_paths))
bl_lcs = list(bl_lcs)
bl_lcs.append(load_results(thompson_path))  # append the Thompson sampling curve loaded from its own pickle
bl_df = order_learning_curves(bl_lcs, labels)
bl_df.columns
Out[6]:
In [7]:
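# Columns of bl_df come back ordered from order_learning_curves; this cell and the next
# plot two (overlapping) subsets of the curves, saved as the 'lower' and 'upper' panels,
# so that each figure stays readable.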
fig = plt.figure(figsize=(4, 4))
ax = plot_learning_curve_df(xrange, bl_df, bl_df.columns[6:], colors, linestyles, upper=upper_l, ylim=(.8, .94))
fig.savefig(fig_dir+'5_active/sdss_bl_ind_lower.pdf', bbox_inches='tight')
In [8]:
fig = plt.figure(figsize=(4, 4))
ax = plot_learning_curve_df(xrange, bl_df, bl_df.columns[:7], colors, linestyles, upper=upper_l, ylim=(.8, .94))
fig.savefig(fig_dir+'5_active/sdss_bl_ind_upper.pdf', bbox_inches='tight')
In [9]:
fig = plt.figure(figsize=(10, 2))
ax = plot_final_accuracy(bl_lcs, labels, colors, ylim=(.75, .95))
fig.savefig(fig_dir+'5_active/sdss_bl_ind_violin.pdf', bbox_inches='tight')
In [10]:
if running_expt:
    # RBF SVM (with probability estimates for the heuristics) and a bagged committee of 11 such models
    rbf = SVC(kernel='rbf', gamma=0.1, C=10, cache_size=2000, class_weight='auto', probability=True)
    rbf_committee = BaggingClassifier(rbf, n_estimators=11, n_jobs=-1, max_samples=300)
    # The SVM uses the raw (non-polynomial) features; same balanced pool and stratified splits as before
    X_pool, _, y_pool, _ = balanced_train_test_split(X, y, train_size=900000, test_size=0, random_state=11)
    kfold = StratifiedShuffleSplit(y_pool, n_iter=10, test_size=10000, train_size=10000, random_state=19)
In [11]:
heuristics = [random_h, entropy_h, margin_h, qbb_margin_h, qbb_kl_h, pool_variance_h, pool_entropy_h]
thompson_path = '../pickle/07_thompson_sampling/sdss_balanced_rbf_thompson.pickle'
pickle_paths = ['../pickle/06_active_learning/sdss_balanced_rbf_random.pickle',
                '../pickle/06_active_learning/sdss_balanced_rbf_entropy.pickle',
                '../pickle/06_active_learning/sdss_balanced_rbf_margin.pickle',
                '../pickle/06_active_learning/sdss_balanced_rbf_qbb_margin.pickle',
                '../pickle/06_active_learning/sdss_balanced_rbf_qbb_kl.pickle',
                '../pickle/06_active_learning/sdss_balanced_rbf_pool_variance.pickle',
                '../pickle/06_active_learning/sdss_balanced_rbf_pool_entropy.pickle']
if not results_exist(pickle_paths):
    run_active_learning_expt(X_pool, y_pool, kfold, rbf, rbf_committee, heuristics, pickle_paths)
br_lcs, br_sels = zip(*load_results(pickle_paths))
br_lcs = list(br_lcs)
br_lcs.append(load_results(thompson_path))
br_df = order_learning_curves(br_lcs, labels)
br_df.columns
Out[11]:
In [12]:
fig = plt.figure(figsize=(4, 4))
ax = plot_learning_curve_df(xrange, br_df, br_df.columns[5:], colors, linestyles, upper=upper_r, ylim=(.8, .94))
fig.savefig(fig_dir+'5_active/sdss_br_ind_lower.pdf', bbox_inches='tight')
In [13]:
fig = plt.figure(figsize=(4, 4))
ax = plot_learning_curve_df(xrange, br_df, br_df.columns[:6], colors, linestyles, upper=upper_r, ylim=(.8, .94))
fig.savefig(fig_dir+'5_active/sdss_br_ind_upper.pdf', bbox_inches='tight')
In [14]:
fig = plt.figure(figsize=(10, 2))
ax = plot_final_accuracy(br_lcs, labels, colors, ylim=(.75, .95))
fig.savefig(fig_dir+'5_active/sdss_br_ind_violin.pdf', bbox_inches='tight')
In [15]:
if running_expt:
    logistic = LogisticRegression(multi_class='ovr', penalty='l1', C=1, random_state=2, class_weight='auto')
    logistic_committee = BaggingClassifier(logistic, n_estimators=11, n_jobs=-1, max_samples=300)
    # Unbalanced setting: draw the stratified splits from the full dataset rather than a balanced pool
    kfold = StratifiedShuffleSplit(y, n_iter=10, test_size=10000, train_size=10000, random_state=19)
In [16]:
heuristics = [random_h, entropy_h, margin_h, qbb_margin_h, qbb_kl_h, pool_variance_h, pool_entropy_h]
thompson_path = '../pickle/07_thompson_sampling/sdss_unbalanced_logistic_thompson.pickle'
pickle_paths = ['../pickle/06_active_learning/sdss_unbalanced_logistic_random.pickle',
                '../pickle/06_active_learning/sdss_unbalanced_logistic_entropy.pickle',
                '../pickle/06_active_learning/sdss_unbalanced_logistic_margin.pickle',
                '../pickle/06_active_learning/sdss_unbalanced_logistic_qbb_margin.pickle',
                '../pickle/06_active_learning/sdss_unbalanced_logistic_qbb_kl.pickle',
                '../pickle/06_active_learning/sdss_unbalanced_logistic_pool_variance.pickle',
                '../pickle/06_active_learning/sdss_unbalanced_logistic_pool_entropy.pickle']
if not results_exist(pickle_paths):
    run_active_learning_expt(X_poly, y, kfold, logistic, logistic_committee, heuristics, pickle_paths)
ul_lcs, ul_sels = zip(*load_results(pickle_paths))
ul_lcs = list(ul_lcs)
ul_lcs.append(load_results(thompson_path))
ul_df = order_learning_curves(ul_lcs, labels)
ul_df.columns
Out[16]:
In [17]:
fig = plt.figure(figsize=(4, 4))
ax = plot_learning_curve_df(xrange, ul_df, ul_df.columns[6:], colors, linestyles, upper=upper_l, ylim=(.8, .94), loc='upper left')
fig.savefig(fig_dir+'5_active/sdss_ul_ind_lower.pdf', bbox_inches='tight')
In [18]:
fig = plt.figure(figsize=(4, 4))
ax = plot_learning_curve_df(xrange, ul_df, ul_df.columns[:7], colors, linestyles, upper=upper_l, ylim=(.8, .94), loc='upper left')
fig.savefig(fig_dir+'5_active/sdss_ul_ind_upper.pdf', bbox_inches='tight')
In [19]:
fig = plt.figure(figsize=(10, 2))
ax = plot_final_accuracy(ul_lcs, labels, colors, ylim=(.75, .95))
fig.savefig(fig_dir+'5_active/sdss_ul_ind_violin.pdf', bbox_inches='tight')
In [20]:
if running_expt:
    rbf = SVC(kernel='rbf', gamma=0.1, C=10, cache_size=2000, class_weight='auto', probability=True)
    rbf_committee = BaggingClassifier(rbf, n_estimators=11, n_jobs=-1, max_samples=300)
    kfold = StratifiedShuffleSplit(y, n_iter=10, test_size=10000, train_size=10000, random_state=19)
In [21]:
heuristics = [random_h, entropy_h, margin_h, qbb_margin_h, qbb_kl_h, pool_variance_h, pool_entropy_h]
thompson_path = '../pickle/07_thompson_sampling/sdss_unbalanced_rbf_thompson.pickle'
pickle_paths = ['../pickle/06_active_learning/sdss_unbalanced_rbf_random.pickle',
                '../pickle/06_active_learning/sdss_unbalanced_rbf_entropy.pickle',
                '../pickle/06_active_learning/sdss_unbalanced_rbf_margin.pickle',
                '../pickle/06_active_learning/sdss_unbalanced_rbf_qbb_margin.pickle',
                '../pickle/06_active_learning/sdss_unbalanced_rbf_qbb_kl.pickle',
                '../pickle/06_active_learning/sdss_unbalanced_rbf_pool_variance.pickle',
                '../pickle/06_active_learning/sdss_unbalanced_rbf_pool_entropy.pickle']
if not results_exist(pickle_paths):
    run_active_learning_expt(X, y, kfold, rbf, rbf_committee, heuristics, pickle_paths)
ur_lcs, ur_sels = zip(*load_results(pickle_paths))
ur_lcs = list(ur_lcs)
ur_lcs.append(load_results(thompson_path))
ur_df = order_learning_curves(ur_lcs, labels)
ur_df.columns
Out[21]:
In [22]:
fig = plt.figure(figsize=(4, 4))
ax = plot_learning_curve_df(xrange, ur_df, ur_df.columns[5:], colors, linestyles, upper=upper_r, ylim=(.8, .94))
fig.savefig(fig_dir+'5_active/sdss_ur_ind_lower.pdf', bbox_inches='tight')
In [23]:
fig = plt.figure(figsize=(4, 4))
ax = plot_learning_curve_df(xrange, ur_df, ur_df.columns[:6], colors, linestyles, upper=upper_r, ylim=(.8, .94), loc='upper left')
fig.savefig(fig_dir+'5_active/sdss_ur_ind_upper.pdf', bbox_inches='tight')
In [24]:
fig = plt.figure(figsize=(10, 2))
ax = plot_final_accuracy(ur_lcs, labels, colors, ylim=(.75, .95))
fig.savefig(fig_dir+'5_active/sdss_ur_ind_violin.pdf', bbox_inches='tight')