In [1]:
# Development aid: autoreload mode 2 picks up edits to imported modules live.
# remove after testing
%load_ext autoreload
%autoreload 2
In [2]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.cross_validation import KFold, ShuffleSplit, StratifiedShuffleSplit
from sklearn.preprocessing import PolynomialFeatures
from mclearn.active import ActiveLearner
from mclearn.classifier import grid_search_logistic
from mclearn.heuristics import (random_h, margin_h, entropy_h,
qbb_margin_h, qbb_kl_h,
pool_entropy_h, pool_variance_h)
from mclearn.preprocessing import balanced_train_test_split
from mclearn.tools import results_exist, load_results
from mclearn.viz import plot_average_learning_curve, plot_scatter_with_classes
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'ticks' style to the figures drawn below.
sns.set_style('ticks')
In [3]:
# Read the 'vstatlas' key from the HDF5 store into `vstatlas`.
vstatlas = pd.read_hdf('../data/vstatlas.h5', 'vstatlas')
# Figure directory (not referenced by any other cell visible in this notebook).
fig_dir = '../thesis/figures/'
# Label column and feature columns; the In [11] cell uses them to build y and X.
target_col = 'class'
feature_cols = ['rmagC', 'umg', 'gmr', 'rmi', 'imz', 'rmw1', 'w1m2']
In [31]:
# Ten stratified shuffle splits (70% train / 30% test, fixed seed) over the label vector `y`.
# NOTE(review): `y` is only assigned in the In [11] cell further down, which also
# rebinds `kfold` (over `y_pool`); this cell therefore fails on a fresh
# top-to-bottom run -- confirm the intended cell order.
kfold = StratifiedShuffleSplit(y, n_iter=10, test_size=0.3, train_size=0.7, random_state=19)
In [37]:
# NOTE(review): leftover IPython help lookup (`?`); it plays no part in the analysis.
pd.Series.plot?
In [40]:
# One-vs-rest logistic regression with an L1 penalty, C=100 and a fixed seed.
# NOTE(review): class_weight='auto' was deprecated in later scikit-learn releases
# in favour of 'balanced' (which computes the weights differently) -- confirm
# against the installed version before changing it.
logistic = LogisticRegression(multi_class='ovr', penalty='l1', C=100, random_state=2, class_weight='auto')
In [49]:
from sklearn import metrics
In [ ]:
In [51]:
# For each split in `kfold`, paired with the matching entry of
# balanced_logistic_sels[0]: take the selected subset of the training fold,
# print its class counts, refit `logistic` on it, and print the confusion
# matrix of its predictions on the test fold.
for (train, test), sel in zip(kfold, balanced_logistic_sels[0]):
    X_chosen = X[train][sel]
    y_chosen = y[train][sel]
    print(pd.Series(y_chosen).value_counts())
    logistic.fit(X_chosen, y_chosen)
    y_pred = logistic.predict(X[test])
    print(metrics.confusion_matrix(y[test], y_pred))
In [11]:
# Feature matrix and label vector taken from the `vstatlas` table.
X = np.array(vstatlas[feature_cols])
y = np.array(vstatlas[target_col])

# Draw a pool of 2360 examples with balanced_train_test_split; the test part
# (test_size=0) is discarded.
X_pool, _, y_pool, _ = balanced_train_test_split(X, y, train_size=2360, test_size=0, random_state=11)

# Ten stratified 70/30 splits of the pool; only the first one's training part
# is kept for the plots below.
kfold = StratifiedShuffleSplit(y_pool, n_iter=10, test_size=0.3, train_size=0.7, random_state=19)
train_index, test_index = next(iter(kfold))
X_train = X_pool[train_index]
y_train = y_pool[train_index]

# Two-component PCA fitted on the full feature matrix.
pca = PCA(n_components=2)
X_p = pca.fit_transform(X)
# The In [55] cell plots X_train_p, so the projected training split must exist.
X_train_p = pca.transform(X_train)

classes = ['Star', 'Galaxy', 'Quasar', 'White Dwarf']
In [12]:
# These are the "Tableau 20" colors as RGB.
tableau10 = [(214, 39, 40), (31, 119, 180), (44, 160, 44),
             (255, 127, 14), (148, 103, 189), (140, 86, 75),
             (127, 127, 127), (23, 190, 207), (188, 189, 34), (227, 119, 194)]
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.
# A comprehension is used so no loop variables are left behind in the notebook namespace.
tableau10 = [(r / 255., g / 255., b / 255.) for r, g, b in tableau10]
In [13]:
# Load the pickled results listed in `pickle_paths` via load_results.
# NOTE(review): the file name says "unbalanced" while the variable is named
# balanced_logistic_sels -- confirm which one is intended.
pickle_paths = ['../pickle/07_thompson_sampling/vstatlas_unbalanced_logistic_candidates.pickle']
balanced_logistic_sels = load_results(pickle_paths)
In [ ]:
In [14]:
# Plot balanced_logistic_lcs[0][0] with plot_average_learning_curve, labelled
# 'Entropy', passing np.arange(50, 301) (i.e. 50..300) as the first argument.
# NOTE(review): balanced_logistic_lcs is not assigned in any cell visible here --
# confirm where it is loaded.
fig = plt.figure(figsize=(9, 5))
ax = plot_average_learning_curve(np.arange(50, 301), [[balanced_logistic_lcs[0][0]]], ['Entropy'])
In [44]:
# First element of the first loaded result; the plotting cells use order[50:]
# as an index into the data arrays.
order = balanced_logistic_sels[0][0]
In [16]:
# Display columns 5 and 6 of the training split (presumably 'rmw1' and 'w1m2',
# going by their positions in feature_cols).
X_train[:, [5,6]]
Out[16]:
In [ ]:
In [30]:
# Class counts of the labels indexed by order[50:].
# NOTE(review): this indexes `y` directly, whereas the In [51] loop applies the
# same selections to y[train] -- confirm which array `order` refers to.
pd.Series(y[order[50:]]).value_counts()
Out[30]:
In [18]:
# Shape check on X_p, the PCA projection of X.
X_p.shape
Out[18]:
In [19]:
# Class scatter of the training split in feature columns 5 and 6, overlaid with
# the points indexed by order[50:], coloured by their position in that sequence.
fig = plt.figure(figsize=(10, 10))
ax = plot_scatter_with_classes(X_train[:, [5,6]], y_train, classes, alpha=0.1, size=10, scatterpoints=100)
# NOTE(review): the overlay indexes the full matrix X here, while the In [94]
# cell indexes X_train -- confirm which array `order` refers to.
d1 = X[:, [5,6]][order[50:]][:,0]
d2 = X[:, [5,6]][order[50:]][:,1]
# Descending colour ramp sized from the data, so the cell keeps working when
# the number of overlaid points is not exactly 250.
ax.scatter(d1, d2, s=25, alpha=1, c=np.arange(len(d1), 0, -1))
# Axis limits are set once, after all artists are drawn.
ax.set_xlim(0.25, 0.46)
ax.set_ylim(0.31, 0.4)
Out[19]:
In [94]:
# Class scatter of the training split in feature columns 5 and 6, overlaid with
# the training points indexed by order[50:], coloured by their position in that
# sequence.
fig = plt.figure(figsize=(10, 10))
ax = plot_scatter_with_classes(X_train[:, [5,6]], y_train, classes, alpha=0.2)
# NOTE(review): the overlay indexes X_train here, while the In [19] cell indexes
# the full matrix X -- confirm which array `order` refers to.
d1 = X_train[:, [5,6]][order[50:]][:,0]
d2 = X_train[:, [5,6]][order[50:]][:,1]
# Descending colour ramp sized from the data, so the cell keeps working when
# the number of overlaid points is not exactly 250.
ax.scatter(d1, d2, s=25, alpha=1, c=np.arange(len(d1), 0, -1))
# Axis limits are set once, after all artists are drawn.
ax.set_xlim(0.25, 0.46)
ax.set_ylim(0.31, 0.4)
Out[94]:
In [76]:
def plot_scatter_with_classes(data, targets, classes, size=15, alpha=0.2,
                              scatterpoints=10, ax=None, colors=None):
    """Scatter the first two columns of `data`, one colour per class.

    NOTE: this definition shadows the `plot_scatter_with_classes` imported from
    `mclearn.viz` in the import cell for every cell executed after it.

    Parameters
    ----------
    data : array
        Rows are selected with ``data[targets == cls]``; columns 0 and 1 of the
        selected rows are plotted as x and y.
    targets : array
        Labels compared element-wise against each entry of `classes`.
    classes : sequence
        Class labels to draw, in legend order.
    size : number
        Passed to ``ax.scatter`` as ``s``.
    alpha : float
        Passed to ``ax.scatter`` as ``alpha``.
    scatterpoints : int
        Passed to ``ax.legend`` as ``scatterpoints``.
    ax : axes object, optional
        Axes to draw on; ``plt.gca()`` is used when omitted.
    colors : sequence, optional
        Palette to cycle through; defaults to the notebook-level `tableau10`.

    Returns
    -------
    The axes that were drawn on.
    """
    # Compare against None explicitly rather than relying on the axes' truthiness.
    if ax is None:
        ax = plt.gca()
    if colors is None:
        colors = tableau10

    cls_scatters = []
    for i, cls in enumerate(classes):
        points = data[targets == cls]
        # Wrap around the palette so more classes than colours does not raise.
        cls_scatter = ax.scatter(points[:, 0], points[:, 1], s=size, alpha=alpha,
                                 c=colors[i % len(colors)], label=cls)
        cls_scatters.append(cls_scatter)

    ax.legend(cls_scatters, classes, scatterpoints=scatterpoints, loc='upper right',
              frameon=True, ncol=1)
    ax.grid(False)
    return ax
In [66]:
# Scratch check of a descending np.arange (10 down to 1), the same pattern the
# plotting cells pass as the `c` colour values.
list(np.arange(10, 0, -1))
Out[66]:
In [80]:
Out[80]:
In [55]:
# Class scatter of the rows of X_train_p / y_train indexed by order[50:].
# NOTE(review): X_train_p is expected to hold pca.transform(X_train) (see the
# In [11] cell) -- confirm it is defined before this cell runs.
fig = plt.figure(figsize=(10, 10))
ax = plot_scatter_with_classes(X_train_p[order[50:]], y_train[order[50:]], classes, alpha=1)
ax.set_xlim(-7, 5)
ax.set_ylim(-7, 5)
Out[55]: