In [1]:
cd ..
In [2]:
%matplotlib inline
from time import strftime
import cPickle as pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import RandomizedPCA
from sklearn.cross_validation import train_test_split
from ipywidgets import interact
from utils import TrainSplit
def reduce_dim_by_pca(X, dim):
    """Project ``X`` onto ``dim`` whitened principal components.

    A fresh ``RandomizedPCA`` (``random_state=42``, ``whiten=True``) is fitted
    on ``X`` itself on every call and the transformed ``X`` is returned.
    """
    # TODO the pca should not be performed on the validation set
    return RandomizedPCA(
        n_components=dim, random_state=42, whiten=True
    ).fit_transform(X)
def load_feats(fnames, max_dim=None):
    """Load several ``.npy`` feature matrices and concatenate them column-wise.

    Parameters
    ----------
    fnames : iterable of str
        Paths handed to ``np.load``, one feature matrix per path.
    max_dim : int or None
        When given, every matrix with more than ``max_dim`` columns is passed
        through ``reduce_dim_by_pca(matrix, max_dim)`` before stacking;
        narrower matrices are used unchanged.

    Returns
    -------
    The result of ``np.hstack`` over the (possibly reduced) matrices, in the
    order of ``fnames``.
    """
    blocks = []
    for path in fnames:
        block = np.load(path)
        if max_dim is not None and block.shape[1] > max_dim:
            block = reduce_dim_by_pca(block, max_dim)
        blocks.append(block)
    return np.hstack(blocks)
In [20]:
# Per-network feature dumps for the training images; they are concatenated
# column-wise by load_feats, so the order here fixes the column layout.
fnames = [
    'model_features/cropped_jan03_gp_X_cropped_256_head_20151216.npy',  # VGG192
    'model_features/cropped_dec19_gp_X_cropped_256_head_20151216.npy',  # VGG128
    'model_features/cropped_dec21_3_gp_X_cropped_256_head_20151216.npy',  # ResNet33333
    'model_features/cropped_jan02_gp_X_cropped_256_head_20151216.npy',  # ResNet3345
    'model_features/cropped_jan05_2_gp_X_cropped_256_head_20151216.npy',  # VGG192
    'model_features/cropped_jan05_4_gp_X_cropped_256_head_20151216.npy',  # ResNet34_23_4
    # 'model_features/cropped_jan05_3_gp_X_cropped_256_head_20151216.npy'  # Inception
]
X = load_feats(fnames)
print(X.shape)

# The label file is only read here, so map it read-only instead of relying on
# np.memmap's default writable 'r+' mode.
y = np.memmap('cache/y_cropped_256_head_20151216.npy', dtype=np.int32, mode='r')

# Open the pickle in binary mode and close the handle deterministically.
with open('models/encoder.pkl', 'rb') as encoder_file:
    enc = pickle.load(encoder_file)

# Replace the raw labels by the encoder's integer codes.
y = enc.transform(y)
print(y.shape)
In [12]:
class FakeNet(object):
    """Minimal stand-in object passed as the third argument to ``TrainSplit``.

    It only carries the class attribute ``regression``.
    """
    # presumably read by TrainSplit to choose its splitting strategy -- TODO confirm
    regression = False
def get_train_valid_split(X, y):
    """Split ``X`` and ``y`` with the project's ``TrainSplit`` helper.

    The splitter is built as ``TrainSplit(0.15, random_state=42,
    stratify=False)`` and called with a ``FakeNet`` instance in place of a
    real network.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` in the order produced by the
        splitter.
    """
    splitter = TrainSplit(0.15, random_state=42, stratify=False)
    train_X, valid_X, train_y, valid_y = splitter(X, y, FakeNet())
    return train_X, valid_X, train_y, valid_y
# Local train / validation split used by the cells below.
X_train, X_test, y_train, y_test = get_train_valid_split(X, y)
print('%s %s %s %s' % (X_train.shape, X_test.shape, y_train.shape, y_test.shape))
# onehot_encoder = OneHotEncoder(n_values=447, sparse=False).fit(y.reshape(-1, 1))
# y_train_oh = onehot_encoder.transform(y_train.reshape(-1, 1))
# y_test_oh = onehot_encoder.transform(y_test.reshape(-1, 1))
# print y_train_oh.shape, y_test_oh.shape
In [26]:
# Number of label ids that occur more than 25 times in y.
print(np.flatnonzero(np.bincount(y) > 25).shape)
In [7]:
# How many label ids never occur in the train split and in the valid/test
# split, respectively. np.bincount only spans 0..max(label) of each split,
# so ids above a split's largest label are not counted here.
for split_labels in (y_train, y_test):
    print(np.flatnonzero(np.bincount(split_labels) == 0).shape)
In [17]:
# Bar chart of how often each encoded label occurs in y.
plt.figure(figsize=(20, 5))
ax = sns.countplot(y)
ax.set_title('Images per whale')
ax.set_ylabel('Number of images')
ax.set_xlabel('Whale ID')
# Show every 5th tick only, rotated so the labels stay readable.
_ = plt.xticks(range(0, 447, 5), rotation=90)
In [18]:
# plt.figure(figsize=(20, 10))
# for i in range(X_train.shape[1]):
# plt.scatter([i] * X_train.shape[0], X_train.T[i], lw=0, s=0.1, alpha=0.5)
# plt.xlim(0, X_train.shape[1])
In [5]:
def get_actual_pred_proba(raw_pred_proba, classes=None, n_classes=447):
    """Scatter classifier probabilities onto the full label space.

    A classifier's ``predict_proba`` output only has one column per label it
    saw during fitting. This places column ``i`` of ``raw_pred_proba`` into
    column ``classes[i]`` of a zero-filled ``(n_samples, n_classes)`` array.

    Parameters
    ----------
    raw_pred_proba : array-like, shape (n_samples, len(classes))
        Probabilities with one column per entry of ``classes``.
    classes : array-like of int, optional
        Label id of each input column (e.g. a fitted estimator's
        ``classes_``). Defaults to ``np.unique(y_train)`` using the
        notebook-level ``y_train``, which is the original behaviour.
    n_classes : int, optional
        Width of the returned array (default 447).

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_classes)

    Raises
    ------
    ValueError
        If the number of columns does not match the number of classes.
    """
    raw_pred_proba = np.asarray(raw_pred_proba)
    if classes is None:
        # Backward-compatible default: labels present in the global train split.
        classes = np.unique(y_train)
    classes = np.asarray(classes, dtype=int)

    if raw_pred_proba.shape[1] != len(classes):
        raise ValueError(
            'raw_pred_proba has %d columns but %d classes were given'
            % (raw_pred_proba.shape[1], len(classes))
        )

    pred_proba = np.zeros((len(raw_pred_proba), n_classes))
    # One vectorised scatter instead of a per-column Python loop.
    pred_proba[:, classes] = raw_pred_proba
    return pred_proba
In [6]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.decomposition import RandomizedPCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from xgboost import XGBClassifier
# Standardise the stacked network features, then fit a multinomial logistic
# regression on top of them.
clf = Pipeline([
    ('preprocessing', StandardScaler()),
    # ('pca', RandomizedPCA(n_components=500, random_state=42, whiten=True)),
    # ('select_best', SelectKBest(k=50)),
    ('clf', LogisticRegression(
        C=10, solver='lbfgs', penalty='l2', multi_class='multinomial', n_jobs=-1
    ))
    # ('pca', RandomizedPCA(n_components=500, random_state=42, whiten=True)),
    # ('clf', RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=10))
    # ('clf', GradientBoostingClassifier(n_estimators=500, max_depth=100))
    # ('clf', XGBClassifier(n_estimators=10, max_depth=10, nthread=-1))
])
clf.fit(X_train, y_train)

# predict_proba only has columns for labels present in y_train;
# get_actual_pred_proba spreads them over the full label space.
y_test_pred_proba_raw = clf.predict_proba(X_test)
y_test_pred_proba = get_actual_pred_proba(y_test_pred_proba_raw)
y_train_pred_proba_raw = clf.predict_proba(X_train)
y_train_pred_proba = get_actual_pred_proba(y_train_pred_proba_raw)
y_test_pred = y_test_pred_proba.argmax(axis=1)
y_train_pred = y_train_pred_proba.argmax(axis=1)

# One-hot targets for log_loss, with the same width as the probability
# matrices. They are built here because the only other definition in the
# notebook is commented out, which made this cell fail with a NameError on a
# fresh kernel.
y_train_oh = np.eye(y_train_pred_proba.shape[1])[np.asarray(y_train)]
y_test_oh = np.eye(y_test_pred_proba.shape[1])[np.asarray(y_test)]

# Accuracy and log loss for: train, full valid/test, and the odd / even halves
# of valid/test. Groups are separated by a blank line.
eval_sets = [
    (y_train, y_train_oh, y_train_pred, y_train_pred_proba),
    (y_test, y_test_oh, y_test_pred, y_test_pred_proba),
    (y_test[1::2], y_test_oh[1::2], y_test_pred[1::2], y_test_pred_proba[1::2]),
    (y_test[::2], y_test_oh[::2], y_test_pred[::2], y_test_pred_proba[::2]),
]
for set_idx, (labels, labels_oh, pred, proba) in enumerate(eval_sets):
    if set_idx:
        print('')
    print(accuracy_score(labels, pred))
    print(log_loss(labels_oh, proba))
In [38]:
# Feature dumps for the Kaggle test images, listed in the same model order
# (jan03, dec19, dec21_3, jan02, jan05_2, jan05_4) as the training features
# so the stacked columns line up with what clf was fitted on.
fnames = [
    'model_features/cropped_jan03_gp_X_test_head_crop_localize_pts_dec31_256_tta_20160102.npy',
    'model_features/cropped_dec19_gp_X_test_head_crop_localize_pts_dec31_256_tta_20160102.npy',
    'model_features/cropped_dec21_3_gp_X_test_head_crop_localize_pts_dec31_256_tta_20160102.npy',
    'model_features/cropped_jan02_gp_X_test_head_crop_localize_pts_dec31_256_tta_20160102.npy',
    'model_features/cropped_jan05_2_gp_X_test_head_crop_localize_pts_dec31_256_tta_20160102.npy',
    'model_features/cropped_jan05_4_gp_X_test_head_crop_localize_pts_dec31_256_tta_20160102.npy'
]
# NOTE: this rebinds X_test, which previously held the local validation
# features; y_test is left unchanged and no longer corresponds to X_test.
X_test = load_feats(fnames)
print(X_test.shape)
In [43]:
# Score whatever is currently bound to X_test and expand the probabilities to
# the full label space before taking the most likely label per row.
y_test_pred_proba_raw = clf.predict_proba(X_test)
y_test_pred_proba = get_actual_pred_proba(y_test_pred_proba_raw)
y_test_pred_kaggle = np.argmax(y_test_pred_proba, axis=1)
print(y_test_pred_proba.shape)
In [52]:
# Compare the label histogram of the Kaggle predictions with the local
# train and valid/test labels.
plt.figure(figsize=(20, 5))
plt.title('Whale label distribution')
# 448 edges -> 447 unit-wide bins, one per label id 0..446. With range(447)
# the last bin was the closed interval [445, 446], so labels 445 and 446 were
# counted together.
label_bin_edges = range(448)
_ = plt.hist(y_test_pred_kaggle, bins=label_bin_edges, label='kaggle_test')
_ = plt.hist(y_train, bins=label_bin_edges, label='train')
_ = plt.hist(y_test, bins=label_bin_edges, label='valid/test')
plt.xticks(range(0, 447, 5), rotation=90)
plt.legend()
Out[52]:
In [23]:
# Build the submission table: the 'Image' column from the sample submission
# followed by one probability column per encoder class.
sample_df = pd.read_csv('data/sample_submission.csv')
# Double brackets keep this 2-D (n_rows, 1) so it can be hstack-ed with the
# probability matrix. Note that this rebinds `fnames` to image names.
fnames = sample_df[['Image']].values
values = np.hstack([fnames, y_test_pred_proba])

# Open the pickle in binary mode and close the handle deterministically.
with open('models/encoder.pkl', 'rb') as encoder_file:
    enc = pickle.load(encoder_file)

# A real list (not a map object) so the concatenation with ['Image'] below
# works regardless of what map() returns.
classes = ['whale_%05d' % x for x in enc.classes_]
submission_df = pd.DataFrame(values, columns=['Image'] + classes)
submission_df.head()
Out[23]:
In [24]:
def get_current_timestamp():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
    stamp_format = '%Y%m%d_%H%M%S'
    return strftime(stamp_format)
# Timestamped file name so earlier submissions are not overwritten.
submission_fname = 'submissions/stacking_%s.csv' % get_current_timestamp()
print(submission_fname)
# index=False: only the 'Image' and per-class columns are written.
submission_df.to_csv(submission_fname, index=False)