In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In this example, we will show how to use pystacks to tackle a Kaggle competition from scratch. Doing well on this competition requires significantly more effort than shown here, particularly with respect to inferring missing features (which we handle naively here by backfilling with the column mean or the most common value). Nevertheless, this example demonstrates how to feed data into pystacks.
Description from Kaggle:
VARIABLE DESCRIPTIONS:
survival Survival
(0 = No; 1 = Yes)
pclass Passenger Class
(1 = 1st; 2 = 2nd; 3 = 3rd)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare
cabin Cabin
embarked Port of Embarkation
(C = Cherbourg; Q = Queenstown; S = Southampton)
SPECIAL NOTES:
Pclass is a proxy for socio-economic status (SES)
1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower
Age is in Years; Fractional if Age less than One (1)
If the Age is Estimated, it is in the form xx.5
With respect to the family relation variables (i.e. sibsp and parch)
some relations were ignored. The following are the definitions used
for sibsp and parch.
Sibling: Brother, Sister, Stepbrother, or Stepsister of Passenger Aboard Titanic
Spouse: Husband or Wife of Passenger Aboard Titanic (Mistresses and Fiances Ignored)
Parent: Mother or Father of Passenger Aboard Titanic
Child: Son, Daughter, Stepson, or Stepdaughter of Passenger Aboard Titanic
Other family relatives excluded from this study include cousins,
nephews/nieces, aunts/uncles, and in-laws. Some children travelled
only with a nanny, therefore parch=0 for them. As well, some
travelled with very close friends or neighbors in a village, however,
the definitions do not support such relations.
In [2]:
# we will use everything as features except for the passenger id, name, fare, ticket (number), and port of embarkation
continuous_features = ['Age', 'SibSp', 'Parch']
# furthermore, we will learn representations for the discrete features
discrete_features = ['Pclass', 'Sex', 'Cabin']
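To make "learning representations" concrete: each discrete value will be mapped to a row of a small trainable matrix (an embedding). Below is a minimal plain-numpy sketch of the idea, not pystacks code; the matrix E and the index mapping are hypothetical stand-ins for what the library will learn.

# hypothetical illustration (plain numpy, not pystacks): each category of a discrete
# feature such as Sex indexes a row of a small trainable embedding matrix
import numpy as np
E = np.random.randn(3, 5)                           # 3 categories, 5-dimensional embeddings
sex_index = {'male': 0, 'female': 1, 'MISSING': 2}
print E[sex_index['female']]                        # the 5-d vector that would be learned for 'female'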
In [3]:
import csv
print 'train file'
!head ./train.csv
print
print 'test file'
!head ./test.csv
In [4]:
# we're going to do a hack here where we discretize the cabin locations
# by keeping only the deck letter (e.g. 'C85' becomes 'C')
def convert_data(fname):
    Xcont, Xdis, Y = [], [], []
    with open(fname) as f:
        reader = csv.reader(f)
        header = reader.next()
        for i, row in enumerate(reader):
            if 'Survived' in header:
                Y.append(row[header.index('Survived')])
            cabin = row[header.index('Cabin')]
            if cabin:
                cabin = cabin.split(' ')[0]
                cabin = cabin[0]
            row[header.index('Cabin')] = cabin
            Xcont.append([row[header.index(n)] for n in continuous_features])
            Xdis.append([row[header.index(n)] if row[header.index(n)] else 'MISSING' for n in discrete_features])
    if len(Y):
        return Xcont, Xdis, Y
    else:
        return Xcont, Xdis

train_cont, train_dis, train_Y = convert_data('train.csv')
test_cont, test_dis = convert_data('test.csv')
print train_cont[:5]
print train_dis[:5]
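For instance, the cabin hack above reduces a multi-cabin string to the deck letter of its first cabin (a quick illustration, not part of the original pipeline):

# e.g. a multi-cabin string is reduced to its deck letter
print 'C85 C87'.split(' ')[0][0]   # prints 'C'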
In [5]:
import numpy as np
np.random.seed(0)
# unfortunately, this dataset has missing values, hence we'll take a very simple approach and backfill
# the continuous features with the column mean and the discrete features with the most common value
def back_fill_cont(cont, means):
    cols = zip(*cont)
    for i, col in enumerate(cols):
        valued = [float(v) for v in col if v != '']
        if means[i] is None:
            means[i] = np.mean(valued)
        col = list(col)
        for j, val in enumerate(col):
            if val == '':
                val = means[i]
            col[j] = float(val)
        cols[i] = col
    cols = zip(*cols)
    return np.array(cols)

means = [None for n in continuous_features]
Xtrain_cont = back_fill_cont(train_cont, means)
Xtest_cont = back_fill_cont(test_cont, means)
print 'backfilled'
print Xtrain_cont[:5]

def standardize(X, means=None, stds=None):
    if means is None:
        means = X.mean(axis=0)
    if stds is None:
        stds = X.std(axis=0)
    return (X - means[np.newaxis, :]) / stds[np.newaxis, :], means, stds

Xtrain_cont, means, stds = standardize(Xtrain_cont)
Xtest_cont, _, _ = standardize(Xtest_cont, means, stds)
print 'standardized'
print Xtrain_cont[:5]
Y = np.array(train_Y)
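As a quick sanity check (not in the original notebook), the standardized training columns should have roughly zero mean and unit standard deviation, while the test columns, which reuse the training means and stds, will only be approximately centered:

# sanity check: train columns ~ zero mean / unit std; test columns only approximately so
print Xtrain_cont.mean(axis=0), Xtrain_cont.std(axis=0)
print Xtest_cont.mean(axis=0), Xtest_cont.std(axis=0)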
In [6]:
from collections import Counter
from pystacks.utils.text.vocab import Vocab
def back_fill_dis(dis, most_commons):
    # note: convert_data already replaced empty discrete values with 'MISSING',
    # so they are kept as their own category here rather than backfilled
    cols = zip(*dis)
    for i, col in enumerate(cols):
        valued = [v for v in col if v != '']
        if most_commons[i] is None:
            most_commons[i] = Counter(valued).most_common()[0][0]
        col = list(col)
        for j, val in enumerate(col):
            if val == '':
                val = most_commons[i]
            col[j] = val
        cols[i] = col
    cols = zip(*cols)
    return cols

most_commons = [None for n in discrete_features]
Xtrain_dis = back_fill_dis(train_dis, most_commons)
Xtest_dis = back_fill_dis(test_dis, most_commons)
print 'backfilled'
print Xtrain_dis[:5]

def numericalize(X, vocabs, add=False):
    cols = zip(*X)
    for i, col in enumerate(cols):
        vocab = vocabs[i]
        col = [vocab.add(val) for val in col] if add else [vocab[val] for val in col]
        cols[i] = col
    cols = zip(*cols)
    return np.array(cols)

vocabs = [Vocab() for f in discrete_features]
Xtrain_dis = numericalize(Xtrain_dis, vocabs, add=True)
Xtest_dis = numericalize(Xtest_dis, vocabs)
vocabs = {name: v for name, v in zip(discrete_features, vocabs)}
print 'vocab_size'
print [(name, len(v)) for name, v in vocabs.items()]
print 'numericalized'
print Xtrain_dis[:5]
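Because the test set is numericalized with the same Vocab objects (without add=True), a category seen during training maps to the same index in both splits. A quick check, assuming Vocab indexing works as used above:

# same vocab => same index in train and test, e.g. for the Sex feature
print vocabs['Sex']['male'], vocabs['Sex']['female']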
In [7]:
# finally we will randomly split the training data into train and dev sets for validation
def split_train(X, Y, dev_portion=0.15):
    total = len(X)
    dev_count = int(dev_portion * total)
    inds = np.random.permutation(total)
    train_ind = inds[:total-dev_count]
    dev_ind = inds[total-dev_count:]
    return X[train_ind], Y[train_ind], X[dev_ind], Y[dev_ind]
Xtrain, Ytrain, Xdev, Ydev = split_train(np.concatenate([Xtrain_cont, Xtrain_dis], axis=1), Y.astype('int32'))
Xtrain_c = Xtrain[:, :len(continuous_features)].astype('float32')
Xtrain_d = Xtrain[:, len(continuous_features):].astype('int32')
Xdev_c = Xdev[:, :len(continuous_features)].astype('float32')
Xdev_d = Xdev[:, len(continuous_features):].astype('int32')
print 'train size', len(Xtrain), 'dev size', len(Xdev)
In [8]:
# make model
from theano import function, tensor as T
from pystacks.layers.container import Sequential, Parallel
from pystacks.layers.lookup import LookupTable
from pystacks.layers.common import *
from pystacks.transformer import UnitNorm
emb_size = 5
# we'll need one lookup table per discrete feature
lookups = [LookupTable(vocab_size=len(vocabs[n]), embedding_size=emb_size, E_transformer=UnitNorm()) for n in discrete_features]
Xdiscrete_sym = T.imatrix()
discrete_net = Parallel(lookups)
discrete_feat = discrete_net.forward(Xdiscrete_sym)
debug_discrete = function([Xdiscrete_sym], discrete_feat)
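Conceptually, Parallel applies one LookupTable to each input column and concatenates the results along the feature axis, giving a (batch, len(discrete_features) * emb_size) matrix. A rough numpy equivalent, where E_pclass, E_sex, and E_cabin are hypothetical stand-ins for the learned tables:

# rough numpy equivalent of Parallel(lookups): index each discrete column into its own
# embedding matrix and concatenate along the feature axis
def parallel_lookup_sketch(X_dis, E_pclass, E_sex, E_cabin):
    return np.concatenate([E_pclass[X_dis[:, 0]],
                           E_sex[X_dis[:, 1]],
                           E_cabin[X_dis[:, 2]]], axis=1)   # shape (batch, 3 * emb_size)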
In [9]:
from pprint import pprint
pprint(discrete_net)
In [10]:
print 'discrete input'
print Xtrain_d[:5]
print 'discrete features'
print debug_discrete(Xtrain_d[:5])
print debug_discrete(Xtrain_d[:5]).shape
print 'continuous input'
print Xtrain_c[:5]
In [11]:
n_in = len(discrete_features) * emb_size + len(continuous_features)
n_hid1 = 100
n_hid2 = 200
n_out = 2
Xcontinous_sym = T.fmatrix()
Xin_sym = T.concatenate([discrete_feat, Xcontinous_sym], axis=1)
net = Sequential([
    LinearLayer(n_in, n_hid1),
    ReLU(),
    Dropout(0.5),
    LinearLayer(n_hid1, n_hid2),
    ReLU(),
    Dropout(0.5),
    LinearLayer(n_hid2, n_out),
    Softmax()
])
prob_sym = net.forward(Xin_sym)
debug_prob = function([Xdiscrete_sym, Xcontinous_sym], prob_sym)
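The concatenated input has len(discrete_features) * emb_size + len(continuous_features) = 3 * 5 + 3 = 18 columns, and the stack maps 18 -> 100 -> 200 -> 2, ending in a softmax over the two survival classes. A plain-numpy sketch of the same forward pass (W1, b1, etc. are hypothetical stand-ins for the learned parameters; dropout is omitted, as it would be at prediction time):

# illustrative numpy forward pass: 18 -> 100 -> 200 -> 2 (dropout omitted)
def mlp_forward_sketch(Xin, W1, b1, W2, b2, W3, b3):
    h1 = np.maximum(0, Xin.dot(W1) + b1)              # ReLU, shape (batch, 100)
    h2 = np.maximum(0, h1.dot(W2) + b2)               # ReLU, shape (batch, 200)
    logits = h2.dot(W3) + b3                          # shape (batch, 2)
    e = np.exp(logits - logits.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)           # softmax probabilities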
In [12]:
print 'probability estimates'
print debug_prob(Xtrain_d, Xtrain_c)
In [13]:
from pystacks.optimizer import Adagrad
from pystacks.criteria import cross_entropy_loss
Y_sym = T.ivector()
lr_sym = T.fscalar()
loss_sym = cross_entropy_loss(prob_sym, Y_sym, one_hot_num_classes=2)
pred_sym = prob_sym.argmax(axis=1)
optimizer = Adagrad()
updates = net.grad_updates(loss=loss_sym, lr=lr_sym, optimizer=optimizer)
updates += discrete_net.grad_updates(loss=loss_sym, lr=lr_sym, optimizer=optimizer)
f_train = function([Xdiscrete_sym, Xcontinous_sym, Y_sym, lr_sym], loss_sym, updates=updates)
f_pred = function([Xdiscrete_sym, Xcontinous_sym], net.forward(Xin_sym, train=False).argmax(axis=1))
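Adagrad scales each parameter's step by the square root of its accumulated squared gradients, so frequently-updated parameters take smaller steps over time. A sketch of one such update (not necessarily pystacks' exact formulation):

# sketch of an Adagrad update; not necessarily pystacks' exact formulation
def adagrad_update_sketch(param, grad, cache, lr, eps=1e-8):
    cache += grad ** 2                                # accumulate squared gradients
    param -= lr * grad / (np.sqrt(cache) + eps)       # per-parameter scaled step
    return param, cache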
In [14]:
pprint(net)
In [15]:
print Xtest_dis[:5]
In [16]:
max_epoch = 200
lr = 3e-3
best_acc = 0
best_pred = None
train_accs, dev_accs = [], []
for epoch in xrange(max_epoch + 1):
    loss = f_train(Xtrain_d, Xtrain_c, Ytrain, lr)
    train_pred = f_pred(Xtrain_d, Xtrain_c)
    dev_pred = f_pred(Xdev_d, Xdev_c)
    train_acc = np.mean(Ytrain == train_pred)
    dev_acc = np.mean(Ydev == dev_pred)
    train_accs.append(train_acc)
    dev_accs.append(dev_acc)
    if epoch % 10 == 0:
        print '*' * 10 + ' epoch', epoch, 'loss', loss, 'train', train_acc, 'dev', dev_acc
    if dev_acc > best_acc:
        print 'new best', dev_acc, 'at epoch', epoch
        best_acc = dev_acc
        best_pred = f_pred(Xtest_dis.astype('int32'), Xtest_cont.astype('float32'))
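After the loop, best_acc holds the highest dev accuracy seen and best_pred the test predictions from that epoch, which is what we will submit. A quick check (not in the original notebook):

print 'best dev accuracy', best_acc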
In [17]:
import matplotlib.pylab as P
fig, ax = P.subplots()
ax.plot(train_accs, label='train', color='b')
ax.plot(dev_accs, label='dev', color='r')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.set_title('learning curve')
ax.legend(loc='lower right')
Out[17]:
In [18]:
with open('test.csv') as f_in, open('test.pred.csv', 'wb') as f_out:
    reader = csv.reader(f_in)
    writer = csv.writer(f_out)
    header = reader.next()
    writer.writerow(('PassengerId', 'Survived'))
    for row, survived in zip(reader, best_pred):
        writer.writerow((row[header.index('PassengerId')], survived))
In [19]:
!wc -l 'test.pred.csv'
!head 'test.pred.csv'