In [27]:
%pylab inline
%load_ext autoreload
%autoreload 2

from datetime import datetime, timedelta

import os
import sys

import numpy as np
import pandas as pd
from scipy import sparse
import sklearn as sl
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
import theanets as tn


Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
WARNING: pylab import has clobbered these variables: ['datetime']
`%matplotlib` prevents importing * from pylab and numpy

In [2]:
if os.name == 'nt':
    TRAIN_PATH = r'D:\train.csv'
    PTRAIN_PATH = r'D:\train_preprocessed2.csv'
    TEST_PATH = r'D:\test.csv'
    GOOGNEWS_PATH = r'D:\GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'D:\big.txt'
else:
    TRAIN_PATH = r'/media/speedy/train.csv'
    PTRAIN_PATH = r'/media/speedy/train_preprocessed2.csv'
    TEST_PATH = r'/media/speedy/test.csv'
    GOOGNEWS_PATH = r'/media/speedy/GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'/media/speedy/big.txt'
df = pd.read_csv(PTRAIN_PATH, dtype=np.float32)

In [3]:
X = df.loc[:, df.columns != 'target']
y = df['target']
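
Before plotting, it helps to know how imbalanced the target is, since the later scatter plots color points by class. A quick check (not part of the original notebook):

In [ ]:
# Fraction of samples in each class of the binary target.
print y.value_counts(normalize=True)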

In [78]:
# z-score every column; columns with zero variance come out as NaN (0/0)
X_scaled = (X - X.mean()) / X.std()
X_scaled

Out[78]:
[DataFrame preview truncated: 3000 standardized columns (VAR_0002 … VAR_1934=rec, including one-hot dummies); zero-variance columns such as VAR_0466=i become all-NaN after scaling]

145231 rows × 3000 columns
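
The NaN column above comes from dividing by a zero standard deviation. A minimal sketch of one way to guard against that, replacing zero standard deviations before dividing (the names std_safe / X_scaled_safe are illustrative, not from the original notebook):

In [ ]:
# Illustrative variant of the scaling above: constant columns become 0 instead of NaN.
std_safe = X.std().replace(0, 1)
X_scaled_safe = (X - X.mean()) / std_safe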

PCA and plot


In [18]:
pca = PCA(n_components=2)
X_pca_ed = pca.fit_transform(X_scaled)
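
As a quick sanity check (not in the original notebook), the fitted PCA object reports how much of the total variance the two retained components capture:

In [ ]:
# Per-component and cumulative explained variance of the 2 retained components.
print pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum()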

In [50]:
target_mask = np.asarray(y, dtype=np.bool)
XT = X_pca_ed.T
f = figure()
ax = f.gca()
ax.scatter(XT[0, target_mask], XT[1, target_mask], c='g', alpha=0.5)
ax.scatter(XT[0, ~target_mask], XT[1, ~target_mask], c='r', alpha=0.5)


Out[50]:
<matplotlib.collections.PathCollection at 0x7f92c08cbbd0>
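
With roughly 145k points the scatter is heavily overplotted. A sketch (not in the original notebook) of plotting a random subsample instead, which usually makes the class overlap easier to judge:

In [ ]:
# Illustrative only: plot a random subset of points to reduce overplotting.
rng = np.random.RandomState(0)
idx = rng.choice(X_pca_ed.shape[0], size=10000, replace=False)
mask = np.asarray(y, dtype=np.bool)[idx]
sub = X_pca_ed[idx]
f = figure()
ax = f.gca()
ax.scatter(sub[mask, 0], sub[mask, 1], c='g', alpha=0.3)
ax.scatter(sub[~mask, 0], sub[~mask, 1], c='r', alpha=0.3)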

Autoencode and plot


In [ ]:
# Tied-weights autoencoder with a 2-unit bottleneck: n -> 2n -> 2 -> 2n -> n
# (here n = 3000 input features); the decoder layers share weights with the encoder.
exp = tn.Experiment(tn.Autoencoder,
                    layers=(X_scaled.shape[1], X_scaled.shape[1]*2,
                            2,
                            (X_scaled.shape[1]*2, 'tied'), (X_scaled.shape[1], 'tied')))

kwargs = {'algorithm': 'rmsprop',
          'learning_rate': 0.0001, 'momentum': 0.9,
          'input_dropout': 0.3, 'hidden_dropout': 0.3}

# itertrain yields a (training, validation) pair of monitor dicts per iteration.
# Note: X_scaled is a DataFrame here and may need np.asarray(..., dtype=np.float32).
for train, _ in exp.itertrain(X_scaled, **kwargs):
    print 'training loss:', train['loss']
exp.save('/home/mtambos/kaggle/springleaf/autoencoder_trainset2.pickle')
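
The loop above runs until theanets' own stopping criteria kick in. A sketch of wrapping the same itertrain call in a simple manual patience rule (an illustrative variant, not the original training code; the 1e-4 tolerance and patience of 10 are arbitrary):

In [ ]:
# Illustrative: stop once the training loss has not improved for `patience` iterations.
best_loss = np.inf
patience, stale = 10, 0
for train, _ in exp.itertrain(X_scaled, **kwargs):
    loss = train['loss']
    print 'training loss:', loss
    if loss < best_loss - 1e-4:
        best_loss, stale = loss, 0
    else:
        stale += 1
    if stale >= patience:
        break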

In [69]:
# Encode the data in chunks to keep memory usage bounded.
# Convert the DataFrame to a float32 array first: 2-D positional slicing
# like X_scaled[i:i+step, :] does not work on a DataFrame.
X_scaled_arr = np.asarray(X_scaled, dtype=np.float32)
X_autoencoded = np.empty((X_scaled_arr.shape[0], 2))
step = 20000
for i in xrange(0, X_autoencoded.shape[0], step):
    print "encoding %s:%s" % (i, i+step)
    X_autoencoded[i:i+step, :] = exp.network.encode(X_scaled_arr[i:i+step, :])


encoding 0:20000
encoding 20000:40000
encoding 40000:60000
encoding 60000:80000
encoding 80000:100000
encoding 100000:120000
encoding 120000:140000
encoding 140000:160000
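
The same chunked-encoding pattern can be factored into a small helper (hypothetical, not part of the notebook), which would also be handy for encoding the test set:

In [ ]:
# Hypothetical helper: encode an array in fixed-size chunks to bound memory use.
def encode_in_chunks(network, data, step=20000):
    data = np.asarray(data, dtype=np.float32)
    out = np.empty((data.shape[0], 2))  # 2-unit bottleneck, as above
    for i in xrange(0, data.shape[0], step):
        out[i:i+step, :] = network.encode(data[i:i+step, :])
    return out

# e.g. X_autoencoded = encode_in_chunks(exp.network, X_scaled)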

In [72]:
target_mask = np.asarray(y, dtype=np.bool)
XT = X_autoencoded.T
f = figure()
ax = f.gca()
ax.scatter(XT[0, target_mask], XT[1, target_mask], c='g', alpha=0.5)
ax.scatter(XT[0, ~target_mask], XT[1, ~target_mask], c='r', alpha=0.5)


Out[72]:
<matplotlib.collections.PathCollection at 0x7f92b19d77d0>