In [27]:
%pylab inline
%load_ext autoreload
%autoreload 2
from datetime import datetime, timedelta
import os
import sys
import numpy as np
import pandas as pd
from scipy import sparse
import sklearn as sl
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
import theanets as tn
Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
WARNING: pylab import has clobbered these variables: ['datetime']
`%matplotlib` prevents importing * from pylab and numpy
In [2]:
if os.name == 'nt':
    TRAIN_PATH = r'D:\train.csv'
    PTRAIN_PATH = r'D:\train_preprocessed2.csv'
    TEST_PATH = r'D:\test.csv'
    GOOGNEWS_PATH = r'D:\GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'D:\big.txt'
else:
    TRAIN_PATH = r'/media/speedy/train.csv'
    PTRAIN_PATH = r'/media/speedy/train_preprocessed2.csv'
    TEST_PATH = r'/media/speedy/test.csv'
    GOOGNEWS_PATH = r'/media/speedy/GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'/media/speedy/big.txt'
df = pd.read_csv(PTRAIN_PATH, dtype=np.float32)
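The preprocessed CSV is large (145231 rows × 3000 columns, per the preview further down). A minimal sketch in case it does not fit in memory in one pass; chunksize is a standard pandas option, and the chunk size here is an arbitrary assumption:
chunks = pd.read_csv(PTRAIN_PATH, dtype=np.float32, chunksize=50000)
df = pd.concat(chunks, ignore_index=True)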
In [3]:
X = df.loc[:, df.columns != 'target']
y = df['target']
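A hedged aside: if the embeddings built below are later scored, a held-out split avoids evaluating on training rows. A minimal sketch using scikit-learn's train_test_split (the 0.2 fraction and seed are arbitrary assumptions; on the sklearn versions of this era it lives in sklearn.cross_validation):
from sklearn.cross_validation import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=0)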
In [78]:
X_scaled = (X - X.mean())/X.std()
Out[78]:
[DataFrame preview elided: 145231 rows × 3000 columns of z-scored values, columns VAR_0002 … VAR_1934=rec. In every displayed row the one-hot column VAR_0466=i is NaN: z-scoring a constant column divides by a zero standard deviation.]
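As the preview shows, z-scoring turns constant columns such as VAR_0466=i into all-NaN, and scikit-learn's PCA rejects NaN input. A minimal sketch for dropping zero-variance columns before scaling, reusing the already-imported scale helper (dropping, rather than imputing, is an assumption):
nonconstant = X.columns[X.std() > 0]  # keep only columns with nonzero spread
X_scaled = scale(X[nonconstant])      # zero mean, unit variance per column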
In [18]:
pca = PCA(n_components=2)
X_pca_ed = pca.fit_transform(X_scaled)
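A quick check on how much structure two linear components actually retain; explained_variance_ratio_ is a standard attribute of a fitted sklearn PCA. With ~3000 input columns the retained fraction is likely small, which would help explain weak class separation in the scatter below:
print 'explained variance ratios:', pca.explained_variance_ratio_
print 'total retained:', pca.explained_variance_ratio_.sum()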
In [50]:
XT = X_pca_ed.T
mask = np.asarray(y, dtype=np.bool)  # True where target == 1
f = figure()
ax = f.gca()
ax.scatter(XT[0, mask], XT[1, mask], c='g', alpha=0.5)    # positive class
ax.scatter(XT[0, ~mask], XT[1, ~mask], c='r', alpha=0.5)  # negative class
Out[50]:
<matplotlib.collections.PathCollection at 0x7f92c08cbbd0>
In [ ]:
# Autoencoder with a 2-unit bottleneck and tied decoder weights,
# trained with RMSProp plus input/hidden dropout.
exp = tn.Experiment(tn.Autoencoder,
                    layers=(X_scaled.shape[1], X_scaled.shape[1]*2,
                            2,
                            (X_scaled.shape[1]*2, 'tied'), (X_scaled.shape[1], 'tied')))
kwargs = {'algorithm': 'rmsprop',
          'learning_rate': 0.0001, 'momentum': 0.9,
          'input_dropout': 0.3, 'hidden_dropout': 0.3}
for train, _ in exp.itertrain(X_scaled, **kwargs):
    print 'training loss:', train['loss']
exp.save('/home/mtambos/kaggle/springleaf/autoencoder_trainset2.pickle')
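A hedged variant of the loop above with simple early stopping; it assumes only what the loop already shows, that itertrain yields a training-monitors dict with a 'loss' key per iteration. The patience and tolerance values are arbitrary:
best_loss = float('inf')
patience, bad_iters = 10, 0
for train, _ in exp.itertrain(X_scaled, **kwargs):
    loss = train['loss']
    if loss < best_loss - 1e-4:   # meaningful improvement
        best_loss, bad_iters = loss, 0
    else:
        bad_iters += 1
    if bad_iters >= patience:     # loss has plateaued
        break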
In [69]:
X_autoencoded = np.empty((X_scaled.shape[0], 2))
step = 20000
# Encode in chunks of 20000 rows to keep peak memory bounded.
for i in xrange(0, X_autoencoded.shape[0], step):
    print "encoding %s:%s" % (i, i+step)
    X_autoencoded[i:i+step, :] = exp.network.encode(X_scaled[i:i+step, :])
encoding 0:20000
encoding 20000:40000
encoding 40000:60000
encoding 60000:80000
encoding 80000:100000
encoding 100000:120000
encoding 120000:140000
encoding 140000:160000
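The chunked loop above generalizes to any row-wise transform. A minimal helper sketch; batch_apply is a hypothetical name, not a library function:
def batch_apply(fn, X, step=20000):
    # Apply fn to X in row chunks and stack the results, bounding peak memory.
    return np.vstack([fn(X[i:i+step]) for i in xrange(0, X.shape[0], step)])

X_autoencoded = batch_apply(exp.network.encode, X_scaled)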
In [72]:
XT = X_autoencoded.T
mask = np.asarray(y, dtype=np.bool)  # True where target == 1
f = figure()
ax = f.gca()
ax.scatter(XT[0, mask], XT[1, mask], c='g', alpha=0.5)    # positive class
ax.scatter(XT[0, ~mask], XT[1, ~mask], c='r', alpha=0.5)  # negative class
Out[72]:
<matplotlib.collections.PathCollection at 0x7f92b19d77d0>
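To compare the two 2-D embeddings beyond eyeballing the scatters, a hedged sketch using silhouette score (a real sklearn metric; higher means better class separation). The 10000-row subsample is an arbitrary assumption to keep the pairwise distances tractable:
from sklearn.metrics import silhouette_score
idx = np.random.choice(len(y), 10000, replace=False)
labels = np.asarray(y, dtype=np.bool)[idx]
print 'PCA embedding silhouette:', silhouette_score(X_pca_ed[idx], labels)
print 'autoencoder embedding silhouette:', silhouette_score(X_autoencoded[idx], labels)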