In [1]:
import random
import collections
import hashlib
import pandas as pd
import numpy as np
In [2]:
np.random.seed(0)
In [18]:
X_train_orig = pd.read_csv("train.csv")
X_test_orig = pd.read_csv("test.csv")
In [19]:
X_train_orig
Out[19]:
id
cat1
cat2
cat3
cat4
cat5
cat6
cat7
cat8
cat9
...
cont6
cont7
cont8
cont9
cont10
cont11
cont12
cont13
cont14
loss
0
1
A
B
A
B
A
A
A
A
B
...
0.718367
0.335060
0.30260
0.67135
0.83510
0.569745
0.594646
0.822493
0.714843
2213.18
1
2
A
B
A
A
A
A
A
A
B
...
0.438917
0.436585
0.60087
0.35127
0.43919
0.338312
0.366307
0.611431
0.304496
1283.60
2
5
A
B
A
A
B
A
A
A
B
...
0.289648
0.315545
0.27320
0.26076
0.32446
0.381398
0.373424
0.195709
0.774425
3005.09
3
10
B
B
A
B
A
A
A
A
B
...
0.440945
0.391128
0.31796
0.32128
0.44467
0.327915
0.321570
0.605077
0.602642
939.85
4
11
A
B
A
B
A
A
A
A
B
...
0.178193
0.247408
0.24564
0.22089
0.21230
0.204687
0.202213
0.246011
0.432606
2763.85
5
13
A
B
A
A
A
A
A
A
B
...
0.364464
0.401162
0.26847
0.46226
0.50556
0.366788
0.359249
0.345247
0.726792
5142.87
6
14
A
A
A
A
B
A
A
A
A
...
0.381515
0.363768
0.24564
0.40455
0.47225
0.334828
0.352251
0.342239
0.382931
1132.22
7
20
A
B
A
B
A
A
A
A
B
...
0.867021
0.583389
0.90267
0.84847
0.80218
0.644013
0.785706
0.859764
0.242416
3585.75
8
23
A
B
B
B
B
A
A
A
B
...
0.628534
0.384099
0.61229
0.38249
0.51111
0.682315
0.669033
0.756454
0.361191
10280.20
9
24
A
B
A
A
B
B
A
A
B
...
0.713343
0.469223
0.30260
0.67135
0.83510
0.863052
0.879347
0.822493
0.294523
6184.59
10
25
A
B
A
A
A
A
A
A
B
...
0.429383
0.877905
0.39455
0.53565
0.50556
0.550529
0.538473
0.336261
0.715009
6396.85
11
33
A
B
A
A
B
A
A
A
B
...
0.314683
0.370419
0.58354
0.46226
0.38016
0.644013
0.665644
0.339244
0.799124
5965.73
12
34
B
A
A
A
B
A
A
A
A
...
0.408772
0.363312
0.32843
0.32128
0.44467
0.327915
0.321570
0.605077
0.818358
1193.05
13
41
B
A
A
A
B
B
A
A
A
...
0.241574
0.255339
0.58934
0.32496
0.26029
0.257148
0.253044
0.276878
0.477578
1071.77
14
47
A
A
A
A
B
A
A
A
A
...
0.894903
0.586433
0.80058
0.93383
0.78770
0.880469
0.871011
0.822493
0.251278
585.18
15
48
A
A
A
A
B
B
A
A
A
...
0.570733
0.547756
0.80438
0.44352
0.63026
0.385085
0.377003
0.516660
0.340325
1395.45
16
49
A
B
B
A
A
A
A
A
B
...
0.411902
0.593548
0.31796
0.38846
0.48889
0.457203
0.447145
0.301535
0.205651
6609.32
17
51
A
A
A
A
A
B
A
A
A
...
0.688705
0.437192
0.67263
0.83505
0.59334
0.678924
0.665644
0.684242
0.407411
2658.70
18
52
A
A
B
A
A
B
A
A
A
...
0.443265
0.637086
0.36636
0.52938
0.39068
0.678924
0.665644
0.304350
0.310796
4167.32
19
55
A
A
A
B
A
A
A
A
A
...
0.436312
0.544355
0.48864
0.36285
0.20496
0.388786
0.406090
0.648701
0.830931
3797.89
20
57
B
B
A
B
A
A
A
A
B
...
0.441525
0.437192
0.31796
0.32128
0.44467
0.377724
0.369858
0.605077
0.743810
1155.48
21
60
A
A
A
B
A
B
A
A
A
...
0.349885
0.381185
0.81542
0.32311
0.36458
0.453334
0.454705
0.651733
0.354002
891.14
22
61
B
A
A
A
B
B
A
A
A
...
0.183243
0.253560
0.40028
0.21374
0.19431
0.167024
0.165648
0.404520
0.725941
765.97
23
66
B
A
A
B
A
A
A
A
A
...
0.373500
0.381883
0.36083
0.44352
0.45017
0.338312
0.366307
0.339244
0.793518
771.58
24
73
B
A
A
A
A
A
A
A
A
...
0.382070
0.451203
0.33906
0.47900
0.54433
0.812519
0.800726
0.246011
0.215055
7256.49
25
76
A
A
A
B
A
A
A
A
A
...
0.592478
0.496452
0.29758
0.46226
0.51111
0.434083
0.424625
0.357400
0.311644
1528.73
26
86
A
A
A
A
A
B
A
A
A
...
0.435733
0.769905
0.60087
0.40252
0.28677
0.550529
0.538473
0.298734
0.698006
4787.07
27
89
B
A
A
B
A
A
A
A
A
...
0.373500
0.356037
0.36083
0.44352
0.45017
0.291268
0.295524
0.339244
0.804795
2163.97
28
90
A
B
A
B
A
B
A
A
B
...
0.671307
0.464924
0.33906
0.62542
0.66076
0.607500
0.594646
0.678452
0.285224
11673.03
29
93
A
A
A
A
B
A
A
A
A
...
0.557431
0.402942
0.34445
0.52728
0.79139
0.377724
0.369858
0.687115
0.297788
1753.50
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
188288
587563
A
A
A
B
A
A
A
A
A
...
0.482425
0.414750
0.67263
0.51890
0.60401
0.464956
0.454705
0.407736
0.675983
2384.79
188289
587564
A
A
A
A
A
B
A
A
A
...
0.690216
0.498919
0.33906
0.62542
0.73106
0.622276
0.609277
0.687115
0.360712
961.10
188290
587566
A
A
A
B
A
A
A
A
A
...
0.688705
0.490407
0.33906
0.62542
0.73106
0.622276
0.609277
0.687115
0.342155
2786.15
188291
587567
B
A
A
A
B
A
A
A
A
...
0.808048
0.694312
0.94145
0.64103
0.80218
0.745820
0.753252
0.717751
0.216113
2157.66
188292
587569
B
A
A
A
A
B
A
A
A
...
0.484775
0.480521
0.28768
0.42289
0.46119
0.430255
0.420899
0.282249
0.238973
644.29
188293
587570
A
A
A
B
A
A
A
A
A
...
0.850938
0.611159
0.68823
0.91644
0.83510
0.569745
0.576121
0.828258
0.243950
4301.82
188294
587572
A
A
A
B
A
B
A
A
A
...
0.197932
0.314927
0.41762
0.26401
0.23545
0.207238
0.204687
0.271571
0.813596
4446.20
188295
587573
A
B
A
A
A
A
A
A
B
...
0.651024
0.452181
0.33906
0.62542
0.69471
0.492200
0.481306
0.678452
0.382540
1996.00
188296
587574
A
A
A
A
B
A
B
A
A
...
0.625784
0.606340
0.51256
0.42084
0.57172
0.665172
0.651918
0.614594
0.836524
16569.90
188297
587575
A
B
A
A
A
A
A
A
B
...
0.448496
0.735978
0.36083
0.40657
0.40666
0.776962
0.800726
0.287682
0.804795
4620.56
188298
587578
A
A
B
A
A
B
A
A
A
...
0.415039
0.395131
0.24123
0.32865
0.40666
0.352419
0.345316
0.624025
0.290736
3201.50
188299
587579
A
A
A
A
B
A
A
A
A
...
0.563226
0.451570
0.54829
0.29618
0.36974
0.472726
0.462286
0.657761
0.239309
1946.11
188300
587580
A
A
A
B
A
A
A
A
A
...
0.835720
0.794598
0.53046
0.50840
0.67554
0.742852
0.729856
0.663739
0.804769
839.41
188301
587584
A
A
A
A
B
A
A
A
A
...
0.425928
0.636286
0.27797
0.50420
0.31003
0.742852
0.780521
0.333292
0.359434
896.57
188302
587592
A
A
A
B
B
A
A
A
A
...
0.349083
0.368005
0.41762
0.41675
0.39068
0.275431
0.270746
0.256038
0.313505
1667.38
188303
587595
A
B
A
B
A
A
A
A
B
...
0.806951
0.555567
0.74629
0.93383
0.78770
0.757468
0.772574
0.812550
0.843080
4003.79
188304
587601
A
A
A
A
B
A
A
B
A
...
0.437758
0.535749
0.54236
0.47900
0.51111
0.705501
0.692256
0.357400
0.283936
12065.38
188305
587602
A
A
A
A
A
B
A
A
A
...
0.674671
0.699628
0.30768
0.38249
0.69471
0.607500
0.594646
0.684242
0.383437
4958.36
188306
587603
A
B
A
A
B
A
A
A
B
...
0.728484
0.414750
0.30260
0.67135
0.83510
0.872013
0.879347
0.833874
0.708475
2594.72
188307
587605
B
A
A
A
A
B
A
A
A
...
0.599275
0.548122
0.48864
0.45391
0.64056
0.592525
0.590961
0.701266
0.362479
1173.30
188308
587606
A
A
A
A
B
A
A
A
A
...
0.201125
0.259395
0.24564
0.30859
0.21983
0.207238
0.204687
0.357400
0.348217
2161.12
188309
587607
A
B
A
B
B
B
A
A
B
...
0.269520
0.338963
0.33906
0.28066
0.30529
0.245410
0.261799
0.181433
0.398571
4080.42
188310
587611
A
B
A
A
B
A
A
A
B
...
0.186254
0.317274
0.27797
0.32128
0.24355
0.180456
0.178698
0.304350
0.381660
4659.57
188311
587612
A
A
A
A
B
A
A
A
A
...
0.502705
0.473897
0.43518
0.66201
0.58257
0.415029
0.406090
0.354344
0.377315
994.85
188312
587619
A
A
A
A
A
B
A
A
A
...
0.445008
0.377930
0.36636
0.29095
0.44467
0.327915
0.321570
0.731059
0.721499
804.28
188313
587620
A
B
A
A
A
A
A
A
B
...
0.242437
0.289949
0.24564
0.30859
0.32935
0.223038
0.220003
0.333292
0.208216
1198.62
188314
587624
A
A
A
A
A
B
A
A
A
...
0.334270
0.382000
0.63475
0.40455
0.47779
0.307628
0.301921
0.318646
0.305872
1108.34
188315
587630
A
B
A
A
A
A
A
B
B
...
0.345883
0.370534
0.24564
0.45808
0.47779
0.445614
0.443374
0.339244
0.503888
5762.64
188316
587632
A
B
A
A
A
A
A
A
B
...
0.704364
0.562866
0.34987
0.44767
0.53881
0.863052
0.852865
0.654753
0.721707
1562.87
188317
587633
B
A
A
B
A
A
A
A
A
...
0.844563
0.533048
0.97123
0.93383
0.83814
0.932195
0.946432
0.810511
0.721460
4751.72
188318 rows × 132 columns
In [20]:
X_train_orig.columns
Out[20]:
Index([u'id', u'cat1', u'cat2', u'cat3', u'cat4', u'cat5', u'cat6', u'cat7',
u'cat8', u'cat9',
...
u'cont6', u'cont7', u'cont8', u'cont9', u'cont10', u'cont11', u'cont12',
u'cont13', u'cont14', u'loss'],
dtype='object', length=132)
In [21]:
X_train_orig.describe()
Out[21]:
id
cont1
cont2
cont3
cont4
cont5
cont6
cont7
cont8
cont9
cont10
cont11
cont12
cont13
cont14
loss
count
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
mean
294135.982561
0.493861
0.507188
0.498918
0.491812
0.487428
0.490945
0.484970
0.486437
0.485506
0.498066
0.493511
0.493150
0.493138
0.495717
3037.337686
std
169336.084867
0.187640
0.207202
0.202105
0.211292
0.209027
0.205273
0.178450
0.199370
0.181660
0.185877
0.209737
0.209427
0.212777
0.222488
2904.086186
min
1.000000
0.000016
0.001149
0.002634
0.176921
0.281143
0.012683
0.069503
0.236880
0.000080
0.000000
0.035321
0.036232
0.000228
0.179722
0.670000
25%
147748.250000
0.346090
0.358319
0.336963
0.327354
0.281143
0.336105
0.350175
0.312800
0.358970
0.364580
0.310961
0.311661
0.315758
0.294610
1204.460000
50%
294539.500000
0.475784
0.555782
0.527991
0.452887
0.422268
0.440945
0.438285
0.441060
0.441450
0.461190
0.457203
0.462286
0.363547
0.407403
2115.570000
75%
440680.500000
0.623912
0.681761
0.634224
0.652072
0.643315
0.655021
0.591045
0.623580
0.566820
0.614590
0.678924
0.675759
0.689974
0.724623
3864.045000
max
587633.000000
0.984975
0.862654
0.944251
0.954297
0.983674
0.997162
1.000000
0.980200
0.995400
0.994980
0.998742
0.998484
0.988494
0.844848
121012.250000
In [22]:
X_train_num = X_train_orig.copy()
X_test_num = X_test_orig.copy()
X_train_num.drop('id', axis=1, inplace=True)
X_test_num.drop('id', axis=1, inplace=True)
In [23]:
feats = [x for x in X_train_orig.keys() if 'cat' in x]
L_MAX_32_BIT_INT = (1<<31) - 1
def chash(value):
val = hashlib.sha256(str(value).encode('utf-8')).hexdigest()
return int(val, 16) & L_MAX_32_BIT_INT
def hashfeats(df, feats, hash_feature_size=13):
def hashf(v):
return chash(v) % hash_feature_size
data = []
for r in range(0, df.shape[0]):
values = np.zeros(hash_feature_size)
for feat in feats:
k = hashf(df[feat][r])
values[k] = values[k] + 1
data.append(int(''.join([str(int(x)) for x in values])))
df['hf'] = pd.Series(data, index=df.index)
hashfeats(X_train_num, feats)
hashfeats(X_test_num, feats)
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-23-cc21628dded7> in <module>()
25 df['hf'] = pd.Series(data, index=df.index)
26
---> 27 hashfeats(X_train_num, feats)
28 hashfeats(X_test_num, feats)
<ipython-input-23-cc21628dded7> in hashfeats(df, feats, hash_feature_size)
19 values = np.zeros(hash_feature_size)
20 for feat in feats:
---> 21 k = hashf(df[feat][r])
22 values[k] = values[k] + 1
23 data.append(int(''.join([str(int(x)) for x in values])))
/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
599 key = com._apply_if_callable(key, self)
600 try:
--> 601 result = self.index.get_value(self, key)
602
603 if not is_scalar(result):
/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_value(self, series, key)
2164 k = _values_from_object(key)
2165
-> 2166 k = self._convert_scalar_indexer(k, kind='getitem')
2167 try:
2168 return self._engine.get_value(s, k,
/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/indexes/numeric.pyc in _convert_scalar_indexer(self, key, kind)
156 # don't coerce ilocs to integers
157 if kind != 'iloc':
--> 158 key = self._maybe_cast_indexer(key)
159 return (super(Int64Index, self)
160 ._convert_scalar_indexer(key, kind=kind))
KeyboardInterrupt:
In [10]:
X_train_num.drop(feats, inplace=True, axis=1)
X_test_num.drop(feats, inplace=True, axis=1)
X_train_num.columns
Out[10]:
Index([u'cont1', u'cont2', u'cont3', u'cont4', u'cont5', u'cont6', u'cont7',
u'cont8', u'cont9', u'cont10', u'cont11', u'cont12', u'cont13',
u'cont14', u'loss', u'hf'],
dtype='object')
In [11]:
X_train_num.describe()
Out[11]:
cont1
cont2
cont3
cont4
cont5
cont6
cont7
cont8
cont9
cont10
cont11
cont12
cont13
cont14
loss
hf
count
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
188318.000000
1.883180e+05
mean
0.493861
0.507188
0.498918
0.491812
0.487428
0.490945
0.484970
0.486437
0.485506
0.498066
0.493511
0.493150
0.493138
0.495717
3037.337686
1.152495e+14
std
0.187640
0.207202
0.202105
0.211292
0.209027
0.205273
0.178450
0.199370
0.181660
0.185877
0.209737
0.209427
0.212777
0.222488
2904.086186
1.248313e+14
min
0.000016
0.001149
0.002634
0.176921
0.281143
0.012683
0.069503
0.236880
0.000080
0.000000
0.035321
0.036232
0.000228
0.179722
0.670000
8.430069e+09
25%
0.346090
0.358319
0.336963
0.327354
0.281143
0.336105
0.350175
0.312800
0.358970
0.364580
0.310961
0.311661
0.315758
0.294610
1204.460000
2.005861e+13
50%
0.475784
0.555782
0.527991
0.452887
0.422268
0.440945
0.438285
0.441060
0.441450
0.461190
0.457203
0.462286
0.363547
0.407403
2115.570000
1.101833e+14
75%
0.623912
0.681761
0.634224
0.652072
0.643315
0.655021
0.591045
0.623580
0.566820
0.614590
0.678924
0.675759
0.689974
0.724623
3864.045000
2.001852e+14
max
0.984975
0.862654
0.944251
0.954297
0.983674
0.997162
1.000000
0.980200
0.995400
0.994980
0.998742
0.998484
0.988494
0.844848
121012.250000
4.200702e+15
In [15]:
X_test_num.describe()
Out[15]:
cont1
cont2
cont3
cont4
cont5
cont6
cont7
cont8
cont9
cont10
...
cont1-cont5
cont1-cont6
cont1-cont7
cont1-cont8
cont1-cont9
cont1-cont10
cont1-cont11
cont1-cont12
cont1-cont13
cont1-cont14
count
125546.000000
125546.000000
125546.000000
125546.000000
125546.000000
125546.000000
125546.000000
125546.000000
125546.000000
125546.000000
...
125546.000000
1.255460e+05
125546.000000
125546.000000
1.255460e+05
125546.000000
1.255460e+05
1.255460e+05
1.255460e+05
125546.000000
mean
0.494447
0.506939
0.498255
0.492334
0.487640
0.492188
0.485945
0.487401
0.486015
0.498909
...
0.240173
2.726631e-01
0.252613
0.254561
2.722084e-01
0.274926
2.680390e-01
2.685777e-01
2.661135e-01
0.247440
std
0.187961
0.206837
0.201746
0.210815
0.209119
0.205574
0.178650
0.199549
0.182134
0.185950
...
0.136126
2.034714e-01
0.150466
0.179247
2.099015e-01
0.194088
1.873743e-01
1.896551e-01
1.996170e-01
0.156191
min
0.000016
0.001149
0.002634
0.176921
0.281143
0.012683
0.069503
0.236880
0.000080
0.000000
...
0.000004
2.029280e-07
0.000001
0.000004
1.280000e-09
0.000000
5.651360e-07
5.797120e-07
3.648000e-09
0.000003
25%
0.347403
0.358319
0.336963
0.327354
0.281143
0.336105
0.352087
0.317960
0.358970
0.364580
...
0.135799
1.323890e-01
0.137707
0.132376
1.212191e-01
0.130200
1.222663e-01
1.221549e-01
1.252380e-01
0.125863
50%
0.475784
0.555782
0.527991
0.452887
0.422268
0.441525
0.438893
0.441060
0.441450
0.466720
...
0.208022
2.053983e-01
0.222403
0.196415
2.097371e-01
0.218093
2.228239e-01
2.211248e-01
1.874492e-01
0.206353
75%
0.626630
0.681761
0.634224
0.652072
0.643315
0.659261
0.591284
0.629180
0.568890
0.619840
...
0.317955
3.631238e-01
0.327785
0.320391
3.420890e-01
0.348750
3.726575e-01
3.687722e-01
3.578798e-01
0.336209
max
0.984975
0.862654
0.944251
0.956046
0.983107
0.997162
1.000000
0.982800
0.995400
0.994980
...
0.876207
9.818093e-01
0.893101
0.960450
9.788583e-01
0.979567
9.272702e-01
9.298775e-01
9.073386e-01
0.828459
8 rows × 29 columns
In [16]:
# for i in [k for k in X_train_num.keys() if k not in ('hf', 'loss')]:
# for j in [k for k in X_train_num.keys() if k not in ('hf', 'loss')]:
# X_train_num[str(i+'-'+j)] = X_train_num[i]*X_train_num[j]
# X_test_num[str(i+'-'+j)] = X_test_num[i]*X_test_num[j]
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-16-1889526d0f1d> in <module>()
2 for j in [k for k in X_train_num.keys() if k not in ('hf', 'loss')]:
3 X_train_num[str(i+'-'+j)] = X_train_num[i]*X_train_num[j]
----> 4 X_test_num[str(i+'-'+j)] = X_test_num[i]*X_test_num[j]
/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
2057 return self._getitem_multilevel(key)
2058 else:
-> 2059 return self._getitem_column(key)
2060
2061 def _getitem_column(self, key):
/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
2064 # get column
2065 if self.columns.is_unique:
-> 2066 return self._get_item_cache(key)
2067
2068 # duplicate columns & possible reduce dimensionality
/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
1384 res = cache.get(item)
1385 if res is None:
-> 1386 values = self._data.get(item)
1387 res = self._box_item_values(item, values)
1388 cache[item] = res
/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
3539
3540 if not isnull(item):
-> 3541 loc = self.items.get_loc(item)
3542 else:
3543 indexer = np.arange(len(self.items))[isnull(self.items)]
/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_loc(self, key, method, tolerance)
2134 return self._engine.get_loc(key)
2135 except KeyError:
-> 2136 return self._engine.get_loc(self._maybe_cast_indexer(key))
2137
2138 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4443)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4289)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13733)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13687)()
KeyError: 'cont1-loss'
In [ ]:
X_test_num.describe()
In [ ]:
_index_train = []
_index_test = []
for i in range(0, X_train_num.shape[0]):
if np.random.binomial(1, 0.75, 1):
_index_train.append(i)
else:
_index_test.append(i)
print('Training: {}\nTesting: {}'.format(len(_index_train)/float(X_train_num.shape[0]), len(_index_test)/float(X_train_num.shape[0])))
In [ ]:
X_validation_df = X_test_num.copy()
X_train_df = X_train_num.iloc[_index_train,:]
X_test_df = X_train_num.iloc[_index_test,:]
In [ ]:
#PCA
from sklearn.decomposition import PCA
X_train = X_train_df.drop('loss', inplace=False, axis=1).as_matrix()
Y_train = X_train_df['loss'].as_matrix()
pca = PCA()
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
In [ ]:
pca.explained_variance_ratio_[:6]
In [ ]:
n_pca_components = 6
In [ ]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression(fit_intercept=True, normalize=True)
In [ ]:
lm.fit(X_train_pca[:,:n_pca_components], Y_train.ravel())
In [ ]:
lm.score(X_train_pca[:,:n_pca_components], Y_train.ravel())
In [ ]:
X_test = X_test_df.drop('loss', inplace=False, axis=1).as_matrix()
Y_test = X_test_df['loss'].as_matrix()
pca.fit(X_test)
X_test_pca = pca.transform(X_test)
In [ ]:
lm.score(X_test_pca[:,:n_pca_components], Y_test.ravel())
In [ ]:
X_validate = X_validation_df.as_matrix()
pca.fit(X_validate)
X_validate_pca = pca.transform(X_validate)
predictions = lm.predict(X_validate_pca[:,:n_pca_components])
In [ ]:
predictions
In [ ]:
import csv
with open('predictions-04.csv', 'w') as fp:
writer = csv.writer(fp)
writer.writerow(['id','loss'])
for (row, val) in zip(X_test_orig['id'].tolist(), predictions):
writer.writerow([row, val])
0.089751837048630212, 0.087374891924143272