In [1]:
import random
import collections
import hashlib
    
import pandas as pd
import numpy as np

In [2]:
np.random.seed(0)

In [18]:
X_train_orig = pd.read_csv("train.csv")
X_test_orig = pd.read_csv("test.csv")

In [19]:
X_train_orig


Out[19]:
id cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 ... cont6 cont7 cont8 cont9 cont10 cont11 cont12 cont13 cont14 loss
0 1 A B A B A A A A B ... 0.718367 0.335060 0.30260 0.67135 0.83510 0.569745 0.594646 0.822493 0.714843 2213.18
1 2 A B A A A A A A B ... 0.438917 0.436585 0.60087 0.35127 0.43919 0.338312 0.366307 0.611431 0.304496 1283.60
2 5 A B A A B A A A B ... 0.289648 0.315545 0.27320 0.26076 0.32446 0.381398 0.373424 0.195709 0.774425 3005.09
3 10 B B A B A A A A B ... 0.440945 0.391128 0.31796 0.32128 0.44467 0.327915 0.321570 0.605077 0.602642 939.85
4 11 A B A B A A A A B ... 0.178193 0.247408 0.24564 0.22089 0.21230 0.204687 0.202213 0.246011 0.432606 2763.85
5 13 A B A A A A A A B ... 0.364464 0.401162 0.26847 0.46226 0.50556 0.366788 0.359249 0.345247 0.726792 5142.87
6 14 A A A A B A A A A ... 0.381515 0.363768 0.24564 0.40455 0.47225 0.334828 0.352251 0.342239 0.382931 1132.22
7 20 A B A B A A A A B ... 0.867021 0.583389 0.90267 0.84847 0.80218 0.644013 0.785706 0.859764 0.242416 3585.75
8 23 A B B B B A A A B ... 0.628534 0.384099 0.61229 0.38249 0.51111 0.682315 0.669033 0.756454 0.361191 10280.20
9 24 A B A A B B A A B ... 0.713343 0.469223 0.30260 0.67135 0.83510 0.863052 0.879347 0.822493 0.294523 6184.59
10 25 A B A A A A A A B ... 0.429383 0.877905 0.39455 0.53565 0.50556 0.550529 0.538473 0.336261 0.715009 6396.85
11 33 A B A A B A A A B ... 0.314683 0.370419 0.58354 0.46226 0.38016 0.644013 0.665644 0.339244 0.799124 5965.73
12 34 B A A A B A A A A ... 0.408772 0.363312 0.32843 0.32128 0.44467 0.327915 0.321570 0.605077 0.818358 1193.05
13 41 B A A A B B A A A ... 0.241574 0.255339 0.58934 0.32496 0.26029 0.257148 0.253044 0.276878 0.477578 1071.77
14 47 A A A A B A A A A ... 0.894903 0.586433 0.80058 0.93383 0.78770 0.880469 0.871011 0.822493 0.251278 585.18
15 48 A A A A B B A A A ... 0.570733 0.547756 0.80438 0.44352 0.63026 0.385085 0.377003 0.516660 0.340325 1395.45
16 49 A B B A A A A A B ... 0.411902 0.593548 0.31796 0.38846 0.48889 0.457203 0.447145 0.301535 0.205651 6609.32
17 51 A A A A A B A A A ... 0.688705 0.437192 0.67263 0.83505 0.59334 0.678924 0.665644 0.684242 0.407411 2658.70
18 52 A A B A A B A A A ... 0.443265 0.637086 0.36636 0.52938 0.39068 0.678924 0.665644 0.304350 0.310796 4167.32
19 55 A A A B A A A A A ... 0.436312 0.544355 0.48864 0.36285 0.20496 0.388786 0.406090 0.648701 0.830931 3797.89
20 57 B B A B A A A A B ... 0.441525 0.437192 0.31796 0.32128 0.44467 0.377724 0.369858 0.605077 0.743810 1155.48
21 60 A A A B A B A A A ... 0.349885 0.381185 0.81542 0.32311 0.36458 0.453334 0.454705 0.651733 0.354002 891.14
22 61 B A A A B B A A A ... 0.183243 0.253560 0.40028 0.21374 0.19431 0.167024 0.165648 0.404520 0.725941 765.97
23 66 B A A B A A A A A ... 0.373500 0.381883 0.36083 0.44352 0.45017 0.338312 0.366307 0.339244 0.793518 771.58
24 73 B A A A A A A A A ... 0.382070 0.451203 0.33906 0.47900 0.54433 0.812519 0.800726 0.246011 0.215055 7256.49
25 76 A A A B A A A A A ... 0.592478 0.496452 0.29758 0.46226 0.51111 0.434083 0.424625 0.357400 0.311644 1528.73
26 86 A A A A A B A A A ... 0.435733 0.769905 0.60087 0.40252 0.28677 0.550529 0.538473 0.298734 0.698006 4787.07
27 89 B A A B A A A A A ... 0.373500 0.356037 0.36083 0.44352 0.45017 0.291268 0.295524 0.339244 0.804795 2163.97
28 90 A B A B A B A A B ... 0.671307 0.464924 0.33906 0.62542 0.66076 0.607500 0.594646 0.678452 0.285224 11673.03
29 93 A A A A B A A A A ... 0.557431 0.402942 0.34445 0.52728 0.79139 0.377724 0.369858 0.687115 0.297788 1753.50
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
188288 587563 A A A B A A A A A ... 0.482425 0.414750 0.67263 0.51890 0.60401 0.464956 0.454705 0.407736 0.675983 2384.79
188289 587564 A A A A A B A A A ... 0.690216 0.498919 0.33906 0.62542 0.73106 0.622276 0.609277 0.687115 0.360712 961.10
188290 587566 A A A B A A A A A ... 0.688705 0.490407 0.33906 0.62542 0.73106 0.622276 0.609277 0.687115 0.342155 2786.15
188291 587567 B A A A B A A A A ... 0.808048 0.694312 0.94145 0.64103 0.80218 0.745820 0.753252 0.717751 0.216113 2157.66
188292 587569 B A A A A B A A A ... 0.484775 0.480521 0.28768 0.42289 0.46119 0.430255 0.420899 0.282249 0.238973 644.29
188293 587570 A A A B A A A A A ... 0.850938 0.611159 0.68823 0.91644 0.83510 0.569745 0.576121 0.828258 0.243950 4301.82
188294 587572 A A A B A B A A A ... 0.197932 0.314927 0.41762 0.26401 0.23545 0.207238 0.204687 0.271571 0.813596 4446.20
188295 587573 A B A A A A A A B ... 0.651024 0.452181 0.33906 0.62542 0.69471 0.492200 0.481306 0.678452 0.382540 1996.00
188296 587574 A A A A B A B A A ... 0.625784 0.606340 0.51256 0.42084 0.57172 0.665172 0.651918 0.614594 0.836524 16569.90
188297 587575 A B A A A A A A B ... 0.448496 0.735978 0.36083 0.40657 0.40666 0.776962 0.800726 0.287682 0.804795 4620.56
188298 587578 A A B A A B A A A ... 0.415039 0.395131 0.24123 0.32865 0.40666 0.352419 0.345316 0.624025 0.290736 3201.50
188299 587579 A A A A B A A A A ... 0.563226 0.451570 0.54829 0.29618 0.36974 0.472726 0.462286 0.657761 0.239309 1946.11
188300 587580 A A A B A A A A A ... 0.835720 0.794598 0.53046 0.50840 0.67554 0.742852 0.729856 0.663739 0.804769 839.41
188301 587584 A A A A B A A A A ... 0.425928 0.636286 0.27797 0.50420 0.31003 0.742852 0.780521 0.333292 0.359434 896.57
188302 587592 A A A B B A A A A ... 0.349083 0.368005 0.41762 0.41675 0.39068 0.275431 0.270746 0.256038 0.313505 1667.38
188303 587595 A B A B A A A A B ... 0.806951 0.555567 0.74629 0.93383 0.78770 0.757468 0.772574 0.812550 0.843080 4003.79
188304 587601 A A A A B A A B A ... 0.437758 0.535749 0.54236 0.47900 0.51111 0.705501 0.692256 0.357400 0.283936 12065.38
188305 587602 A A A A A B A A A ... 0.674671 0.699628 0.30768 0.38249 0.69471 0.607500 0.594646 0.684242 0.383437 4958.36
188306 587603 A B A A B A A A B ... 0.728484 0.414750 0.30260 0.67135 0.83510 0.872013 0.879347 0.833874 0.708475 2594.72
188307 587605 B A A A A B A A A ... 0.599275 0.548122 0.48864 0.45391 0.64056 0.592525 0.590961 0.701266 0.362479 1173.30
188308 587606 A A A A B A A A A ... 0.201125 0.259395 0.24564 0.30859 0.21983 0.207238 0.204687 0.357400 0.348217 2161.12
188309 587607 A B A B B B A A B ... 0.269520 0.338963 0.33906 0.28066 0.30529 0.245410 0.261799 0.181433 0.398571 4080.42
188310 587611 A B A A B A A A B ... 0.186254 0.317274 0.27797 0.32128 0.24355 0.180456 0.178698 0.304350 0.381660 4659.57
188311 587612 A A A A B A A A A ... 0.502705 0.473897 0.43518 0.66201 0.58257 0.415029 0.406090 0.354344 0.377315 994.85
188312 587619 A A A A A B A A A ... 0.445008 0.377930 0.36636 0.29095 0.44467 0.327915 0.321570 0.731059 0.721499 804.28
188313 587620 A B A A A A A A B ... 0.242437 0.289949 0.24564 0.30859 0.32935 0.223038 0.220003 0.333292 0.208216 1198.62
188314 587624 A A A A A B A A A ... 0.334270 0.382000 0.63475 0.40455 0.47779 0.307628 0.301921 0.318646 0.305872 1108.34
188315 587630 A B A A A A A B B ... 0.345883 0.370534 0.24564 0.45808 0.47779 0.445614 0.443374 0.339244 0.503888 5762.64
188316 587632 A B A A A A A A B ... 0.704364 0.562866 0.34987 0.44767 0.53881 0.863052 0.852865 0.654753 0.721707 1562.87
188317 587633 B A A B A A A A A ... 0.844563 0.533048 0.97123 0.93383 0.83814 0.932195 0.946432 0.810511 0.721460 4751.72

188318 rows × 132 columns


In [20]:
X_train_orig.columns


Out[20]:
Index([u'id', u'cat1', u'cat2', u'cat3', u'cat4', u'cat5', u'cat6', u'cat7',
       u'cat8', u'cat9',
       ...
       u'cont6', u'cont7', u'cont8', u'cont9', u'cont10', u'cont11', u'cont12',
       u'cont13', u'cont14', u'loss'],
      dtype='object', length=132)

In [21]:
X_train_orig.describe()


Out[21]:
id cont1 cont2 cont3 cont4 cont5 cont6 cont7 cont8 cont9 cont10 cont11 cont12 cont13 cont14 loss
count 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000
mean 294135.982561 0.493861 0.507188 0.498918 0.491812 0.487428 0.490945 0.484970 0.486437 0.485506 0.498066 0.493511 0.493150 0.493138 0.495717 3037.337686
std 169336.084867 0.187640 0.207202 0.202105 0.211292 0.209027 0.205273 0.178450 0.199370 0.181660 0.185877 0.209737 0.209427 0.212777 0.222488 2904.086186
min 1.000000 0.000016 0.001149 0.002634 0.176921 0.281143 0.012683 0.069503 0.236880 0.000080 0.000000 0.035321 0.036232 0.000228 0.179722 0.670000
25% 147748.250000 0.346090 0.358319 0.336963 0.327354 0.281143 0.336105 0.350175 0.312800 0.358970 0.364580 0.310961 0.311661 0.315758 0.294610 1204.460000
50% 294539.500000 0.475784 0.555782 0.527991 0.452887 0.422268 0.440945 0.438285 0.441060 0.441450 0.461190 0.457203 0.462286 0.363547 0.407403 2115.570000
75% 440680.500000 0.623912 0.681761 0.634224 0.652072 0.643315 0.655021 0.591045 0.623580 0.566820 0.614590 0.678924 0.675759 0.689974 0.724623 3864.045000
max 587633.000000 0.984975 0.862654 0.944251 0.954297 0.983674 0.997162 1.000000 0.980200 0.995400 0.994980 0.998742 0.998484 0.988494 0.844848 121012.250000

In [22]:
X_train_num = X_train_orig.copy()
X_test_num = X_test_orig.copy()

X_train_num.drop('id', axis=1, inplace=True)
X_test_num.drop('id', axis=1, inplace=True)

In [23]:
feats = [x for x in X_train_orig.keys() if 'cat' in x]

L_MAX_32_BIT_INT = (1<<31) - 1

def chash(value):
    
    val = hashlib.sha256(str(value).encode('utf-8')).hexdigest()

    return int(val, 16) & L_MAX_32_BIT_INT

def hashfeats(df, feats, hash_feature_size=13):

    def hashf(v):
        return chash(v) % hash_feature_size
    
    data = []
    
    for r in range(0, df.shape[0]):
        values = np.zeros(hash_feature_size)
        for feat in feats:
            k = hashf(df[feat][r])
            values[k] =  values[k] + 1
        data.append(int(''.join([str(int(x)) for x in values])))
   
    df['hf'] = pd.Series(data, index=df.index)
            
hashfeats(X_train_num, feats)
hashfeats(X_test_num, feats)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-23-cc21628dded7> in <module>()
     25     df['hf'] = pd.Series(data, index=df.index)
     26 
---> 27 hashfeats(X_train_num, feats)
     28 hashfeats(X_test_num, feats)

<ipython-input-23-cc21628dded7> in hashfeats(df, feats, hash_feature_size)
     19         values = np.zeros(hash_feature_size)
     20         for feat in feats:
---> 21             k = hashf(df[feat][r])
     22             values[k] =  values[k] + 1
     23         data.append(int(''.join([str(int(x)) for x in values])))

/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
    599         key = com._apply_if_callable(key, self)
    600         try:
--> 601             result = self.index.get_value(self, key)
    602 
    603             if not is_scalar(result):

/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_value(self, series, key)
   2164         k = _values_from_object(key)
   2165 
-> 2166         k = self._convert_scalar_indexer(k, kind='getitem')
   2167         try:
   2168             return self._engine.get_value(s, k,

/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/indexes/numeric.pyc in _convert_scalar_indexer(self, key, kind)
    156         # don't coerce ilocs to integers
    157         if kind != 'iloc':
--> 158             key = self._maybe_cast_indexer(key)
    159         return (super(Int64Index, self)
    160                 ._convert_scalar_indexer(key, kind=kind))

KeyboardInterrupt: 
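The interrupt above is a speed problem rather than a logic problem: the nested loop does one Series lookup per row per categorical column (roughly 188k rows × 116 cat columns for the training set alone). A minimal vectorized sketch of the same hashed-count feature, assuming the chash helper above and keeping the same digit-string encoding (hashfeats_fast is a hypothetical name, not from the original notebook):

def hashfeats_fast(df, feats, hash_feature_size=13):
    # Hash each distinct level once per column, then map the codes onto the rows.
    hashed = pd.DataFrame(index=df.index)
    for feat in feats:
        mapping = {v: chash(v) % hash_feature_size for v in df[feat].unique()}
        hashed[feat] = df[feat].map(mapping)

    def row_to_int(row):
        # Same bucket-count digit string as the original hashfeats.
        counts = np.bincount(row.values, minlength=hash_feature_size)
        return int(''.join(str(int(c)) for c in counts))

    df['hf'] = hashed.apply(row_to_int, axis=1)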

In [10]:
X_train_num.drop(feats, inplace=True, axis=1)
X_test_num.drop(feats, inplace=True, axis=1)

X_train_num.columns


Out[10]:
Index([u'cont1', u'cont2', u'cont3', u'cont4', u'cont5', u'cont6', u'cont7',
       u'cont8', u'cont9', u'cont10', u'cont11', u'cont12', u'cont13',
       u'cont14', u'loss', u'hf'],
      dtype='object')

In [11]:
X_train_num.describe()


Out[11]:
cont1 cont2 cont3 cont4 cont5 cont6 cont7 cont8 cont9 cont10 cont11 cont12 cont13 cont14 loss hf
count 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 188318.000000 1.883180e+05
mean 0.493861 0.507188 0.498918 0.491812 0.487428 0.490945 0.484970 0.486437 0.485506 0.498066 0.493511 0.493150 0.493138 0.495717 3037.337686 1.152495e+14
std 0.187640 0.207202 0.202105 0.211292 0.209027 0.205273 0.178450 0.199370 0.181660 0.185877 0.209737 0.209427 0.212777 0.222488 2904.086186 1.248313e+14
min 0.000016 0.001149 0.002634 0.176921 0.281143 0.012683 0.069503 0.236880 0.000080 0.000000 0.035321 0.036232 0.000228 0.179722 0.670000 8.430069e+09
25% 0.346090 0.358319 0.336963 0.327354 0.281143 0.336105 0.350175 0.312800 0.358970 0.364580 0.310961 0.311661 0.315758 0.294610 1204.460000 2.005861e+13
50% 0.475784 0.555782 0.527991 0.452887 0.422268 0.440945 0.438285 0.441060 0.441450 0.461190 0.457203 0.462286 0.363547 0.407403 2115.570000 1.101833e+14
75% 0.623912 0.681761 0.634224 0.652072 0.643315 0.655021 0.591045 0.623580 0.566820 0.614590 0.678924 0.675759 0.689974 0.724623 3864.045000 2.001852e+14
max 0.984975 0.862654 0.944251 0.954297 0.983674 0.997162 1.000000 0.980200 0.995400 0.994980 0.998742 0.998484 0.988494 0.844848 121012.250000 4.200702e+15

In [15]:
X_test_num.describe()


Out[15]:
cont1 cont2 cont3 cont4 cont5 cont6 cont7 cont8 cont9 cont10 ... cont1-cont5 cont1-cont6 cont1-cont7 cont1-cont8 cont1-cont9 cont1-cont10 cont1-cont11 cont1-cont12 cont1-cont13 cont1-cont14
count 125546.000000 125546.000000 125546.000000 125546.000000 125546.000000 125546.000000 125546.000000 125546.000000 125546.000000 125546.000000 ... 125546.000000 1.255460e+05 125546.000000 125546.000000 1.255460e+05 125546.000000 1.255460e+05 1.255460e+05 1.255460e+05 125546.000000
mean 0.494447 0.506939 0.498255 0.492334 0.487640 0.492188 0.485945 0.487401 0.486015 0.498909 ... 0.240173 2.726631e-01 0.252613 0.254561 2.722084e-01 0.274926 2.680390e-01 2.685777e-01 2.661135e-01 0.247440
std 0.187961 0.206837 0.201746 0.210815 0.209119 0.205574 0.178650 0.199549 0.182134 0.185950 ... 0.136126 2.034714e-01 0.150466 0.179247 2.099015e-01 0.194088 1.873743e-01 1.896551e-01 1.996170e-01 0.156191
min 0.000016 0.001149 0.002634 0.176921 0.281143 0.012683 0.069503 0.236880 0.000080 0.000000 ... 0.000004 2.029280e-07 0.000001 0.000004 1.280000e-09 0.000000 5.651360e-07 5.797120e-07 3.648000e-09 0.000003
25% 0.347403 0.358319 0.336963 0.327354 0.281143 0.336105 0.352087 0.317960 0.358970 0.364580 ... 0.135799 1.323890e-01 0.137707 0.132376 1.212191e-01 0.130200 1.222663e-01 1.221549e-01 1.252380e-01 0.125863
50% 0.475784 0.555782 0.527991 0.452887 0.422268 0.441525 0.438893 0.441060 0.441450 0.466720 ... 0.208022 2.053983e-01 0.222403 0.196415 2.097371e-01 0.218093 2.228239e-01 2.211248e-01 1.874492e-01 0.206353
75% 0.626630 0.681761 0.634224 0.652072 0.643315 0.659261 0.591284 0.629180 0.568890 0.619840 ... 0.317955 3.631238e-01 0.327785 0.320391 3.420890e-01 0.348750 3.726575e-01 3.687722e-01 3.578798e-01 0.336209
max 0.984975 0.862654 0.944251 0.956046 0.983107 0.997162 1.000000 0.982800 0.995400 0.994980 ... 0.876207 9.818093e-01 0.893101 0.960450 9.788583e-01 0.979567 9.272702e-01 9.298775e-01 9.073386e-01 0.828459

8 rows × 29 columns


In [16]:
# for i in [k for k in X_train_num.keys() if k not in ('hf', 'loss')]:
#     for j in [k for k in X_train_num.keys() if k not in ('hf', 'loss')]:
#         X_train_num[str(i+'-'+j)] = X_train_num[i]*X_train_num[j]
#         X_test_num[str(i+'-'+j)] = X_test_num[i]*X_test_num[j]


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-16-1889526d0f1d> in <module>()
      2     for j in [k for k in X_train_num.keys() if k not in ('hf', 'loss')]:
      3         X_train_num[str(i+'-'+j)] = X_train_num[i]*X_train_num[j]
----> 4         X_test_num[str(i+'-'+j)] = X_test_num[i]*X_test_num[j]

/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   3539 
   3540             if not isnull(item):
-> 3541                 loc = self.items.get_loc(item)
   3542             else:
   3543                 indexer = np.arange(len(self.items))[isnull(self.items)]

/Users/guilherme/anaconda/envs/ipy2/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4443)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4289)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13733)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13687)()

KeyError: 'cont1-loss'
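The KeyError is a column-bookkeeping issue: the loop crosses every key of X_train_num, which at this point includes names (interaction columns left over from an earlier run, and anything built from 'loss') that were never created in X_test_num. A hedged sketch that only crosses the continuous columns shared by both frames (the column filter is an assumption on my part):

# Cross only the continuous features present in both frames; never the target
# or the hashed-feature column.
base_cols = [c for c in X_train_num.columns
             if c.startswith('cont') and c in X_test_num.columns]

for i in base_cols:
    for j in base_cols:
        name = i + '-' + j
        X_train_num[name] = X_train_num[i] * X_train_num[j]
        X_test_num[name] = X_test_num[i] * X_test_num[j]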

In [ ]:
X_test_num.describe()

In [ ]:
_index_train = []
_index_test = []

for i in range(0, X_train_num.shape[0]):
    if np.random.binomial(1, 0.75, 1):
        _index_train.append(i)
    else:
        _index_test.append(i)

print('Training: {}\nTesting: {}'.format(len(_index_train)/float(X_train_num.shape[0]), len(_index_test)/float(X_train_num.shape[0])))
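The same 75/25 split can be drawn in a single call instead of a Python loop over 188k rows; a small sketch with a numpy mask, equivalent in expectation to the per-row binomial draw above:

# One Bernoulli draw per row, vectorised.
mask = np.random.rand(X_train_num.shape[0]) < 0.75
_index_train = np.where(mask)[0].tolist()
_index_test = np.where(~mask)[0].tolist()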

In [ ]:
X_validation_df = X_test_num.copy()
X_train_df = X_train_num.iloc[_index_train,:]
X_test_df = X_train_num.iloc[_index_test,:]

In [ ]:
#PCA
from sklearn.decomposition import PCA

X_train = X_train_df.drop('loss', inplace=False, axis=1).as_matrix()
Y_train = X_train_df['loss'].as_matrix()

pca = PCA()
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
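One thing to be aware of here: the 'hf' column is on the order of 1e14 while the cont features live in [0, 1], so an unscaled PCA is dominated almost entirely by 'hf'. A sketch that standardizes before extracting components (StandardScaler is my addition, not part of the original pipeline):

from sklearn.preprocessing import StandardScaler

# Put every column on a comparable scale so no single feature dominates the components.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)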

In [ ]:
pca.explained_variance_ratio_[:6]
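Rather than eyeballing the first six ratios and hard-coding 6 in the next cell, the component count can be derived from the cumulative explained variance; a sketch (the 0.95 threshold is an arbitrary assumption):

# Smallest number of components whose cumulative explained variance reaches 95%.
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_pca_components = int(np.searchsorted(cum_var, 0.95)) + 1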

In [ ]:
n_pca_components = 6

In [ ]:
from sklearn.linear_model import LinearRegression

lm = LinearRegression(fit_intercept=True, normalize=True)

In [ ]:
lm.fit(X_train_pca[:,:n_pca_components], Y_train.ravel())

In [ ]:
lm.score(X_train_pca[:,:n_pca_components], Y_train.ravel())

In [ ]:
X_test = X_test_df.drop('loss', inplace=False, axis=1).as_matrix()
Y_test = X_test_df['loss'].as_matrix()

# Reuse the PCA fitted on the training split; re-fitting on the held-out rows
# would produce components that no longer match what the regression was trained on.
X_test_pca = pca.transform(X_test)

In [ ]:
lm.score(X_test_pca[:,:n_pca_components], Y_test.ravel())

In [ ]:
X_validate = X_validation_df.as_matrix()

# Project with the training-set PCA here as well, instead of re-fitting on the
# submission data (X_validation_df must expose the same feature columns as X_train).
X_validate_pca = pca.transform(X_validate)

predictions = lm.predict(X_validate_pca[:,:n_pca_components])

In [ ]:
predictions

In [ ]:
import csv
with open('predictions-04.csv', 'w') as fp:
    writer = csv.writer(fp)
    
    writer.writerow(['id','loss'])
    
    for (row, val) in zip(X_test_orig['id'].tolist(), predictions):
        writer.writerow([row, val])
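The same submission file can be written straight from pandas, skipping the manual csv loop; a sketch reusing the same file name:

# Equivalent submission file via DataFrame.to_csv.
submission = pd.DataFrame({'id': X_test_orig['id'], 'loss': predictions})
submission.to_csv('predictions-04.csv', index=False, columns=['id', 'loss'])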
