In [1]:
# NOTE(review): sklearn.cross_validation was deprecated in 0.18 and removed in
# 0.20 (see the DeprecationWarning emitted below); the replacement is
# sklearn.model_selection. Kept as-is here because the KFold call later in
# this notebook uses the old cross_validation-style constructor signature.
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error
from sklearn import preprocessing
from random import randint
import numpy as np
import pandas as pd
# `hp` arrives with the other hyperopt names below; the original separate
# `from hyperopt import hp` line was redundant and has been dropped.
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand
import sys
import os
import tensorflow as tf
from tensorflow.contrib import learn
import tensorflow.contrib.learn as skflow
from sklearn.cross_validation import KFold
from utils import encode_numeric_zscore_list
from patsy import dmatrices, dmatrix
from itertools import permutations, combinations
import sklearn.feature_selection as fs
from sklearn.ensemble import RandomForestRegressor

tf.logging.set_verbosity(tf.logging.INFO)

# Load the training data, name the columns, and shuffle the rows so any
# ordering in the source file does not leak into the K-fold split later.
# NOTE(review): no random seed is set before the permutation, so the shuffle
# (and therefore downstream results) is not reproducible run-to-run.
train = pd.read_csv('./data/t81_558_train.csv')
train.columns = ['Id', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'outcome']
train = train.reindex(np.random.permutation(train.index))
train.reset_index(inplace=True, drop=True)


/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
# Fit a random forest on all seven raw predictors to rank feature importance.
y = train["outcome"]
x = train.drop(["Id", "outcome"], axis=1)

forest = RandomForestRegressor(n_estimators=200, random_state=0, verbose=True)
forest.fit(x, y)

# Mean importance per feature, its spread across the individual trees, and
# the feature indices ordered from most to least important.
importances = forest.feature_importances_
std = np.std([est.feature_importances_ for est in forest.estimators_], axis=0)
indices = importances.argsort()[::-1]


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  1.3min finished

In [5]:
# Print the feature ranking
# Report each feature's importance (rounded to 4 dp), highest first.
# (Generator expression replaces the original map/lambda, which shadowed `x`.)
print("Feature ranking:")
names = x.columns
print(sorted(zip((round(v, 4) for v in importances), names), reverse=True))


Feature ranking:
[(0.17299999999999999, 'f'), (0.16, 'a'), (0.15629999999999999, 'c'), (0.15529999999999999, 'b'), (0.1215, 'd'), (0.1191, 'g'), (0.1148, 'e')]

In [6]:
# Load the held-out test rows and tag them with a sentinel outcome of 0.0 so
# they can be stacked with the training rows and separated again later.
# NOTE(review): using outcome == 0.0 as the train/test marker will silently
# misroute any real training row whose outcome is exactly 0.0 — verify none
# exist, or use an explicit indicator column instead.
test=pd.read_csv('./data/t81_558_test.csv')
test['outcome'] = 0.0

# Stack train + test so the patsy transforms below see both frames at once;
# keep Id/outcome aside to re-attach after the design matrix is built.
joined = pd.concat([train, test])
idsplusoutcome = joined[["Id", "outcome"]]

In [7]:
# Quick sanity check of the combined train+test frame.
joined.head()


Out[7]:
Id a b c d e f g outcome
0 25216 0.892848 -0.856285 0.881588 -0.662125 0.774099 -0.509136 -0.352580 -30.829022
1 12469 0.535534 -0.522371 -0.739895 0.948267 -0.225478 -0.388076 -0.550246 -0.105294
2 17310 -0.941762 -0.781017 0.010367 0.740697 0.072541 0.748063 0.787869 1.605958
3 28482 0.391951 -0.226443 -0.520205 0.378066 -0.328574 0.967261 0.819377 -1.308662
4 17708 -0.707722 0.925603 -0.858180 0.625837 0.490596 0.859047 0.917338 0.442356

In [8]:
# Restrict pairwise feature interactions to the four features the random
# forest ranked highest: f, a, c, b.
most_important_features = "facb"
comb2 = list(combinations(most_important_features, 2))
# Higher-order combinations over all seven features were explored earlier:
#comb3 = list(combinations('abcdefg',3))
#comb4 = list(combinations('abcdefg',4))
#comb5 = list(combinations('abcdefg',5))

In [9]:
def _interaction_terms(pairs, op):
    """Render a patsy term string 'I(a<op>b) + I(c<op>d) + ...' for each pair.

    Replaces four copy-pasted map/lambda lines (whose lambdas also shadowed
    the DataFrame variable `x`) with a single parameterized helper.
    """
    return " + ".join("I(" + a + op + b + ")" for a, b in pairs)

# One term group per arithmetic operator over every pair of top features.
divide_2 = _interaction_terms(comb2, "/")
subtract_2 = _interaction_terms(comb2, "-")
mul_2 = _interaction_terms(comb2, "*")
add_2 = _interaction_terms(comb2, "+")
combo_string_2 = divide_2 + " + " + subtract_2 + " + " + mul_2 + " + " + add_2
print (combo_string_2)


I(f/a) + I(f/c) + I(f/b) + I(a/c) + I(a/b) + I(c/b) + I(f-a) + I(f-c) + I(f-b) + I(a-c) + I(a-b) + I(c-b) + I(f*a) + I(f*c) + I(f*b) + I(a*c) + I(a*b) + I(c*b) + I(f+a) + I(f+c) + I(f+b) + I(a+c) + I(a+b) + I(c+b)

In [89]:
#a3 = " + ".join(map(lambda x: "I(" + x[0] + "+" + x[1] + "+" + x[2] + ")", comb3))
#print (a3)


I(a+b+c) + I(a+b+d) + I(a+b+e) + I(a+b+f) + I(a+b+g) + I(a+c+d) + I(a+c+e) + I(a+c+f) + I(a+c+g) + I(a+d+e) + I(a+d+f) + I(a+d+g) + I(a+e+f) + I(a+e+g) + I(a+f+g) + I(b+c+d) + I(b+c+e) + I(b+c+f) + I(b+c+g) + I(b+d+e) + I(b+d+f) + I(b+d+g) + I(b+e+f) + I(b+e+g) + I(b+f+g) + I(c+d+e) + I(c+d+f) + I(c+d+g) + I(c+e+f) + I(c+e+g) + I(c+f+g) + I(d+e+f) + I(d+e+g) + I(d+f+g) + I(e+f+g)

In [90]:
#a4 = " + ".join(map(lambda x: "I(" + x[0] + "+" + x[1] + "+" + x[2] + "+" + x[3] + ")", comb4))
#print (a4)


I(a+b+c+d) + I(a+b+c+e) + I(a+b+c+f) + I(a+b+c+g) + I(a+b+d+e) + I(a+b+d+f) + I(a+b+d+g) + I(a+b+e+f) + I(a+b+e+g) + I(a+b+f+g) + I(a+c+d+e) + I(a+c+d+f) + I(a+c+d+g) + I(a+c+e+f) + I(a+c+e+g) + I(a+c+f+g) + I(a+d+e+f) + I(a+d+e+g) + I(a+d+f+g) + I(a+e+f+g) + I(b+c+d+e) + I(b+c+d+f) + I(b+c+d+g) + I(b+c+e+f) + I(b+c+e+g) + I(b+c+f+g) + I(b+d+e+f) + I(b+d+e+g) + I(b+d+f+g) + I(b+e+f+g) + I(c+d+e+f) + I(c+d+e+g) + I(c+d+f+g) + I(c+e+f+g) + I(d+e+f+g)

In [91]:
#a5 = " + ".join(map(lambda x: "I(" + x[0] + "+" + x[1] + "+" + x[2] + "+" + x[3] + "+" + x[4] + ")", comb5)) 
#print (a5)


I(a+b+c+d+e) + I(a+b+c+d+f) + I(a+b+c+d+g) + I(a+b+c+e+f) + I(a+b+c+e+g) + I(a+b+c+f+g) + I(a+b+d+e+f) + I(a+b+d+e+g) + I(a+b+d+f+g) + I(a+b+e+f+g) + I(a+c+d+e+f) + I(a+c+d+e+g) + I(a+c+d+f+g) + I(a+c+e+f+g) + I(a+d+e+f+g) + I(b+c+d+e+f) + I(b+c+d+e+g) + I(b+c+d+f+g) + I(b+c+e+f+g) + I(b+d+e+f+g) + I(c+d+e+f+g)

In [10]:
# Final patsy formula: the seven raw features plus every pairwise interaction
# term; the trailing "- 1" suppresses the intercept column.
formula = "outcome ~ a + b + c + d + e + f + g + {} - 1".format(combo_string_2)
print(formula)


outcome ~ a + b + c + d + e + f + g + I(f/a) + I(f/c) + I(f/b) + I(a/c) + I(a/b) + I(c/b) + I(f-a) + I(f-c) + I(f-b) + I(a-c) + I(a-b) + I(c-b) + I(f*a) + I(f*c) + I(f*b) + I(a*c) + I(a*b) + I(c*b) + I(f+a) + I(f+c) + I(f+b) + I(a+c) + I(a+b) + I(c+b) - 1

In [11]:
#outcome, predictors = dmatrices("outcome ~ a + b + c + d + e + f + g + I(a+b) + I(a+b+c) + I(b+c) + I(b+c+d) + I(c+d) + I(c+d+e) + I(d+e) + I(d+e+f) + I(e+f) + I(e+f+g) a*b*c*d*e*f*g - 1", joined)
# Build the design matrix from the formula. `outcome` is rebound here to
# patsy's response matrix — the real targets were already saved in
# `idsplusoutcome`, so nothing downstream is lost.
outcome, predictors = dmatrices(formula, joined)
predictors = pd.DataFrame(predictors)
predictors.head()


Out[11]:
0 1 2 3 4 5 6 7 8 9 ... 21 22 23 24 25 26 27 28 29 30
0 0.892848 -0.856285 0.881588 -0.662125 0.774099 -0.509136 -0.352580 -0.570238 -0.577521 0.594587 ... 0.435965 0.787124 -0.764533 -0.754890 0.383713 0.372452 -1.365421 1.774436 0.036563 0.025303
1 0.535534 -0.522371 -0.739895 0.948267 -0.225478 -0.388076 -0.550246 -0.724652 0.524501 0.742912 ... 0.202719 -0.396239 -0.279748 0.386500 0.147458 -1.127970 -0.910447 -0.204361 0.013163 -1.262266
2 -0.941762 -0.781017 0.010367 0.740697 0.072541 0.748063 0.787869 -0.794322 72.157765 -0.957806 ... -0.584249 -0.009763 0.735532 -0.008097 -0.193699 0.758430 -0.032954 -0.931395 -1.722779 -0.770650
3 0.391951 -0.226443 -0.520205 0.378066 -0.328574 0.967261 0.819377 2.467813 -1.859382 -4.271545 ... -0.219029 -0.203895 -0.088754 0.117797 1.359211 0.447055 0.740818 -0.128255 0.165508 -0.746648
4 -0.707722 0.925603 -0.858180 0.625837 0.490596 0.859047 0.917338 -1.213821 -1.001011 0.928095 ... 0.795137 0.607352 -0.655070 -0.794334 0.151326 0.000868 1.784651 -1.565901 0.217882 0.067424

5 rows × 31 columns


In [12]:
# Re-number rows 0..n-1 so this frame aligns positionally with `predictors`
# in the axis=1 concat below. (The original leading bare `.shape` expression
# was dead code — its value was discarded, never displayed — and is removed.)
idsplusoutcome.reset_index(inplace=True, drop=True)

In [13]:
# Drop the stale index so positions line up with `idsplusoutcome` for the
# axis=1 concat. (The original leading bare `.shape` expression was dead
# code — its value was discarded, never displayed — and is removed.)
predictors.reset_index(inplace=True, drop=True)

In [14]:
# Expect (n_train + n_test, 31): 7 raw features + 24 interaction terms.
predictors.shape


Out[14]:
(79670, 31)

In [15]:
# Re-attach Id/outcome to the transformed features. `predictors` is already a
# DataFrame (converted right after dmatrices), so the original redundant
# pd.DataFrame(...) wrapper around it is dropped.
joined = pd.concat([idsplusoutcome, predictors], axis=1)

In [16]:
# Confirm column layout: Id, outcome, then the 31 numbered feature columns.
print(joined.columns.tolist())


['Id', 'outcome', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]

In [17]:
# Z-score every generated feature column. Deriving the column list from the
# frame (instead of the original hard-coded 0..30 integer literal) keeps this
# cell correct if the number of interaction terms changes upstream; column
# order is preserved, so the result is identical for the current layout.
feature_columns_to_scale = [c for c in joined.columns if c not in ("Id", "outcome")]
encode_numeric_zscore_list(joined, feature_columns_to_scale)

# Split back into train/test using the 0.0 sentinel set earlier.
# NOTE(review): any genuine training row whose outcome is exactly 0.0 would
# be misrouted into `test` here — confirm the data contains none.
train = joined[joined['outcome'] != 0.0]
test = joined[joined['outcome'] == 0.0]

In [18]:
# Training targets and features (Id carries no signal; outcome is the label).
y = train['outcome']
x = train.drop(['outcome', 'Id'], axis=1)

# Keep the test Ids for the submission file, then strip non-feature columns.
test_ids = test["Id"]
test = test.drop(["outcome", "Id"], axis=1)

In [ ]:
# K-fold cross-validation: train one DNN regressor per fold, report fold
# RMSE, and average the per-fold test-set predictions for the submission.
n_folds  = 5
kf = KFold(x.shape[0], n_folds=n_folds)  # deprecated cross_validation-style KFold API
pred_df = pd.DataFrame()

for i, (train_index, test_index) in enumerate(kf):
    print('\n Fold %d' % (i + 1))
    x_train, x_valid = x.iloc[train_index], x.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]    

    # Infer one real-valued feature column per input column; a fresh model
    # (and a fresh temp model dir — see the tf warnings) is built every fold.
    feature_columns = learn.infer_real_valued_columns_from_input(x_train) 
    model = skflow.DNNRegressor(hidden_units=[100, 150, 75, 5],feature_columns=feature_columns)

    model.fit(x_train, y_train,steps=10000)
    valid_predictions = model.predict(x_valid)
    valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
    print("Fold: {}; RMSE: {}".format(i+1,valid_rmse))
    
    # This fold's predictions on the held-out competition test set.
    testpreds = model.predict(test).flatten()
    pred_df["Pred{}".format(i+1)] = pd.Series(testpreds)

# Ensemble by averaging the per-fold test predictions column-wise.
submission = pd.DataFrame()
submission['Id'] = test_ids    
submission['outcome'] = pred_df.mean(axis=1)
# NOTE(review): the Id > 10000 filter looks competition-specific — confirm
# the intended Id range. Also the output filename says "cv10" while
# n_folds is 5 — one of the two is presumably stale.
submission = submission.drop(submission[submission.Id > 10000].index)
print("Writing submission file")
submission.to_csv('./data/sub_t81_transformed_cv10.csv', index=False)


WARNING:tensorflow:float64 is not supported by many models, consider casting to float32.
WARNING:tensorflow:Change warning: default value of `enable_centered_bias` will change after 2016-10-09. It will be disabled by default.Instructions for keeping existing behaviour:
Explicitly set `enable_centered_bias` to 'True' if you want to keep existing behaviour.
WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmpb3c5aeg8
WARNING:tensorflow:Using default config.
INFO:tensorflow:Using config: {'keep_checkpoint_every_n_hours': 10000, 'evaluation_master': '', 'task': 0, '_is_chief': True, 'num_ps_replicas': 0, 'tf_random_seed': None, 'save_summary_steps': 100, '_job_name': None, 'master': '', 'keep_checkpoint_max': 5, 'cluster_spec': None, 'tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, 'save_checkpoints_secs': 600}
WARNING:tensorflow:float64 is not supported by many models, consider casting to float32.
WARNING:tensorflow:float64 is not supported by many models, consider casting to float32.
INFO:tensorflow:Setting feature info to TensorSignature(dtype=tf.float64, shape=TensorShape([Dimension(None), Dimension(31)]), is_sparse=False)
INFO:tensorflow:Setting targets info to TensorSignature(dtype=tf.float64, shape=TensorShape([Dimension(None)]), is_sparse=False)
INFO:tensorflow:Transforming feature_column _RealValuedColumn(column_name='', dimension=31, default_value=None, dtype=tf.float64, normalizer=None)
 Fold 1
INFO:tensorflow:Create CheckpointSaverHook
INFO:tensorflow:loss = 412.603, step = 1
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpb3c5aeg8/model.ckpt.
INFO:tensorflow:loss = 375.289, step = 101
INFO:tensorflow:loss = 263.561, step = 201

In [ ]: