In [1]:
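# Imports: scikit-learn utilities, hyperopt, patsy (formula-based feature engineering),
# and TensorFlow's contrib.learn (skflow) for the DNN regressor.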
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import explained_variance_score,r2_score,mean_squared_error
from sklearn import preprocessing
from random import randint
import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand
import sys
import os
import tensorflow as tf
from tensorflow.contrib import learn
import tensorflow.contrib.learn as skflow
from sklearn.model_selection import KFold
from utils import encode_numeric_zscore_list
from patsy import dmatrices, dmatrix
from itertools import permutations, combinations
import sklearn.feature_selection as fs
from sklearn.ensemble import RandomForestRegressor
tf.logging.set_verbosity(tf.logging.INFO)
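# Load the training data, assign short column names, and shuffle the rows.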
train=pd.read_csv('./data/t81_558_train.csv')
train.columns = ['Id', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'outcome']
train = train.reindex(np.random.permutation(train.index))
train.reset_index(inplace=True, drop=True)
In [2]:
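# Fit a random forest on the raw features to estimate feature importances.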
y = train["outcome"]
x = train.drop(["Id", "outcome"], axis=1)
forest = RandomForestRegressor(n_estimators = 200, random_state=0, verbose=True)
forest.fit(x,y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],axis=0)
indices = np.argsort(importances)[::-1]
In [5]:
# Print the feature ranking
print("Feature ranking:")
names = x.columns
print (sorted(zip(map(lambda x: round(x, 4), importances), names), reverse=True))
In [6]:
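# Load the test set with a placeholder outcome of 0.0 and stack it under the training data
# so the same feature transforms are applied to both; keep Id/outcome aside for re-attaching later.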
test=pd.read_csv('./data/t81_558_test.csv')
test['outcome'] = 0.0
joined = pd.concat([train, test])
idsplusoutcome = joined[["Id", "outcome"]]
In [7]:
joined.head()
Out[7]:
In [8]:
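# The four most important features from the ranking above; build every pairwise combination of them.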
most_important_features = "facb"
comb2 = list(combinations(most_important_features,2))
#comb3 = list(combinations('abcdefg',3))
#comb4 = list(combinations('abcdefg',4))
#comb5 = list(combinations('abcdefg',5))
In [9]:
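# Build patsy I() terms for every pairwise ratio, difference, product, and sum of the top features.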
divide_2 = " + ".join(map(lambda x: "I(" + x[0] + "/" + x[1] + ")", comb2))
subtract_2 = " + ".join(map(lambda x: "I(" + x[0] + "-" + x[1] + ")", comb2))
mul_2 = " + ".join(map(lambda x: "I(" + x[0] + "*" + x[1] + ")", comb2))
add_2 = " + ".join(map(lambda x: "I(" + x[0] + "+" + x[1] + ")", comb2))
combo_string_2 = divide_2 + " + " + subtract_2 + " + " + mul_2 + " + " + add_2
print (combo_string_2)
In [89]:
#a3 = " + ".join(map(lambda x: "I(" + x[0] + "+" + x[1] + "+" + x[2] + ")", comb3))
#print (a3)
In [90]:
#a4 = " + ".join(map(lambda x: "I(" + x[0] + "+" + x[1] + "+" + x[2] + "+" + x[3] + ")", comb4))
#print (a4)
In [91]:
#a5 = " + ".join(map(lambda x: "I(" + x[0] + "+" + x[1] + "+" + x[2] + "+" + x[3] + "+" + x[4] + ")", comb5))
#print (a5)
In [10]:
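# Final patsy formula: the seven raw features plus the 24 pairwise interaction terms,
# with no intercept ('- 1').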
formula = "outcome ~ a + b + c + d + e + f + g + " + combo_string_2 + " - 1"
print (formula)
In [11]:
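# Expand the formula into a numeric design matrix over the combined train/test frame.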
#outcome, predictors = dmatrices("outcome ~ a + b + c + d + e + f + g + I(a+b) + I(a+b+c) + I(b+c) + I(b+c+d) + I(c+d) + I(c+d+e) + I(d+e) + I(d+e+f) + I(e+f) + I(e+f+g) a*b*c*d*e*f*g - 1", joined)
outcome, predictors = dmatrices(formula, joined)
predictors = pd.DataFrame(predictors)
predictors.head()
Out[11]:
In [12]:
idsplusoutcome.shape
idsplusoutcome.reset_index(inplace=True, drop=True)
In [13]:
predictors.shape
predictors.reset_index(inplace=True, drop=True)
In [14]:
predictors.shape
Out[14]:
In [15]:
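# Re-attach Id and outcome to the transformed predictor columns.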
joined = pd.concat([idsplusoutcome, pd.DataFrame(predictors)], axis=1)
In [16]:
print(list(joined.columns))
In [17]:
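# Z-score the 31 transformed predictor columns (labeled 0-30), then split the combined frame back
# into train and test using the 0.0 outcome sentinel (a sketch of the z-score helper follows this cell).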
encode_numeric_zscore_list(joined, list(range(31)))
train = joined[joined['outcome'] != 0.0]
test = joined[joined['outcome'] == 0.0]
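In [ ]:
# encode_numeric_zscore_list is imported from the local utils module and is not shown in this
# notebook; the sketch below is only an assumption of its behavior (z-score each listed column
# in place), not the actual implementation.
def encode_numeric_zscore_list_sketch(df, names):
    for name in names:
        df[name] = (df[name] - df[name].mean()) / df[name].std()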
In [18]:
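# Separate the target from the features and keep the test Ids for the submission file.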
y = train['outcome']
x = train.drop(['outcome', 'Id'], axis=1)
test_ids = test["Id"]
test = test.drop(["outcome", "Id"], axis=1)
In [ ]:
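# 5-fold cross-validation: train a DNN regressor on each fold, report validation RMSE,
# and collect that fold's predictions on the test set.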
n_folds = 5
kf = KFold(n_splits=n_folds)
pred_df = pd.DataFrame()
for i, (train_index, test_index) in enumerate(kf.split(x)):
    print('\n Fold %d' % (i + 1))
    x_train, x_valid = x.iloc[train_index], x.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    feature_columns = learn.infer_real_valued_columns_from_input(x_train)
    model = skflow.DNNRegressor(hidden_units=[100, 150, 75, 5], feature_columns=feature_columns)
    model.fit(x_train, y_train, steps=10000)
    valid_predictions = model.predict(x_valid)
    valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
    print("Fold: {}; RMSE: {}".format(i + 1, valid_rmse))
    testpreds = model.predict(test).flatten()
    pred_df["Pred{}".format(i + 1)] = pd.Series(testpreds)
submission = pd.DataFrame()
submission['Id'] = test_ids
submission['outcome'] = pred_df.mean(axis=1)
submission = submission.drop(submission[submission.Id > 10000].index)
print("Writing submission file")
submission.to_csv('./data/sub_t81_transformed_cv10.csv', index=False)
In [ ]: