In [108]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import tree
import pydot
import StringIO

In [98]:
train = pd.read_csv('codetest_train.txt', sep='\t')

In [99]:
train.head()


Out[99]:
target f_0 f_1 f_2 f_3 f_4 f_5 f_6 f_7 f_8 ... f_244 f_245 f_246 f_247 f_248 f_249 f_250 f_251 f_252 f_253
0 3.066056 -0.653 0.255 -0.615 -1.833 -0.736 NaN 1.115 -0.171 -0.351 ... -1.607 -1.400 -0.920 -0.198 -0.945 -0.573 0.170 -0.418 -1.244 -0.503
1 -1.910473 1.179 -0.093 -0.556 0.811 -0.468 -0.005 -0.116 -1.243 1.985 ... 1.282 0.032 -0.061 NaN -0.061 -0.302 1.281 -0.850 0.821 -0.260
2 7.830711 0.181 -0.778 -0.919 0.113 0.887 -0.762 1.872 -1.709 0.135 ... -0.237 -0.660 1.073 -0.193 0.570 -0.267 1.435 1.332 -1.147 2.580
3 -2.180862 0.745 -0.245 -1.343 1.163 -0.169 -0.151 -1.100 0.225 1.223 ... 0.709 -0.203 -0.136 -0.571 1.682 0.243 -0.381 0.613 1.033 0.400
4 5.462784 1.217 -1.324 -0.958 0.448 -2.873 -0.856 0.603 0.763 0.020 ... 0.892 -0.433 -0.877 0.289 0.654 1.230 0.457 -0.754 -0.025 -0.931

5 rows × 255 columns


In [100]:
train.describe()


Out[100]:
target f_0 f_1 f_2 f_3 f_4 f_5 f_6 f_7 f_8 ... f_244 f_245 f_246 f_247 f_248 f_249 f_250 f_251 f_252 f_253
count 5000.000000 4903.000000 4928.000000 4908.000000 4910.000000 4907.000000 4912.000000 4897.000000 4904.000000 4893.000000 ... 4910.000000 4883.000000 4914.000000 4894.000000 4902.000000 4886.000000 4900.000000 4921.000000 4904.000000 4904.000000
mean 1.143878 -0.000433 0.002564 0.028877 -0.005437 -0.006759 0.005569 0.001536 -0.001019 0.009744 ... 0.013529 0.004941 0.023261 -0.018447 -0.009842 0.016958 -0.004947 0.016868 -0.001351 0.010334
std 5.259896 0.999739 0.997930 1.019337 0.990351 1.006293 0.995799 1.004624 0.997356 0.988310 ... 1.001453 0.997330 0.996458 1.005000 0.989228 1.011331 0.991577 1.001358 1.003411 1.006896
min -26.705570 -3.941000 -3.847000 -3.818000 -3.434000 -3.400000 -4.051000 -3.179000 -3.890000 -3.857000 ... -3.585000 -3.494000 -3.485000 -4.012000 -3.252000 -3.821000 -3.376000 -3.373000 -3.950000 -3.728000
25% -2.034383 -0.673000 -0.685000 -0.651000 -0.655000 -0.685500 -0.660000 -0.672000 -0.679000 -0.662000 ... -0.665750 -0.676000 -0.661750 -0.692000 -0.663000 -0.647750 -0.679500 -0.647000 -0.695000 -0.677000
50% 1.166835 -0.011000 -0.003000 0.047500 0.003500 -0.007000 -0.008000 -0.003000 -0.021500 0.017000 ... 0.026000 -0.028000 0.027000 -0.035000 -0.010500 0.002500 0.010000 0.020000 0.002500 0.015000
75% 4.439549 0.677000 0.674500 0.719000 0.668000 0.654000 0.649000 0.679000 0.670250 0.698000 ... 0.671750 0.670000 0.702500 0.655000 0.651000 0.709750 0.660250 0.692000 0.672250 0.705250
max 26.347818 3.831000 3.996000 3.199000 4.962000 3.106000 4.296000 4.166000 3.798000 4.195000 ... 3.365000 3.456000 3.881000 3.690000 3.629000 4.144000 3.873000 3.187000 3.724000 3.956000

8 rows × 251 columns


In [101]:
labels = train.target
train.drop('target', axis=1, inplace=True)
cat = ['f_61','f_121','f_215','f_237']
train_cat = pd.get_dummies(train[cat])

In [102]:
train.drop(train[cat], axis=1, inplace=True)
train = np.hstack((train, train_cat))

In [103]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(train)
train = imp.transform(train)

In [104]:
x_train, x_test, y_train, y_test = train_test_split(train, labels.values, test_size = 0.2)

In [120]:
clf = RandomForestRegressor(n_estimators=2000)

In [121]:
clf.fit(x_train, y_train)


Out[121]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=2000, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [117]:
y_pred = clf.predict(x_test)

In [118]:
clf.score(x_test, y_test)


Out[118]:
0.55683411565692364

In [119]:
mean_squared_error(y_test, y_pred)


Out[119]:
12.477766923823133

In [ ]: