In [108]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import tree
import pydot
import StringIO
In [98]:
# Load the tab-separated training file into a DataFrame.
train_path = 'codetest_train.txt'
train = pd.read_csv(train_path, sep='\t')
In [99]:
# Peek at the first five rows to sanity-check the parse.
train.head(n=5)
Out[99]:
In [100]:
# Show pandas' summary statistics for the loaded frame.
train.describe()
Out[100]:
In [101]:
# Columns that are one-hot encoded rather than used as-is.
cat = ['f_61','f_121','f_215','f_237']

# Pull the regression target out of the feature frame.
labels = train['target']
train = train.drop('target', axis=1)

# One-hot encode the categorical columns into a separate frame; they are
# still present in `train` at this point.
train_cat = pd.get_dummies(train[cat])
In [102]:
# Remove the raw categorical columns (their one-hot encoding lives in
# train_cat) and append the dummy columns on the right.
# drop() is given the list of column names directly; the previous form passed
# a DataFrame slice, which only worked because iterating a frame yields its
# column labels.
train = train.drop(cat, axis=1)

# From here on `train` is a plain NumPy array, so column names are lost.
train = np.hstack((train.values, train_cat.values))
In [103]:
# Fill missing values with the per-column mean (axis=0).
# NOTE(review): the imputer is fitted on the full matrix before the
# train/test split below, so the held-out rows contribute to the column
# means. Consider fitting on the training split only.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
train = imp.fit_transform(train)
In [104]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the split, and every metric computed from it, is the same on each run.
x_train, x_test, y_train, y_test = train_test_split(
    train, labels.values, test_size=0.2, random_state=42)
In [120]:
# Random forest with 2000 trees.
# random_state fixes the bootstrap samples and feature draws so the fitted
# model is reproducible; n_jobs=-1 builds the trees on all available cores
# instead of serially.
clf = RandomForestRegressor(n_estimators=2000, random_state=42, n_jobs=-1)
In [121]:
# Train the forest on the training split.
clf.fit(X=x_train, y=y_train)
Out[121]:
In [117]:
# Predict targets for the held-out rows.
# NOTE(review): the recorded execution counts show this cell ran before the
# model cells above were last executed; re-run top to bottom before trusting
# the stored outputs.
y_pred = clf.predict(X=x_test)
In [118]:
# Regressor score (coefficient of determination, R^2) on the held-out split.
clf.score(X=x_test, y=y_test)
Out[118]:
In [119]:
# Mean squared error of the held-out predictions.
mean_squared_error(y_true=y_test, y_pred=y_pred)
Out[119]:
In [ ]: