In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier)
# train_test_split and GridSearchCV live in sklearn.model_selection;
# the old sklearn.cross_validation and sklearn.grid_search modules were removed
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from gensim.models import word2vec
import nltk
from scipy import stats
from itertools import combinations
import pickle
import warnings
warnings.filterwarnings("ignore")
In [2]:
train = pd.read_csv('data_files/train.csv')
In [3]:
train.head()
Out[3]:
In [4]:
train.shape
Out[4]:
In [5]:
train.dtypes
Out[5]:
In [6]:
categorical_vars = ['cat{}'.format(i+1) for i in range(116)]
In [7]:
for var in categorical_vars:
    train = pd.get_dummies(train, columns=[var])
In [8]:
train.head()
Out[8]:
In [9]:
def multi_model_prediction(test_df, models):
    # collect each fitted model's predictions, then average them per row
    preds = list()
    for model in models:
        preds.append(model.predict(test_df))
    return [np.mean(p) for p in np.array(preds).T]
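For reference, the same ensemble mean can be written as a single reduction over the model axis; a minimal equivalent sketch (the `_v2` name is illustrative, not from the original notebook):
In [ ]:
# equivalent vectorized form: stack to (n_models, n_samples), average axis 0
def multi_model_prediction_v2(test_df, models):
    preds = np.array([model.predict(test_df) for model in models])
    return preds.mean(axis=0)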
In [10]:
# rf = RandomForestRegressor(n_estimators=30, max_depth=10, max_features='sqrt')
# lr = LinearRegression()
# X, y = train_test_split(train)  # NB: X and y are train/holdout row splits, not features/labels
In [11]:
# rf.fit(X.drop(['loss'], axis=1), X.loss)
# lr.fit(X.drop(['loss'], axis=1), X.loss)
In [12]:
#preds = multi_model_prediction(y.drop(['loss'], axis=1), [rf, lr])
In [ ]:
#np.mean([abs(prediction - loss) for prediction, loss in zip(preds, y.loss)])
# n_sample = 1000
# errors = list()
# for _ in range(3):
#     sample_data = train.sample(n_sample)
#     X, y = train_test_split(sample_data)
#     rf = RandomForestRegressor(n_estimators=50, max_depth=10, max_features='sqrt')
#     rf.fit(X.drop(['loss'], axis=1), X.loss)
#     lr = LinearRegression()
#     lr.fit(X.drop(['loss'], axis=1), X.loss)
#     gbt = GradientBoostingRegressor(n_estimators=50, max_depth=10, max_features='sqrt')
#     gbt.fit(X.drop(['loss'], axis=1), X.loss)
#     knn = KNeighborsRegressor(n_neighbors=7)
#     knn.fit(X.drop(['loss'], axis=1), X.loss)
#     svr = SVR(kernel='poly', degree=4)
#     svr.fit(X.drop(['loss'], axis=1), X.loss)
#     model_list = [rf, lr, gbt, knn, svr]
#     preds = multi_model_prediction(y.drop(['loss'], axis=1), model_list)
#     errors.append(np.mean([abs(p - loss) for p, loss in zip(preds, y.loss)]))
# np.mean(errors)
In [10]:
test = pd.read_csv('data_files/test.csv')
In [11]:
for var in categorical_vars:
    test = pd.get_dummies(test, columns=[var])
In [12]:
test.head()
Out[12]:
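One caveat worth flagging: running `pd.get_dummies` on train and test separately can yield different column sets whenever a category level appears in only one file. A minimal alignment sketch, assuming the models expect exactly the training feature columns (`feature_cols` and `test_features` are hypothetical names, not in the original notebook):
In [ ]:
# assumption: realign test to the training feature columns, in the same
# order, filling any dummy level absent from test with zeros
feature_cols = train.drop(['loss'], axis=1).columns
test_features = test.reindex(columns=feature_cols, fill_value=0)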
In [ ]:
# fit each regressor on the full one-hot-encoded training set
rf = RandomForestRegressor(n_estimators=10, max_depth=10, max_features='sqrt')
rf.fit(train.drop(['loss'], axis=1), train.loss)
lr = LinearRegression()
lr.fit(train.drop(['loss'], axis=1), train.loss)
gbt = GradientBoostingRegressor(n_estimators=10, max_depth=10, max_features='sqrt')
gbt.fit(train.drop(['loss'], axis=1), train.loss)
knn = KNeighborsRegressor(n_neighbors=7)
knn.fit(train.drop(['loss'], axis=1), train.loss)
svr = SVR(kernel='poly', degree=4)
svr.fit(train.drop(['loss'], axis=1), train.loss)
model_list = [rf, lr, gbt, knn, svr]
# average the five models' predictions for each test row
test['loss'] = multi_model_prediction(test, model_list)
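If the alignment sketch above were applied, the prediction call would take the realigned frame instead:
In [ ]:
# hypothetical variant using test_features from the alignment sketch
# test['loss'] = multi_model_prediction(test_features, model_list)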
In [ ]:
test[['id', 'loss']].head()
In [ ]:
import csv
# 'w' (not 'a') so reruns don't append duplicate rows; newline='' avoids
# blank lines between rows on Windows
with open('tate_submission1.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['id', 'loss'])
    writer.writerows(test[['id', 'loss']].values.tolist())
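The same file can be written in one line with pandas, as an alternative to the `csv` module:
In [ ]:
# equivalent one-liner (alternative to the csv.writer cell above)
# test[['id', 'loss']].to_csv('tate_submission1.csv', index=False)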
In [13]:
# disabled: depends on the commented-out train/holdout split (X, y) above
# predictions = rf.predict(y.drop(['loss'], axis=1))
In [14]:
# holdout mean absolute error
# np.mean([abs(prediction - loss) for prediction, loss in zip(predictions, y.loss)])
Out[14]:
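The hand-rolled mean above is exactly scikit-learn's MAE; an equivalent one-liner, assuming `predictions` and the holdout `y` exist:
In [ ]:
from sklearn.metrics import mean_absolute_error
# same value as the hand-rolled mean above:
# mean_absolute_error(y.loss, predictions)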
In [16]:
# def mae(estimator, X, y):
#     return np.mean([abs(prediction - value)
#                     for prediction, value in zip(estimator.predict(X), y)])
# NB: GridSearchCV maximizes its scorer, so a raw MAE scorer would select the
# worst parameters; the error must be negated (see the sketch below)
In [17]:
# param_grid = {'n_estimators': np.arange(50, 251, 50),
# 'max_depth': np.arange(5, 21, 5),
# 'max_features': ['auto', 'sqrt']}
# random_forest = RandomForestRegressor()
# cv = GridSearchCV(random_forest, param_grid, scoring=mae)
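Since `GridSearchCV` maximizes its scoring function, a plain MAE scorer would pick the worst parameters. A runnable sketch using the built-in negated MAE (grid values as above; `1.0`, meaning all features, stands in for the deprecated `'auto'`):
In [ ]:
# 'neg_mean_absolute_error' is maximized by GridSearchCV, which minimizes MAE
param_grid = {'n_estimators': np.arange(50, 251, 50),
              'max_depth': np.arange(5, 21, 5),
              'max_features': ['sqrt', 1.0]}
cv = GridSearchCV(RandomForestRegressor(), param_grid,
                  scoring='neg_mean_absolute_error')
# cv.fit(train.drop(['loss'], axis=1), train.loss)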
In [20]:
#cv.fit(train.drop(['loss'], axis=1), train.loss)
Out[20]:
In [21]:
#cv
Out[21]:
In [22]:
# disabled: depends on the commented-out grid search (cv) and holdout split (y)
# predictions = cv.predict(y.drop(['loss'], axis=1))
In [23]:
# np.mean([abs(prediction - loss) for prediction, loss in zip(predictions, y.loss)])
Out[23]: