In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier)
# model_selection supersedes the deprecated cross_validation and grid_search modules
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from gensim.models import word2vec
import nltk
from scipy import stats
from itertools import combinations
import pickle 
import warnings
warnings.filterwarnings("ignore")



In [2]:
train = pd.read_csv('data_files/train.csv')

In [3]:
train.head()


Out[3]:
id cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 ... cont6 cont7 cont8 cont9 cont10 cont11 cont12 cont13 cont14 loss
0 1 A B A B A A A A B ... 0.718367 0.335060 0.30260 0.67135 0.83510 0.569745 0.594646 0.822493 0.714843 2213.18
1 2 A B A A A A A A B ... 0.438917 0.436585 0.60087 0.35127 0.43919 0.338312 0.366307 0.611431 0.304496 1283.60
2 5 A B A A B A A A B ... 0.289648 0.315545 0.27320 0.26076 0.32446 0.381398 0.373424 0.195709 0.774425 3005.09
3 10 B B A B A A A A B ... 0.440945 0.391128 0.31796 0.32128 0.44467 0.327915 0.321570 0.605077 0.602642 939.85
4 11 A B A B A A A A B ... 0.178193 0.247408 0.24564 0.22089 0.21230 0.204687 0.202213 0.246011 0.432606 2763.85

5 rows × 132 columns


In [4]:
train.shape


Out[4]:
(188318, 132)

In [5]:
train.dtypes


Out[5]:
id          int64
cat1       object
cat2       object
cat3       object
cat4       object
cat5       object
cat6       object
cat7       object
cat8       object
cat9       object
cat10      object
cat11      object
cat12      object
cat13      object
cat14      object
cat15      object
cat16      object
cat17      object
cat18      object
cat19      object
cat20      object
cat21      object
cat22      object
cat23      object
cat24      object
cat25      object
cat26      object
cat27      object
cat28      object
cat29      object
           ...   
cat102     object
cat103     object
cat104     object
cat105     object
cat106     object
cat107     object
cat108     object
cat109     object
cat110     object
cat111     object
cat112     object
cat113     object
cat114     object
cat115     object
cat116     object
cont1     float64
cont2     float64
cont3     float64
cont4     float64
cont5     float64
cont6     float64
cont7     float64
cont8     float64
cont9     float64
cont10    float64
cont11    float64
cont12    float64
cont13    float64
cont14    float64
loss      float64
dtype: object

cat1 - cat116 are categorical; cont1 - cont14 are continuous, and loss is the target.


In [6]:
categorical_vars = ['cat{}'.format(i+1) for i in range(116)]
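
Before encoding, a quick look at how many levels each categorical feature takes gives a sense of how wide the one-hot encoding will be (a minimal sketch, run on the raw frame):

level_counts = train[categorical_vars].nunique().sort_values(ascending=False)
level_counts.head(10)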

In [7]:
for var in categorical_vars:
    train = pd.get_dummies(train, columns=[var])
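
The loop above one-hot encodes each column separately; pandas can also encode the whole list in a single call, which should produce the same result (an equivalent one-liner, shown commented out so it is not run in addition to the loop):

# train = pd.get_dummies(train, columns=categorical_vars)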

In [8]:
train.head()


Out[8]:
id cont1 cont2 cont3 cont4 cont5 cont6 cont7 cont8 cont9 ... cat116_P cat116_Q cat116_R cat116_S cat116_T cat116_U cat116_V cat116_W cat116_X cat116_Y
0 1 0.726300 0.245921 0.187583 0.789639 0.310061 0.718367 0.335060 0.30260 0.67135 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 2 0.330514 0.737068 0.592681 0.614134 0.885834 0.438917 0.436585 0.60087 0.35127 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 5 0.261841 0.358319 0.484196 0.236924 0.397069 0.289648 0.315545 0.27320 0.26076 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 10 0.321594 0.555782 0.527991 0.373816 0.422268 0.440945 0.391128 0.31796 0.32128 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 11 0.273204 0.159990 0.527991 0.473202 0.704268 0.178193 0.247408 0.24564 0.22089 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 1155 columns


In [9]:
def multi_model_prediction(test_df, models):
    """Average the predictions of several fitted regressors for each row of test_df."""
    preds = list()
    for model in models:
        preds.append(model.predict(test_df))
    # preds is (n_models, n_samples); transpose so each row holds one sample's predictions
    return [np.mean(p) for p in np.array(preds).T]
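
For reference, the same per-row averaging can be written with a single NumPy call; this sketch is equivalent to the helper above, not a separate method from the notebook:

def multi_model_prediction_v2(test_df, models):
    # shape (n_models, n_samples): one row of predictions per fitted model
    preds = np.array([model.predict(test_df) for model in models])
    # average across models for each sample
    return np.mean(preds, axis=0)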

In [ ]:


In [ ]:


In [10]:
# rf = RandomForestRegressor(n_estimators=30, max_depth=10, max_features='sqrt')
# lr = LinearRegression()
# X, y = train_test_split(train)  # X = training split, y = holdout split (both DataFrames)

In [11]:
# rf.fit(X.drop(['loss'], axis=1), X.loss)
# lr.fit(X.drop(['loss'], axis=1), X.loss)

In [12]:
#preds = multi_model_prediction(y.drop(['loss'], axis=1), [rf, lr])

In [ ]:
#np.mean([abs(prediction - loss) for prediction, loss in zip(preds, y.loss)])

In [ ]:


In [ ]:
# n_sample = 1000
# errors = list()
# for _ in range(3):
#     sample_data = train.sample(n_sample)
#     X, y = train_test_split(sample_data)
#     rf = RandomForestRegressor(n_estimators=50, max_depth=10, max_features='sqrt')
#     rf.fit(X.drop(['loss'], axis=1), X.loss)
#     lr = LinearRegression()
#     lr.fit(X.drop(['loss'], axis=1), X.loss)
#     gbt = GradientBoostingRegressor(n_estimators=50, max_depth=10, max_features='sqrt')
#     gbt.fit(X.drop(['loss'], axis=1), X.loss)
#     knn = KNeighborsRegressor(n_neighbors=7)
#     knn.fit(X.drop(['loss'], axis=1), X.loss)
#     svr = SVR(kernel='poly', degree=4)
#     svr.fit(X.drop(['loss'], axis=1), X.loss)
#     model_list = [rf, lr, gbt, knn, svr]
#     preds = multi_model_prediction(y.drop(['loss'], axis=1), model_list)
#     errors.append(np.mean([abs(p - loss) for p, loss in zip(preds, y.loss)]))
# np.mean(errors)

In [10]:
test = pd.read_csv('data_files/test.csv')

In [11]:
for var in categorical_vars:
    test = pd.get_dummies(test, columns=[var])

In [12]:
test.head()


Out[12]:
id cont1 cont2 cont3 cont4 cont5 cont6 cont7 cont8 cont9 ... cat116_MW cat116_MX cat116_N cat116_O cat116_Q cat116_R cat116_S cat116_T cat116_U cat116_Y
0 4 0.321594 0.299102 0.246911 0.402922 0.281143 0.466591 0.317681 0.61229 0.34365 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 6 0.634734 0.620805 0.654310 0.946616 0.836443 0.482425 0.443760 0.71330 0.51890 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 9 0.290813 0.737068 0.711159 0.412789 0.718531 0.212308 0.325779 0.29758 0.34365 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 12 0.268622 0.681761 0.592681 0.354893 0.397069 0.369930 0.342355 0.40028 0.33237 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 15 0.553846 0.299102 0.263570 0.696873 0.302678 0.398862 0.391833 0.23688 0.43731 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 1117 columns
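
Note that encoding train and test separately leaves them with different column sets (1,155 vs. 1,117 columns), because some category levels occur in only one of the files; models fit on the encoded train frame cannot be applied to this test frame as-is. One way to line them up, assuming levels unseen in test should become all-zero columns (a sketch):

# align test to the training feature columns; dummies missing from test are filled with 0,
# and dummies that only occur in test are dropped
feature_cols = train.drop(['loss'], axis=1).columns
test = test.reindex(columns=feature_cols, fill_value=0)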


In [ ]:
# fit on the full training set; test must share these feature columns (see alignment note above)
rf = RandomForestRegressor(n_estimators=10, max_depth=10, max_features='sqrt')
rf.fit(train.drop(['loss'], axis=1), train.loss)
lr = LinearRegression()
lr.fit(train.drop(['loss'], axis=1), train.loss)
gbt = GradientBoostingRegressor(n_estimators=10, max_depth=10, max_features='sqrt')
gbt.fit(train.drop(['loss'], axis=1), train.loss)
knn = KNeighborsRegressor(n_neighbors=7)
knn.fit(train.drop(['loss'], axis=1), train.loss)
svr = SVR(kernel='poly', degree=4)
svr.fit(train.drop(['loss'], axis=1), train.loss)
model_list = [rf, lr, gbt, knn, svr]
test['loss'] = multi_model_prediction(test, model_list)

In [ ]:
test[['id', 'loss']].head()

In [ ]:
import csv
with open('tate_submission1.csv', 'w') as file:  # 'w' so reruns do not append duplicate rows
    writer = csv.writer(file)
    writer.writerow(['id', 'loss'])
    writer.writerows(test[['id', 'loss']].values.tolist())
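
An equivalent, shorter way to write the same submission file with pandas (assuming only the id and loss columns are wanted):

# same two-column file, written directly from the DataFrame
test[['id', 'loss']].to_csv('tate_submission1.csv', index=False)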

In [ ]:


In [13]:
# relies on the holdout split `y` from the (commented-out) train_test_split cell above
predictions = rf.predict(y.drop(['loss'], axis=1))

In [14]:
np.mean([abs(prediction - loss) for prediction, loss in zip(predictions, y.loss)])


Out[14]:
1263.9585367398727

In [16]:
# def mae(estimator, X, y):
#     return np.mean([abs(prediction - value) 
#                     for prediction, value in zip(estimator.predict(X), y)])

In [17]:
# param_grid = {'n_estimators': np.arange(50, 251, 50), 
#               'max_depth': np.arange(5, 21, 5),
#              'max_features': ['auto', 'sqrt']}
# random_forest = RandomForestRegressor()
# cv = GridSearchCV(random_forest, param_grid, scoring=mae)
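
One caveat with a hand-rolled MAE scorer: GridSearchCV treats larger scores as better, so passing raw MAE would steer the search toward the worst parameter settings (which may be why the tuned model below scores worse than the plain random forest). A sketch using scikit-learn's own utilities to flip the sign:

# negate MAE so that "greater is better" holds for the grid search
from sklearn.metrics import mean_absolute_error, make_scorer
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
# cv = GridSearchCV(random_forest, param_grid, scoring=mae_scorer)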

In [20]:
#cv.fit(train.drop(['loss'], axis=1), train.loss)


Out[20]:
GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': array([ 5, 10, 15, 20]), 'n_estimators': array([ 50, 100, 150, 200, 250]), 'max_features': ['auto', 'sqrt']},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function mae at 0x19c23fe18>, verbose=0)

In [21]:
#cv


Out[21]:
GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': array([ 5, 10, 15, 20]), 'n_estimators': array([ 50, 100, 150, 200, 250]), 'max_features': ['auto', 'sqrt']},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function mae at 0x19c23fe18>, verbose=0)

In [22]:
predictions = cv.predict(y.drop(['loss'], axis=1))  # cv: the fitted GridSearchCV from the commented-out cell above

In [23]:
np.mean([abs(prediction - loss) for prediction, loss in zip(predictions, y.loss)])


Out[23]:
1546.402822185109

In [ ]: