In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier)
# model_selection supersedes the deprecated cross_validation and grid_search modules
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from gensim.models import word2vec
import nltk
from scipy import stats
from itertools import combinations
import pickle 
import warnings
warnings.filterwarnings("ignore")



In [2]:
train = pd.read_csv('data_files/train.csv')

In [3]:
train.head()


Out[3]:
id cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 ... cont6 cont7 cont8 cont9 cont10 cont11 cont12 cont13 cont14 loss
0 1 A B A B A A A A B ... 0.718367 0.335060 0.30260 0.67135 0.83510 0.569745 0.594646 0.822493 0.714843 2213.18
1 2 A B A A A A A A B ... 0.438917 0.436585 0.60087 0.35127 0.43919 0.338312 0.366307 0.611431 0.304496 1283.60
2 5 A B A A B A A A B ... 0.289648 0.315545 0.27320 0.26076 0.32446 0.381398 0.373424 0.195709 0.774425 3005.09
3 10 B B A B A A A A B ... 0.440945 0.391128 0.31796 0.32128 0.44467 0.327915 0.321570 0.605077 0.602642 939.85
4 11 A B A B A A A A B ... 0.178193 0.247408 0.24564 0.22089 0.21230 0.204687 0.202213 0.246011 0.432606 2763.85

5 rows × 132 columns


In [4]:
train.shape


Out[4]:
(188318, 132)

In [5]:
train.dtypes


Out[5]:
id          int64
cat1       object
cat2       object
cat3       object
cat4       object
cat5       object
cat6       object
cat7       object
cat8       object
cat9       object
cat10      object
cat11      object
cat12      object
cat13      object
cat14      object
cat15      object
cat16      object
cat17      object
cat18      object
cat19      object
cat20      object
cat21      object
cat22      object
cat23      object
cat24      object
cat25      object
cat26      object
cat27      object
cat28      object
cat29      object
           ...   
cat102     object
cat103     object
cat104     object
cat105     object
cat106     object
cat107     object
cat108     object
cat109     object
cat110     object
cat111     object
cat112     object
cat113     object
cat114     object
cat115     object
cat116     object
cont1     float64
cont2     float64
cont3     float64
cont4     float64
cont5     float64
cont6     float64
cont7     float64
cont8     float64
cont9     float64
cont10    float64
cont11    float64
cont12    float64
cont13    float64
cont14    float64
loss      float64
dtype: object

cat1 - cat116 are categorical; cont1 - cont14 are continuous, and loss is the target.


In [6]:
categorical_vars = ['cat{}'.format(i+1) for i in range(116)]
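
Before encoding, a quick look at how many levels each categorical feature takes gives a sense of how wide the one-hot encoding will be (a minimal sketch, run on the raw frame):

level_counts = train[categorical_vars].nunique().sort_values(ascending=False)
level_counts.head(10)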

In [7]:
for var in categorical_vars:
    train = pd.get_dummies(train, columns=[var])
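
The loop above one-hot encodes each column separately; pandas can also encode the whole list in a single call, which should produce the same result (an equivalent one-liner, shown commented out so it is not run in addition to the loop):

# train = pd.get_dummies(train, columns=categorical_vars)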

In [8]:
train.head()


Out[8]:
id cont1 cont2 cont3 cont4 cont5 cont6 cont7 cont8 cont9 ... cat116_P cat116_Q cat116_R cat116_S cat116_T cat116_U cat116_V cat116_W cat116_X cat116_Y
0 1 0.726300 0.245921 0.187583 0.789639 0.310061 0.718367 0.335060 0.30260 0.67135 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 2 0.330514 0.737068 0.592681 0.614134 0.885834 0.438917 0.436585 0.60087 0.35127 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 5 0.261841 0.358319 0.484196 0.236924 0.397069 0.289648 0.315545 0.27320 0.26076 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 10 0.321594 0.555782 0.527991 0.373816 0.422268 0.440945 0.391128 0.31796 0.32128 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 11 0.273204 0.159990 0.527991 0.473202 0.704268 0.178193 0.247408 0.24564 0.22089 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 1155 columns


In [9]:
def multi_model_prediction(test_df, models):
    """Average the predictions of several fitted regressors for each row of test_df."""
    preds = list()
    for model in models:
        preds.append(model.predict(test_df))
    # preds is (n_models, n_samples); transpose so each row holds one sample's predictions
    return [np.mean(p) for p in np.array(preds).T]
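
For reference, the same per-row averaging can be written with a single NumPy call; this sketch is equivalent to the helper above, not a separate method from the notebook:

def multi_model_prediction_v2(test_df, models):
    # shape (n_models, n_samples): one row of predictions per fitted model
    preds = np.array([model.predict(test_df) for model in models])
    # average across models for each sample
    return np.mean(preds, axis=0)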

In [ ]:


In [ ]:


In [10]:
# rf = RandomForestRegressor(n_estimators=30, max_depth=10, max_features='sqrt')
# lr = LinearRegression()
# X, y = train_test_split(train)  # X = training split, y = holdout split (both DataFrames)

In [11]:
# rf.fit(X.drop(['loss'], axis=1), X.loss)
# lr.fit(X.drop(['loss'], axis=1), X.loss)

In [12]:
#preds = multi_model_prediction(y.drop(['loss'], axis=1), [rf, lr])

In [ ]:
#np.mean([abs(prediction - loss) for prediction, loss in zip(preds, y.loss)])

In [ ]:


In [ ]:
# n_sample = 1000
# errors = list()
# for _ in range(3):
#     sample_data = train.sample(n_sample)
#     X, y = train_test_split(sample_data)
#     rf = RandomForestRegressor(n_estimators=50, max_depth=10, max_features='sqrt')
#     rf.fit(X.drop(['loss'], axis=1), X.loss)
#     lr = LinearRegression()
#     lr.fit(X.drop(['loss'], axis=1), X.loss)
#     gbt = GradientBoostingRegressor(n_estimators=50, max_depth=10, max_features='sqrt')
#     gbt.fit(X.drop(['loss'], axis=1), X.loss)
#     knn = KNeighborsRegressor(n_neighbors=7)
#     knn.fit(X.drop(['loss'], axis=1), X.loss)
#     svr = SVR(kernel='poly', degree=4)
#     svr.fit(X.drop(['loss'], axis=1), X.loss)
#     model_list = [rf, lr, gbt, knn, svr]
#     preds = multi_model_prediction(y.drop(['loss'], axis=1), model_list)
#     errors.append(np.mean([abs(p - loss) for p, loss in zip(preds, y.loss)]))
# np.mean(errors)

In [10]:
test = pd.read_csv('data_files/test.csv')

In [11]:
for var in categorical_vars:
    test = pd.get_dummies(test, columns=[var])

In [12]:
test.head()


Out[12]:
id cont1 cont2 cont3 cont4 cont5 cont6 cont7 cont8 cont9 ... cat116_MW cat116_MX cat116_N cat116_O cat116_Q cat116_R cat116_S cat116_T cat116_U cat116_Y
0 4 0.321594 0.299102 0.246911 0.402922 0.281143 0.466591 0.317681 0.61229 0.34365 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 6 0.634734 0.620805 0.654310 0.946616 0.836443 0.482425 0.443760 0.71330 0.51890 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 9 0.290813 0.737068 0.711159 0.412789 0.718531 0.212308 0.325779 0.29758 0.34365 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 12 0.268622 0.681761 0.592681 0.354893 0.397069 0.369930 0.342355 0.40028 0.33237 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 15 0.553846 0.299102 0.263570 0.696873 0.302678 0.398862 0.391833 0.23688 0.43731 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 1117 columns
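
Note that encoding train and test separately leaves them with different column sets (1,155 vs. 1,117 columns), because some category levels occur in only one of the files; models fit on the encoded train frame cannot be applied to this test frame as-is. One way to line them up, assuming levels unseen in test should become all-zero columns (a sketch):

# align test to the training feature columns; dummies missing from test are filled with 0,
# and dummies that only occur in test are dropped
feature_cols = train.drop(['loss'], axis=1).columns
test = test.reindex(columns=feature_cols, fill_value=0)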


In [ ]:
# fit on the full training set; test must share these feature columns (see alignment note above)
rf = RandomForestRegressor(n_estimators=10, max_depth=10, max_features='sqrt')
rf.fit(train.drop(['loss'], axis=1), train.loss)
lr = LinearRegression()
lr.fit(train.drop(['loss'], axis=1), train.loss)
gbt = GradientBoostingRegressor(n_estimators=10, max_depth=10, max_features='sqrt')
gbt.fit(train.drop(['loss'], axis=1), train.loss)
knn = KNeighborsRegressor(n_neighbors=7)
knn.fit(train.drop(['loss'], axis=1), train.loss)
svr = SVR(kernel='poly', degree=4)
svr.fit(train.drop(['loss'], axis=1), train.loss)
model_list = [rf, lr, gbt, knn, svr]
test['loss'] = multi_model_prediction(test, model_list)

In [ ]:
test[['id', 'loss']].head()

In [ ]:
import csv
with open('tate_submission1.csv', 'w') as file:  # 'w' so reruns do not append duplicate rows
    writer = csv.writer(file)
    writer.writerow(['id', 'loss'])
    writer.writerows(test[['id', 'loss']].values.tolist())
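
An equivalent, shorter way to write the same submission file with pandas (assuming only the id and loss columns are wanted):

# same two-column file, written directly from the DataFrame
test[['id', 'loss']].to_csv('tate_submission1.csv', index=False)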

In [ ]:


In [13]:
# relies on the holdout split `y` from the (commented-out) train_test_split cell above
predictions = rf.predict(y.drop(['loss'], axis=1))

In [14]:
np.mean([abs(prediction - loss) for prediction, loss in zip(predictions, y.loss)])


Out[14]:
1263.9585367398727

In [16]:
# def mae(estimator, X, y):
#     return np.mean([abs(prediction - value) 
#                     for prediction, value in zip(estimator.predict(X), y)])

In [17]:
# param_grid = {'n_estimators': np.arange(50, 251, 50), 
#               'max_depth': np.arange(5, 21, 5),
#              'max_features': ['auto', 'sqrt']}
# random_forest = RandomForestRegressor()
# cv = GridSearchCV(random_forest, param_grid, scoring=mae)
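
One caveat with a hand-rolled MAE scorer: GridSearchCV treats larger scores as better, so passing raw MAE would steer the search toward the worst parameter settings (which may be why the tuned model below scores worse than the plain random forest). A sketch using scikit-learn's own utilities to flip the sign:

# negate MAE so that "greater is better" holds for the grid search
from sklearn.metrics import mean_absolute_error, make_scorer
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
# cv = GridSearchCV(random_forest, param_grid, scoring=mae_scorer)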

In [20]:
#cv.fit(train.drop(['loss'], axis=1), train.loss)


Out[20]:
GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': array([ 5, 10, 15, 20]), 'n_estimators': array([ 50, 100, 150, 200, 250]), 'max_features': ['auto', 'sqrt']},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function mae at 0x19c23fe18>, verbose=0)

In [21]:
#cv


Out[21]:
GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': array([ 5, 10, 15, 20]), 'n_estimators': array([ 50, 100, 150, 200, 250]), 'max_features': ['auto', 'sqrt']},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function mae at 0x19c23fe18>, verbose=0)

In [22]:
predictions = cv.predict(y.drop(['loss'], axis=1))  # cv: the fitted GridSearchCV from the commented-out cell above

In [23]:
np.mean([abs(prediction - loss) for prediction, loss in zip(predictions, y.loss)])


Out[23]:
1546.402822185109

In [ ]: