notebook.community

Edit and run



In [603]:

    
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import randint as sp_randint
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from itertools import product
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import cross_val_score

%matplotlib inline



In [668]:

    
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")



In [669]:

    
# train = pd.read_csv("../cleanedData/data.train.matrix.csv")
# test = pd.read_csv("../cleanedData/data.test.matrix.csv")



In [670]:

    
# y = train['SalePrice']
# cols = [col for col in train.columns if col not in ['SalePrice']]
# X_train = train[cols]



In [671]:

    
train = pd.read_csv("../rawData/train.csv")
test = pd.read_csv("../rawData/test.csv")



In [672]:

    
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
prices = pd.DataFrame({"price":train["SalePrice"], "log(price + 1)":np.log1p(train["SalePrice"])})
# prices.hist()
#log transform the target:
train["SalePrice"] = np.log1p(train["SalePrice"])

#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())


X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice



In [673]:

    
def rmse_cv(model, X_train, y):
    rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 10))
    return(rmse)



In [674]:

    
# more features
all_data.head()









    Out[674]:






  
    
      
      MSSubClass
      LotFrontage
      LotArea
      OverallQual
      OverallCond
      YearBuilt
      YearRemodAdd
      MasVnrArea
      BsmtFinSF1
      BsmtFinSF2
      ...
      SaleType_ConLw
      SaleType_New
      SaleType_Oth
      SaleType_WD
      SaleCondition_Abnorml
      SaleCondition_AdjLand
      SaleCondition_Alloca
      SaleCondition_Family
      SaleCondition_Normal
      SaleCondition_Partial
    
  
  
    
      0
      4.110874
      4.189655
      9.042040
      7
      5
      2003
      2003
      5.283204
      6.561031
      0.0
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
    
    
      1
      3.044522
      4.394449
      9.169623
      6
      8
      1976
      1976
      0.000000
      6.886532
      0.0
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
    
    
      2
      4.110874
      4.234107
      9.328212
      7
      5
      2001
      2002
      5.093750
      6.188264
      0.0
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
    
    
      3
      4.262680
      4.110874
      9.164401
      7
      5
      1915
      1970
      0.000000
      5.379897
      0.0
      ...
      0
      0
      0
      1
      1
      0
      0
      0
      0
      0
    
    
      4
      4.110874
      4.442651
      9.565284
      8
      5
      2000
      2000
      5.860786
      6.486161
      0.0
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
    
  

5 rows × 288 columns



In [675]:

    
model_ridge = Ridge()



In [676]:

    
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv(Ridge(alpha = alpha), X_train, y).mean() 
            for alpha in alphas]



In [677]:

    
cv_ridge = pd.Series(cv_ridge, index = alphas)

# cv_ridge.plot(title = "Validation - Just Do It")
# plt.xlabel("alpha")
# plt.ylabel("rmse")



In [678]:

    
cv_ridge.mean()









    Out[678]:





0.13065920469395489



In [679]:

    
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train, y)



In [680]:

    
# list(rmse_cv(model_lasso))



In [681]:

    
cv_lasso = pd.Series(list(rmse_cv(model_lasso, X_train, y)))
# cv_ridge.plot()
# plt.xlabel("alpha")
# plt.ylabel("rmse")



In [682]:

    
cv_lasso = rmse_cv(model_lasso,X_train, y)
cv_lasso.mean()









    Out[682]:





0.12118535046904196



In [726]:

    
X_train_reduced = X_train[X_train.columns[model_lasso.coef_ > 0]]
X_test_reduced = X_test[X_test.columns[model_lasso.coef_ > 0]]



In [721]:

    
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train, y)
cv_lasso = pd.Series(list(rmse_cv(model_lasso, X_train, y)))
cv_lasso = rmse_cv(model_lasso, X_train, y)
cv_lasso.mean()









    Out[721]:





0.12118535046904196



In [722]:

    
model_enet = ElasticNet(alpha=0.0005, l1_ratio=0.8)
rmse_cv(model_enet, X_train, y).mean()









    Out[722]:





0.12096858294376119



In [723]:

    
model_lasso_reduced = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train_reduced, y)
cv_lasso = pd.Series(list(rmse_cv(model_lasso_reduced, X_train_reduced, y)))
cv_lasso = rmse_cv(model_lasso_reduced, X_train_reduced, y)
cv_lasso.mean()









    Out[723]:





0.12355136973347081



In [725]:

    
model_enet_reduced = ElasticNet(alpha=0.0005, l1_ratio=0.8)
rmse_cv(model_enet_reduced, X_train_reduced, y).mean()









    Out[725]:





0.12326889785537645



In [664]:

    
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv(Ridge(alpha = alpha), X_train_reduced, y).mean() 
            for alpha in alphas]



In [665]:

    
cv_ridge = pd.Series(cv_ridge, index = alphas)
cv_ridge.mean()









    Out[665]:





0.12463070050069092



In [ ]:



In [ ]:



In [ ]:

    
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
rmse_cv(clf2, X_train_reduced, y).mean()



In [635]:

    
#ELASTIC NET
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
rmse_cv(clf2).mean()









    Out[635]:





0.12108641434225838



In [590]:

    
#LASSO MODEL
clf1 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf1.fit(X_train, y)
lasso_preds = np.expm1(clf1.predict(X_test))

#ELASTIC NET
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))



In [730]:

    
clf1 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf1.fit(X_train, y)
lasso_preds = np.expm1(clf1.predict(X_test))


clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.8)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))



In [731]:

    
clf3 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf3.fit(X_train, y)
lasso_preds_reduced = np.expm1(clf3.predict(X_test))


clf4 = ElasticNet(alpha=0.0005, l1_ratio=0.8)
clf4.fit(X_train, y)
elas_preds_reduced = np.expm1(clf4.predict(X_test))



In [743]:

    
final_result = 0.5 * elas_preds + 0.3 * lasso_preds + 0.1 * elas_preds_reduced + 0.1 * lasso_preds_reduced



In [738]:

    
final_result









    Out[738]:





array([ 107984.60355267,  136466.17793046,  162250.4778204 , ...,
        152202.39372745,  106790.58972291,  204497.66980462])



In [458]:

    
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train, y)









    



/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:479: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)



In [298]:

    
# model_elastic = ElasticNetCV(l1_ratio = [num / 100 for num in range(1, 100)], cv=5).fit(X_train, y)



In [300]:

    
param_bundle = {
    
    'randomForest': {
        "max_depth": [None], 
        "n_estimators": [2000],
        'n_jobs':[-1]
    },
    'gbm': {
        "max_depth": [None], 
        "n_estimators": [500],
        "learning_rate": [0.1, 0.05]
    }
    
}



In [301]:

    
model_bundle = {
    'randomForest': RandomForestRegressor,
    'gbm': GradientBoostingRegressor
    
}



In [302]:

    
# for key, val in param_bunble.items():
# param_grid = param_bundle['gbm']
# model = model_bundle['gbm']()
# #     clf = model(param)
# grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs = -1, cv= 8)
# grid_search.fit(X_train, y)
# report(grid_search.cv_results_)
cv_result = rmse_cv(GradientBoostingRegressor(n_estimators=500))
print(cv_result.mean())









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-302-7156e9c40fe3> in <module>()
      6 # grid_search.fit(X_train, y)
      7 # report(grid_search.cv_results_)
----> 8 cv_result = rmse_cv(GradientBoostingRegressor(n_estimators=500))
      9 print(cv_result.mean())

<ipython-input-293-2195637862bb> in rmse_cv(model)
      1 def rmse_cv(model):
----> 2     rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 10))
      3     return(rmse)
      4 # X_train = X_train[X_train.columns[model_lasso.coef_ > 0]]

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
    138                                               train, test, verbose, None,
    139                                               fit_params)
--> 140                       for train, test in cv.split(X, y, groups))
    141     return np.array(scores)[:, 0]
    142 

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    320         # Don't delay the application, to avoid keeping the input
    321         # arguments in memory
--> 322         self.results = batch()
    323 
    324     def get(self):

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    236             estimator.fit(X_train, **fit_params)
    237         else:
--> 238             estimator.fit(X_train, y_train, **fit_params)
    239 
    240     except Exception as e:

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/ensemble/gradient_boosting.py in fit(self, X, y, sample_weight, monitor)
   1026         # fit the boosting stages
   1027         n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
-> 1028                                     begin_at_stage, monitor, X_idx_sorted)
   1029         # change shape of arrays after fit (early-stopping or additional ests)
   1030         if n_stages != self.estimators_.shape[0]:

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/ensemble/gradient_boosting.py in _fit_stages(self, X, y, y_pred, sample_weight, random_state, begin_at_stage, monitor, X_idx_sorted)
   1081             y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
   1082                                      sample_mask, random_state, X_idx_sorted,
-> 1083                                      X_csc, X_csr)
   1084 
   1085             # track deviance (= loss)

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/ensemble/gradient_boosting.py in _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, random_state, X_idx_sorted, X_csc, X_csr)
    785             else:
    786                 tree.fit(X, residual, sample_weight=sample_weight,
--> 787                          check_input=False, X_idx_sorted=X_idx_sorted)
    788 
    789             # update tree leaves

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/tree/tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    375                                            max_leaf_nodes, self.min_impurity_split)
    376 
--> 377         builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)
    378 
    379         if self.n_outputs_ == 1:

KeyboardInterrupt:



In [ ]:



In [366]:

    
clf = GradientBoostingRegressor(n_estimators=500)
clf.fit(X_train, y)
print(cv_result)









    



[ 0.13839848  0.09917672  0.10317121  0.16675129  0.15574974  0.10727297
  0.13311965  0.10372274  0.11709455  0.13873188]



In [368]:

    
X_test['PriceRange'] =  clf.predict(X_test)









    



/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [373]:

    
X_test['PriceRange'] =  clf.predict(X_test)
X_test = pd.get_dummies(X_test)



In [378]:

    
X_test.shape









    Out[378]:





(1459, 292)



In [127]:

    
clf = GradientBoostingRegressor(n_estimators=500)

param_grid = {"max_depth": [3, 5, None], "learning_rate": [0.1, 0.05, 0.001]}    
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = -1, cv= 8)

grid_search.fit(X_train, y)









    Out[127]:





GridSearchCV(cv=8, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=200,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.05], 'max_depth': [1, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)



In [123]:

    
report(grid_search.cv_results_)









    



Model with rank: 1
Mean validation score: 0.906 (std: 0.016)
Parameters: {'learning_rate': 0.1, 'max_depth': 3}

Model with rank: 2
Mean validation score: 0.902 (std: 0.015)
Parameters: {'learning_rate': 0.05, 'max_depth': 3}

Model with rank: 3
Mean validation score: 0.884 (std: 0.018)
Parameters: {'learning_rate': 0.1, 'max_depth': 1}



In [124]:

    
cv_rf = rmse_cv(grid_search)



In [54]:

    
cv_rf.mean()









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-54-455094036f49> in <module>()
----> 1 cv_rf.mean()

NameError: name 'cv_rf' is not defined



In [98]:

    
learning_rates = [0.05, 0.1]
cv_gbm = [rmse_cv(GradientBoostingRegressor(n_estimators= 500, learning_rate = learning_rate)).mean() 
            for learning_rate in learning_rates]
cv_gbm = pd.Series(cv_gbm, index = learning_rates)



In [99]:

    
cv_gbm.min()









    Out[99]:





0.12565536164521188



In [53]:

    
rmse_cv(clf1).mean()









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-53-22c91a9e6add> in <module>()
----> 1 rmse_cv(clf1).mean()

NameError: name 'clf1' is not defined



In [744]:

    
sample = pd.read_csv("../rawData/sample_submission.csv")



In [745]:

    
sample.head()









    Out[745]:






  
    
      
      Id
      SalePrice
    
  
  
    
      0
      1461
      169277.052498
    
    
      1
      1462
      187758.393989
    
    
      2
      1463
      183583.683570
    
    
      3
      1464
      179317.477511
    
    
      4
      1465
      150730.079977



In [746]:

    
sample['SalePrice'] = final_result



In [747]:

    
sample.to_csv("../submission/15th.csv", index = False)



In [75]:

    
# X_train.to_csv("../cleanedData/data.train.matrix2.csv")



In [ ]:

	MSSubClass	LotFrontage	LotArea	OverallQual	OverallCond	YearBuilt	YearRemodAdd	MasVnrArea	BsmtFinSF1	...	SaleType_WD	SaleCondition_Abnorml	SaleCondition_Normal
0	4.110874	4.189655	9.042040	7	5	2003	2003	5.283204	6.561031	...	1	0	1
1	3.044522	4.394449	9.169623	6	8	1976	1976	0.000000	6.886532	...	1	0	1
2	4.110874	4.234107	9.328212	7	5	2001	2002	5.093750	6.188264	...	1	0	1
3	4.262680	4.110874	9.164401	7	5	1915	1970	0.000000	5.379897	...	1	1	0
4	4.110874	4.442651	9.565284	8	5	2000	2000	5.860786	6.486161	...	1	0	1

	Id	SalePrice
0	1461	169277.052498
1	1462	187758.393989
2	1463	183583.683570
3	1464	179317.477511
4	1465	150730.079977