In [603]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import randint as sp_randint
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from itertools import product
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [668]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [669]:
# train = pd.read_csv("../cleanedData/data.train.matrix.csv")
# test = pd.read_csv("../cleanedData/data.test.matrix.csv")

In [670]:
# y = train['SalePrice']
# cols = [col for col in train.columns if col not in ['SalePrice']]
# X_train = train[cols]

In [671]:
train = pd.read_csv("../rawData/train.csv")
test = pd.read_csv("../rawData/test.csv")

In [672]:
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
prices = pd.DataFrame({"price":train["SalePrice"], "log(price + 1)":np.log1p(train["SalePrice"])})
# prices.hist()
#log transform the target:
train["SalePrice"] = np.log1p(train["SalePrice"])

#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())


X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice

In [673]:
def rmse_cv(model, X_train, y):
    rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 10))
    return(rmse)

In [674]:
# more features
all_data.head()


Out[674]:
MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 ... SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
0 4.110874 4.189655 9.042040 7 5 2003 2003 5.283204 6.561031 0.0 ... 0 0 0 1 0 0 0 0 1 0
1 3.044522 4.394449 9.169623 6 8 1976 1976 0.000000 6.886532 0.0 ... 0 0 0 1 0 0 0 0 1 0
2 4.110874 4.234107 9.328212 7 5 2001 2002 5.093750 6.188264 0.0 ... 0 0 0 1 0 0 0 0 1 0
3 4.262680 4.110874 9.164401 7 5 1915 1970 0.000000 5.379897 0.0 ... 0 0 0 1 1 0 0 0 0 0
4 4.110874 4.442651 9.565284 8 5 2000 2000 5.860786 6.486161 0.0 ... 0 0 0 1 0 0 0 0 1 0

5 rows × 288 columns


In [675]:
model_ridge = Ridge()

In [676]:
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv(Ridge(alpha = alpha), X_train, y).mean() 
            for alpha in alphas]

In [677]:
cv_ridge = pd.Series(cv_ridge, index = alphas)

# cv_ridge.plot(title = "Validation - Just Do It")
# plt.xlabel("alpha")
# plt.ylabel("rmse")

In [678]:
cv_ridge.mean()


Out[678]:
0.13065920469395489

In [679]:
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train, y)

In [680]:
# list(rmse_cv(model_lasso))

In [681]:
cv_lasso = pd.Series(list(rmse_cv(model_lasso, X_train, y)))
# cv_ridge.plot()
# plt.xlabel("alpha")
# plt.ylabel("rmse")

In [682]:
cv_lasso = rmse_cv(model_lasso,X_train, y)
cv_lasso.mean()


Out[682]:
0.12118535046904196

In [726]:
X_train_reduced = X_train[X_train.columns[model_lasso.coef_ > 0]]
X_test_reduced = X_test[X_test.columns[model_lasso.coef_ > 0]]

In [721]:
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train, y)
cv_lasso = pd.Series(list(rmse_cv(model_lasso, X_train, y)))
cv_lasso = rmse_cv(model_lasso, X_train, y)
cv_lasso.mean()


Out[721]:
0.12118535046904196

In [722]:
model_enet = ElasticNet(alpha=0.0005, l1_ratio=0.8)
rmse_cv(model_enet, X_train, y).mean()


Out[722]:
0.12096858294376119

In [723]:
model_lasso_reduced = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train_reduced, y)
cv_lasso = pd.Series(list(rmse_cv(model_lasso_reduced, X_train_reduced, y)))
cv_lasso = rmse_cv(model_lasso_reduced, X_train_reduced, y)
cv_lasso.mean()


Out[723]:
0.12355136973347081

In [725]:
model_enet_reduced = ElasticNet(alpha=0.0005, l1_ratio=0.8)
rmse_cv(model_enet_reduced, X_train_reduced, y).mean()


Out[725]:
0.12326889785537645

In [664]:
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv(Ridge(alpha = alpha), X_train_reduced, y).mean() 
            for alpha in alphas]

In [665]:
cv_ridge = pd.Series(cv_ridge, index = alphas)
cv_ridge.mean()


Out[665]:
0.12463070050069092

In [ ]:


In [ ]:


In [ ]:
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
rmse_cv(clf2, X_train_reduced, y).mean()

In [635]:
#ELASTIC NET
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
rmse_cv(clf2).mean()


Out[635]:
0.12108641434225838

In [590]:
#LASSO MODEL
clf1 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf1.fit(X_train, y)
lasso_preds = np.expm1(clf1.predict(X_test))

#ELASTIC NET
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))

In [730]:
clf1 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf1.fit(X_train, y)
lasso_preds = np.expm1(clf1.predict(X_test))


clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.8)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))

In [731]:
clf3 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf3.fit(X_train, y)
lasso_preds_reduced = np.expm1(clf3.predict(X_test))


clf4 = ElasticNet(alpha=0.0005, l1_ratio=0.8)
clf4.fit(X_train, y)
elas_preds_reduced = np.expm1(clf4.predict(X_test))

In [743]:
final_result = 0.5 * elas_preds + 0.3 * lasso_preds + 0.1 * elas_preds_reduced + 0.1 * lasso_preds_reduced

In [738]:
final_result


Out[738]:
array([ 107984.60355267,  136466.17793046,  162250.4778204 , ...,
        152202.39372745,  106790.58972291,  204497.66980462])

In [458]:
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train, y)


/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:479: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)

In [298]:
# model_elastic = ElasticNetCV(l1_ratio = [num / 100 for num in range(1, 100)], cv=5).fit(X_train, y)

In [300]:
param_bundle = {
    
    'randomForest': {
        "max_depth": [None], 
        "n_estimators": [2000],
        'n_jobs':[-1]
    },
    'gbm': {
        "max_depth": [None], 
        "n_estimators": [500],
        "learning_rate": [0.1, 0.05]
    }
    
}

In [301]:
model_bundle = {
    'randomForest': RandomForestRegressor,
    'gbm': GradientBoostingRegressor
    
}

In [302]:
# for key, val in param_bunble.items():
# param_grid = param_bundle['gbm']
# model = model_bundle['gbm']()
# #     clf = model(param)
# grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs = -1, cv= 8)
# grid_search.fit(X_train, y)
# report(grid_search.cv_results_)
cv_result = rmse_cv(GradientBoostingRegressor(n_estimators=500))
print(cv_result.mean())


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-302-7156e9c40fe3> in <module>()
      6 # grid_search.fit(X_train, y)
      7 # report(grid_search.cv_results_)
----> 8 cv_result = rmse_cv(GradientBoostingRegressor(n_estimators=500))
      9 print(cv_result.mean())

<ipython-input-293-2195637862bb> in rmse_cv(model)
      1 def rmse_cv(model):
----> 2     rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 10))
      3     return(rmse)
      4 # X_train = X_train[X_train.columns[model_lasso.coef_ > 0]]

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
    138                                               train, test, verbose, None,
    139                                               fit_params)
--> 140                       for train, test in cv.split(X, y, groups))
    141     return np.array(scores)[:, 0]
    142 

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    320         # Don't delay the application, to avoid keeping the input
    321         # arguments in memory
--> 322         self.results = batch()
    323 
    324     def get(self):

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    236             estimator.fit(X_train, **fit_params)
    237         else:
--> 238             estimator.fit(X_train, y_train, **fit_params)
    239 
    240     except Exception as e:

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/ensemble/gradient_boosting.py in fit(self, X, y, sample_weight, monitor)
   1026         # fit the boosting stages
   1027         n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
-> 1028                                     begin_at_stage, monitor, X_idx_sorted)
   1029         # change shape of arrays after fit (early-stopping or additional ests)
   1030         if n_stages != self.estimators_.shape[0]:

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/ensemble/gradient_boosting.py in _fit_stages(self, X, y, y_pred, sample_weight, random_state, begin_at_stage, monitor, X_idx_sorted)
   1081             y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
   1082                                      sample_mask, random_state, X_idx_sorted,
-> 1083                                      X_csc, X_csr)
   1084 
   1085             # track deviance (= loss)

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/ensemble/gradient_boosting.py in _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, random_state, X_idx_sorted, X_csc, X_csr)
    785             else:
    786                 tree.fit(X, residual, sample_weight=sample_weight,
--> 787                          check_input=False, X_idx_sorted=X_idx_sorted)
    788 
    789             # update tree leaves

/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/tree/tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    375                                            max_leaf_nodes, self.min_impurity_split)
    376 
--> 377         builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)
    378 
    379         if self.n_outputs_ == 1:

KeyboardInterrupt: 

In [ ]:


In [366]:
clf = GradientBoostingRegressor(n_estimators=500)
clf.fit(X_train, y)
print(cv_result)


[ 0.13839848  0.09917672  0.10317121  0.16675129  0.15574974  0.10727297
  0.13311965  0.10372274  0.11709455  0.13873188]

In [368]:
X_test['PriceRange'] =  clf.predict(X_test)


/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [373]:
X_test['PriceRange'] =  clf.predict(X_test)
X_test = pd.get_dummies(X_test)

In [378]:
X_test.shape


Out[378]:
(1459, 292)

In [127]:
clf = GradientBoostingRegressor(n_estimators=500)

param_grid = {"max_depth": [3, 5, None], "learning_rate": [0.1, 0.05, 0.001]}    
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = -1, cv= 8)

grid_search.fit(X_train, y)


Out[127]:
GridSearchCV(cv=8, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=200,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.05], 'max_depth': [1, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [123]:
report(grid_search.cv_results_)


Model with rank: 1
Mean validation score: 0.906 (std: 0.016)
Parameters: {'learning_rate': 0.1, 'max_depth': 3}

Model with rank: 2
Mean validation score: 0.902 (std: 0.015)
Parameters: {'learning_rate': 0.05, 'max_depth': 3}

Model with rank: 3
Mean validation score: 0.884 (std: 0.018)
Parameters: {'learning_rate': 0.1, 'max_depth': 1}


In [124]:
cv_rf = rmse_cv(grid_search)

In [54]:
cv_rf.mean()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-54-455094036f49> in <module>()
----> 1 cv_rf.mean()

NameError: name 'cv_rf' is not defined

In [98]:
learning_rates = [0.05, 0.1]
cv_gbm = [rmse_cv(GradientBoostingRegressor(n_estimators= 500, learning_rate = learning_rate)).mean() 
            for learning_rate in learning_rates]
cv_gbm = pd.Series(cv_gbm, index = learning_rates)

In [99]:
cv_gbm.min()


Out[99]:
0.12565536164521188

In [53]:
rmse_cv(clf1).mean()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-53-22c91a9e6add> in <module>()
----> 1 rmse_cv(clf1).mean()

NameError: name 'clf1' is not defined

In [744]:
sample = pd.read_csv("../rawData/sample_submission.csv")

In [745]:
sample.head()


Out[745]:
Id SalePrice
0 1461 169277.052498
1 1462 187758.393989
2 1463 183583.683570
3 1464 179317.477511
4 1465 150730.079977

In [746]:
sample['SalePrice'] = final_result

In [747]:
sample.to_csv("../submission/15th.csv", index = False)

In [75]:
# X_train.to_csv("../cleanedData/data.train.matrix2.csv")

In [ ]: