In [603]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import randint as sp_randint
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import skew
# FIX: scipy.stats.stats is a private module path that was deprecated and
# later removed; pearsonr is publicly exposed at scipy.stats.
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from itertools import product
from sklearn.ensemble import VotingClassifier  # NOTE(review): unused below, and a classifier in a regression notebook
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.linear_model import ElasticNetCV
%matplotlib inline
In [668]:
def report(results, n_top=3):
    """Print the top-ranked parameter settings from a CV search.

    Parameters
    ----------
    results : dict
        A ``cv_results_`` mapping from GridSearchCV / RandomizedSearchCV,
        with 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params' entries.
    n_top : int, optional
        How many ranks to report; ties at a rank are all printed.
    """
    for rank in range(1, n_top + 1):
        # A rank may be shared by several candidates — print them all.
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][idx],
                results['std_test_score'][idx]))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")
In [669]:
# train = pd.read_csv("../cleanedData/data.train.matrix.csv")
# test = pd.read_csv("../cleanedData/data.test.matrix.csv")
In [670]:
# y = train['SalePrice']
# cols = [col for col in train.columns if col not in ['SalePrice']]
# X_train = train[cols]
In [671]:
# Load the raw Kaggle "House Prices" data (not the pre-cleaned matrices above).
train = pd.read_csv("../rawData/train.csv")
test = pd.read_csv("../rawData/test.csv")
In [672]:
# Stack train and test features (columns MSSubClass..SaleCondition, i.e.
# excluding Id and the SalePrice target) so the log transforms and dummy
# encoding below are applied identically to both sets.
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
test.loc[:,'MSSubClass':'SaleCondition']))
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
# Side-by-side view of raw vs. log1p-transformed target (plotting disabled).
prices = pd.DataFrame({"price":train["SalePrice"], "log(price + 1)":np.log1p(train["SalePrice"])})
# prices.hist()
#log transform the target:
train["SalePrice"] = np.log1p(train["SalePrice"])
#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
# NOTE(review): skewness is measured on the train rows only, but the log1p
# transform is then applied to the combined frame — presumably to avoid
# peeking at test statistics; confirm this is intentional.
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
# One-hot encode categoricals, then impute remaining NaNs with column means.
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())
# Positional split back into train/test design matrices; y is log1p(SalePrice).
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice
In [673]:
def rmse_cv(model, X_train, y):
    """Root-mean-squared error of `model` under 10-fold cross-validation.

    Returns the array of 10 per-fold RMSE values. sklearn reports the
    negated MSE for this scorer, so the sign is flipped back before the
    square root.
    """
    neg_mse = cross_val_score(model, X_train, y,
                              scoring="neg_mean_squared_error", cv=10)
    return np.sqrt(-neg_mse)
In [674]:
# more features
# Quick sanity preview of the encoded design matrix.
all_data.head()
Out[674]:
In [675]:
model_ridge = Ridge()
In [676]:
# Sweep the ridge regularization strength and record mean CV RMSE per alpha.
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv(Ridge(alpha = alpha), X_train, y).mean()
for alpha in alphas]
In [677]:
# Index the scores by alpha for easy plotting/inspection.
cv_ridge = pd.Series(cv_ridge, index = alphas)
# cv_ridge.plot(title = "Validation - Just Do It")
# plt.xlabel("alpha")
# plt.ylabel("rmse")
In [678]:
# Mean RMSE across the whole alpha sweep (min() would show the best alpha).
cv_ridge.mean()
Out[678]:
In [679]:
# LassoCV picks the best alpha from the grid via internal cross-validation.
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train, y)
In [680]:
# list(rmse_cv(model_lasso))
In [681]:
# NOTE(review): this Series is immediately overwritten two lines below.
cv_lasso = pd.Series(list(rmse_cv(model_lasso, X_train, y)))
# cv_ridge.plot()
# plt.xlabel("alpha")
# plt.ylabel("rmse")
In [682]:
cv_lasso = rmse_cv(model_lasso,X_train, y)
cv_lasso.mean()
Out[682]:
In [726]:
# Lasso-based feature selection: keep the columns the fitted LassoCV
# assigned a non-zero coefficient.
# FIX: the original kept only coef_ > 0, silently discarding features with
# informative *negative* coefficients; lasso sparsity selects coef_ != 0.
X_train_reduced = X_train[X_train.columns[model_lasso.coef_ != 0]]
X_test_reduced = X_test[X_test.columns[model_lasso.coef_ != 0]]
In [721]:
# Baseline: lasso and elastic-net CV scores on the FULL feature set.
# NOTE(review): near-duplicate of cells In[679]-In[682]; the intermediate
# pd.Series is overwritten on the next line here too.
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train, y)
cv_lasso = pd.Series(list(rmse_cv(model_lasso, X_train, y)))
cv_lasso = rmse_cv(model_lasso, X_train, y)
cv_lasso.mean()
Out[721]:
In [722]:
model_enet = ElasticNet(alpha=0.0005, l1_ratio=0.8)
rmse_cv(model_enet, X_train, y).mean()
Out[722]:
In [723]:
# Comparison: the same two models on the lasso-REDUCED feature set.
model_lasso_reduced = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train_reduced, y)
cv_lasso = pd.Series(list(rmse_cv(model_lasso_reduced, X_train_reduced, y)))
cv_lasso = rmse_cv(model_lasso_reduced, X_train_reduced, y)
cv_lasso.mean()
Out[723]:
In [725]:
model_enet_reduced = ElasticNet(alpha=0.0005, l1_ratio=0.8)
rmse_cv(model_enet_reduced, X_train_reduced, y).mean()
Out[725]:
In [664]:
# Ridge alpha sweep repeated on the reduced feature set (cf. In[676]).
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv(Ridge(alpha = alpha), X_train_reduced, y).mean()
for alpha in alphas]
In [665]:
cv_ridge = pd.Series(cv_ridge, index = alphas)
cv_ridge.mean()
Out[665]:
In [ ]:
In [ ]:
In [ ]:
# ElasticNet on the reduced feature set (scratch cell, never executed).
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
rmse_cv(clf2, X_train_reduced, y).mean()
In [635]:
#ELASTIC NET
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
# FIX: rmse_cv() requires the design matrix and target — the zero-argument
# call raises TypeError under the current signature (stale call left over
# from an earlier version of rmse_cv).
rmse_cv(clf2, X_train, y).mean()
Out[635]:
In [590]:
#LASSO MODEL
# NOTE(review): 0.0005 and 5e-4 are the same alpha listed twice.
clf1 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf1.fit(X_train, y)
# expm1 undoes the log1p applied to SalePrice, back to the dollar scale.
lasso_preds = np.expm1(clf1.predict(X_test))
#ELASTIC NET
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))
In [730]:
# Re-fit with l1_ratio=0.8 — overwrites clf1/clf2 and the *_preds above,
# so only this cell's predictions reach the final blend.
clf1 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf1.fit(X_train, y)
lasso_preds = np.expm1(clf1.predict(X_test))
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.8)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))
In [731]:
# FIX: these are meant to be the *reduced-feature* models (their outputs feed
# the blend as lasso_preds_reduced / elas_preds_reduced), but they were fit
# and evaluated on the full X_train / X_test, making them exact duplicates of
# clf1 / clf2 above. Use the lasso-selected column subset instead.
clf3 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf3.fit(X_train_reduced, y)
lasso_preds_reduced = np.expm1(clf3.predict(X_test_reduced))
clf4 = ElasticNet(alpha=0.0005, l1_ratio=0.8)
clf4.fit(X_train_reduced, y)
elas_preds_reduced = np.expm1(clf4.predict(X_test_reduced))
In [743]:
# Manually chosen blend of the four prediction vectors (weights sum to 1.0).
final_result = 0.5 * elas_preds + 0.3 * lasso_preds + 0.1 * elas_preds_reduced + 0.1 * lasso_preds_reduced
In [738]:
final_result
Out[738]:
In [458]:
# NOTE(review): stale re-fit from an earlier session; result is unused below.
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train, y)
In [298]:
# model_elastic = ElasticNetCV(l1_ratio = [num / 100 for num in range(1, 100)], cv=5).fit(X_train, y)
In [300]:
# Hyper-parameter grids, keyed by model nickname, for the grid search below.
param_bundle = {
    'randomForest': {"max_depth": [None], "n_estimators": [2000], 'n_jobs': [-1]},
    'gbm': {"max_depth": [None], "n_estimators": [500], "learning_rate": [0.1, 0.05]},
}
In [301]:
# Map model nickname -> regressor class (instantiated at search time).
model_bundle = dict(
    randomForest=RandomForestRegressor,
    gbm=GradientBoostingRegressor,
)
In [302]:
# Remains of an abandoned loop over param_bundle/model_bundle (note the
# 'param_bunble' typo in the dead code).
# for key, val in param_bunble.items():
# param_grid = param_bundle['gbm']
# model = model_bundle['gbm']()
# # clf = model(param)
# grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs = -1, cv= 8)
# grid_search.fit(X_train, y)
# report(grid_search.cv_results_)
# FIX: rmse_cv() takes (model, X_train, y); the one-argument call raises
# TypeError (stale call against an earlier rmse_cv signature).
cv_result = rmse_cv(GradientBoostingRegressor(n_estimators=500), X_train, y)
print(cv_result.mean())
In [ ]:
In [366]:
clf = GradientBoostingRegressor(n_estimators=500)
clf.fit(X_train, y)
# NOTE(review): prints the cv_result from the previous cell, not a score for
# this freshly fitted clf.
print(cv_result)
In [368]:
# NOTE(review): mutates X_test in place, appending the model's own prediction
# as a feature — this corrupts X_test for every other cell that predicts on
# it and only "works" because of out-of-order execution. Assign to a copy
# (or a new variable) instead.
X_test['PriceRange'] = clf.predict(X_test)
In [373]:
# NOTE(review): on a fresh top-to-bottom run this second predict sees the
# extra 'PriceRange' column added above — likely a feature-mismatch failure;
# get_dummies is also a no-op here since X_test was already encoded. Verify.
X_test['PriceRange'] = clf.predict(X_test)
X_test = pd.get_dummies(X_test)
In [378]:
X_test.shape
Out[378]:
In [127]:
clf = GradientBoostingRegressor(n_estimators=500)
param_grid = {"max_depth": [3, 5, None], "learning_rate": [0.1, 0.05, 0.001]}
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = -1, cv= 8)
grid_search.fit(X_train, y)
Out[127]:
In [123]:
report(grid_search.cv_results_)
In [124]:
cv_rf = rmse_cv(grid_search)
In [54]:
cv_rf.mean()
In [98]:
learning_rates = [0.05, 0.1]
cv_gbm = [rmse_cv(GradientBoostingRegressor(n_estimators= 500, learning_rate = learning_rate)).mean()
for learning_rate in learning_rates]
cv_gbm = pd.Series(cv_gbm, index = learning_rates)
In [99]:
cv_gbm.min()
Out[99]:
In [53]:
rmse_cv(clf1).mean()
In [744]:
# Use the provided sample submission as the template for Id/SalePrice columns.
sample = pd.read_csv("../rawData/sample_submission.csv")
In [745]:
sample.head()
Out[745]:
In [746]:
# Overwrite the placeholder prices with the blended predictions.
sample['SalePrice'] = final_result
In [747]:
sample.to_csv("../submission/15th.csv", index = False)
In [75]:
# X_train.to_csv("../cleanedData/data.train.matrix2.csv")
In [ ]: