In [27]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4
In [2]:
n_folds = 5
In [3]:
X = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
In [4]:
X.drop('ID', axis=1, inplace=True)
test_id = test.pop('ID')
# train in log space: with a log1p-transformed target, plain RMSE equals RMSLE
X['target'] = np.log1p(X['target'])
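Because the model is trained on log1p(target), predictions must be inverted with expm1 before submission. A quick illustrative check (not part of the original pipeline) confirms the round trip is exact:
In [ ]:
# np.expm1 inverts np.log1p, so training in log space and mapping back
# at submission time round-trips the target values
vals = np.array([0.0, 1.5, 2e7])
assert np.allclose(np.expm1(np.log1p(vals)), vals)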
In [5]:
y = X.pop('target')
In [6]:
cols_to_remove = []
for col in X.columns:
    if X[col].std() == 0:
        cols_to_remove.append(col)
# remove constant columns from the training set
X.drop(cols_to_remove, axis=1, inplace=True)
# drop the same columns from the test set so train and test stay aligned
test.drop(cols_to_remove, axis=1, inplace=True)
print("Removed `{}` Constant Columns\n".format(len(cols_to_remove)))
In [7]:
cols_to_remove = []
cols_scanned = []
dups = {}
columns = X.columns
for i in range(len(columns) - 1):
    v = X[columns[i]].values
    dup_cols = []
    for j in range(i + 1, len(columns)):
        # skip columns already flagged so each duplicate is recorded once
        if columns[j] not in cols_scanned and np.array_equal(v, X[columns[j]].values):
            cols_to_remove.append(columns[j])
            dup_cols.append(columns[j])
            cols_scanned.append(columns[j])
    if dup_cols:
        dups[columns[i]] = dup_cols
# remove duplicate columns in the training set
X.drop(cols_to_remove, axis=1, inplace=True)
# remove the same columns in the test set
test.drop(cols_to_remove, axis=1, inplace=True)
print("Removed `{}` Duplicate Columns\n".format(len(cols_to_remove)))
In [8]:
NUM_OF_FEATURES = 1000

def rmsle(y, pred):
    # plain RMSE; because the target is already log1p-transformed,
    # this is the competition's RMSLE on the original scale
    return np.sqrt(np.mean(np.power(y - pred, 2)))

# quick hold-out fit to rank features by random-forest importance
x1, x2, y1, y2 = train_test_split(X, y, test_size=0.20, random_state=5)
model = RandomForestRegressor(n_jobs=-1, random_state=7)
model.fit(x1, y1)
print(rmsle(y2, model.predict(x2)))

# keep only the NUM_OF_FEATURES most important columns
col = pd.DataFrame({'importance': model.feature_importances_, 'feature': X.columns}).sort_values(
    by=['importance'], ascending=[False])[:NUM_OF_FEATURES]['feature'].values
X = X[col]
test = test[col]
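Keeping the top 1000 features is a judgment call. Since RandomForestRegressor importances are normalized to sum to 1, a quick diagnostic (illustrative, not in the original) shows how much importance mass the cutoff retains:
In [ ]:
# partial sum of the sorted importances = fraction of importance mass kept
imp = np.sort(model.feature_importances_)[::-1]
print("Importance mass in top {}: {:.1%}".format(NUM_OF_FEATURES, imp[:NUM_OF_FEATURES].sum()))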
In [9]:
def rmsle_cv(model):
    # pass the KFold object itself so cross_val_score uses the shuffled splits;
    # calling .get_n_splits() here would hand cross_val_score a plain integer
    # and silently fall back to unshuffled folds
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse
In [10]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.054, gamma=0.4,
                             learning_rate=0.01, max_depth=8,
                             min_child_weight=5, n_estimators=1000,
                             reg_alpha=1e-05, reg_lambda=0.8571,
                             subsample=0.6, random_state=7,
                             n_jobs=4)
In [11]:
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=144,
                              learning_rate=0.005, n_estimators=720,
                              max_depth=13, metric='rmse',
                              max_bin=55, bagging_fraction=0.8,
                              bagging_freq=5, feature_fraction=0.9)
In [12]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    # fit clones of the original models so the originals stay untouched
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        for model in self.models_:
            model.fit(X, y)
        return self

    # predict with every cloned model and average the results
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)
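A quick sanity check on synthetic data (illustrative only) confirms the wrapper's prediction is the element-wise mean of its fitted base models:
In [ ]:
from sklearn.datasets import make_regression
# toy regression problem just to exercise the wrapper
Xt, yt = make_regression(n_samples=200, n_features=10, random_state=0)
avg = AveragingModels(models=(model_xgb, model_lgb)).fit(Xt, yt)
base_preds = np.column_stack([m.predict(Xt) for m in avg.models_])
assert np.allclose(avg.predict(Xt), base_preds.mean(axis=1))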
In [28]:
pipe = Pipeline([
    # VarianceThreshold guards each CV fold against columns that become constant
    ('vt', VarianceThreshold(threshold=0.0)),
    ('avg-cv', AveragingModels(models=(model_xgb, model_lgb)))
])
In [29]:
score = rmsle_cv(pipe)
print("Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
In [16]:
submit = pd.DataFrame()
submit['ID'] = test_id
In [17]:
pipe.fit(X, y)
predictions = pipe.predict(test)
In [18]:
# map predictions back from log space to the original target scale
submit['target'] = np.expm1(predictions)
submit.to_csv('my_XGB_prediction.csv', index=False)
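Before uploading, a small check (illustrative) confirms the file has one row per test ID:
In [ ]:
# every test ID should appear exactly once in the submission
assert len(submit) == len(test_id)
print(submit.head())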