In [27]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from math import expm1, log1p, sqrt
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from sklearn import ensemble
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [2]:
n_folds = 5

In [3]:
X = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
X.drop('ID', axis=1, inplace=True)
test_id = test.pop('ID')
X['target'] = X['target'].apply(lambda x: log1p(x))

In [5]:
y = X.pop('target')

In [6]:
cols_to_remove = []
for col in X.columns:
    if X[col].std() == 0: 
        cols_to_remove.append(col)
        
# remove constant columns in the training set
X.drop(cols_to_remove, axis=1, inplace=True)
# remove constant columns in the test set
test.drop(cols_to_remove, axis=1, inplace=True)

print("Removed `{}` Constant Columns\n".format(len(cols_to_remove)))


Removed `256` Constant Columns


In [7]:
cols_to_remove = []
cols_scaned = []
dups = {}

columns = X.columns
for i in range(len(columns) - 1):
    v = X[columns[i]].values
    dup_cols = []
    for j in range(i + 1, len(columns)):
        if np.array_equal(v, X[columns[j]].values):
            cols_to_remove.append(columns[j])
            if columns[j] not in cols_scaned:
                dup_cols.append(columns[j]) 
                cols_scaned.append(columns[j])
                dups[columns[i]] = dup_cols
                
# remove duplicate columns in the training set
X.drop(cols_to_remove, axis=1, inplace=True) 
# remove duplicate columns in the testing set
test.drop(cols_to_remove, axis=1, inplace=True)

print("Removed `{}` Duplicate Columns\n".format(len(dups)))


Removed `4` Duplicate Columns


In [8]:
NUM_OF_FEATURES = 1000
def rmsle(y, pred):
    return np.sqrt(np.mean(np.power(y - pred, 2)))

x1, x2, y1, y2 = model_selection.train_test_split(X, y, test_size=0.20, random_state=5)
model = ensemble.RandomForestRegressor(n_jobs=-1, random_state=7)
model.fit(x1, y1)
print(rmsle(y2, model.predict(x2)))

col = pd.DataFrame({'importance': model.feature_importances_, 'feature': X.columns}).sort_values(
    by=['importance'], ascending=[False])[:NUM_OF_FEATURES]['feature'].values
X = X[col]
test = test[col]


1.5367277914935242

In [9]:
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X)
    rmse= np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv = kf))
    return (rmse)

In [10]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.054, gamma=0.4, 
                             learning_rate=0.01, max_depth=8, 
                             min_child_weight=5, n_estimators=1000,
                             reg_alpha=1e-05, reg_lambda=0.8571,
                             subsample=0.6, random_state =7,
                             nthread=4)

In [11]:
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=144,
                              learning_rate=0.005, n_estimators=720, max_depth=13,
                              metric='rmse', is_training_metric=True,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.9)

In [12]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y, eval_metric='rmse'):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y, eval_metric=eval_metric)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [28]:
pipe = Pipeline([
    ('vt', VarianceThreshold(threshold=0.0)),
    ('avg-cv', AveragingModels(models = (model_xgb, model_lgb)))
])

In [29]:
score = rmsle_cv(pipe)
print("Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Averaged base models score: 1.4234 (0.0536)


In [16]:
submit = pd.DataFrame()
submit['ID'] = test_id

In [17]:
pipe.fit(X, y, eval_metric='rmse')
predictions = pipe.predict(test)

In [18]:
submit['target'] = [expm1(x) for x in predictions]
submit.to_csv('my_XGB_prediction.csv', index=False)

In [ ]: