In [ ]:
%%javascript
$('<div id="toc"></div>').css({position: 'fixed', top: '120px', left: 0}).appendTo(document.body);
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js');
In [ ]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import numpy as np
import gc
from jupyterthemes import jtplot
jtplot.style()
import xgboost as xg
from xgboost import XGBModel
from xgboost import plot_importance
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, ShuffleSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor
from tqdm import tqdm
%matplotlib inline
%load_ext autotime
%load_ext line_profiler
In [ ]:
# Kaggle Kernel Data Preparation & Own Implementations
def plot_data(test, pred, sample, title, width=40, height=10, linewidth=0.5, color1='white', color2='orange'):
""" Plotting method. """
fig = plt.figure(figsize=(width, height))
plt.plot(pred[:sample], color=color1, zorder=4, linewidth=linewidth, label='%s Prediction'%(title))
plt.plot(test[:sample], color=color2, zorder=3, linewidth=linewidth, label='%s True Data'%(title))
    plt.title(title)
plt.legend()
# Frequency count
def get_frequency(data):
    # Gets the frequency of each value of 'data'. Pass in a pandas Series.
vals = pd.merge(data.to_frame(), data.value_counts().reset_index(),
how='left', left_on=data.to_frame().columns[0], right_on='index').iloc[:, -1:].values
return vals
def time_data(data):
data['transactiondate'] = pd.to_datetime(data['transactiondate'])
data['day_of_week'] = data['transactiondate'].dt.dayofweek
data['month_of_year'] = data['transactiondate'].dt.month
data['quarter'] = data['transactiondate'].dt.quarter
data['is_weekend'] = (data['day_of_week'] < 5).astype(int)
data.drop('transactiondate', axis=1, inplace=True)
print('Added time data')
print('........')
return data
def column_excluder(data, missing_perc_thresh=0.98):
# Quick clean from https://www.kaggle.com/seesee/concise-catboost-starter-ensemble-plb-0-06435
exclude_missing = []
exclude_unique = []
num_rows = data.shape[0]
for c in data.columns:
num_missing = data[c].isnull().sum()
        # Do not skip columns with zero missing values: the constant-column check below must still run for them.
missing_frac = num_missing / float(num_rows)
if missing_frac > missing_perc_thresh:
exclude_missing.append(c)
num_uniques = len(data[c].unique())
if data[c].isnull().sum() != 0:
num_uniques -= 1
if num_uniques == 1:
exclude_unique.append(c)
to_exclude = list(set(exclude_missing + exclude_unique))
print('Excluded columns:')
print(to_exclude)
print('........')
return to_exclude
def categorical_features(data):
# Quick categories from https://www.kaggle.com/seesee/concise-catboost-starter-ensemble-plb-0-06435
cat_feature_inds = []
cat_unique_thresh = 1000
for i, c in enumerate(data.columns):
num_uniques = len(data[c].unique())
if num_uniques < cat_unique_thresh \
and not 'sqft' in c \
and not 'cnt' in c \
and not 'nbr' in c \
and not 'number' in c:
cat_feature_inds.append(i)
print("Categorical features:")
print([data.columns[ind] for ind in cat_feature_inds])
print('........')
return cat_feature_inds
def complex_features(data):
# Gets counts, label encoding and frequency estimates.
    # Frequency of occurrences | length of codes | check if '*' is present
data['propertyzoningdesc_frq'] = get_frequency(data['propertyzoningdesc'])
data['propertyzoningdesc_len'] = data['propertyzoningdesc'].apply(lambda x: len(x) if pd.notnull(x) else x)
#transactions_shuffled['propertyzoningdesc_str'] = transactions_shuffled['propertyzoningdesc'].apply(lambda x: (1 if '*' in str(x) else 0) if pd.notnull(x) else x)
# Label encoding | length of code
#transactions_shuffled['propertycountylandusecode_enc'] = transactions_shuffled[['propertycountylandusecode']].astype(str).apply(LabelEncoder().fit_transform)
#transactions_shuffled['propertycountylandusecode_len'] = transactions_shuffled['propertycountylandusecode'].apply(lambda x: x if pd.isnull(x) else len(x))
# Zip code area extraction
data['regionidzip_ab'] = data['regionidzip'].apply(lambda x: x if pd.isnull(x) else str(x)[:2]).astype(float)
data['regionidzip_abc'] = data['regionidzip'].apply(lambda x: x if pd.isnull(x) else str(x)[:3]).astype(float)
# Region neighbourhood area extraction
data['regionidneighborhood_ab'] = data['regionidneighborhood'].apply(lambda x: str(x)[:2] if pd.notnull(x) else x).astype(float)
# Rawcensustractandblock transformed
data['code_fips_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[:4]))
data['code_tract_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[4:11]))
data['code_block_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[11:]))
data.drop('rawcensustractandblock', axis=1, inplace=True)
# Encode string values
data[['propertycountylandusecode', 'propertyzoningdesc']] = data[['propertycountylandusecode', 'propertyzoningdesc']].astype(str).apply(LabelEncoder().fit_transform)
print('Generating complex features')
print('........')
return data
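In [ ]:
# Quick sanity check of the helpers above on a tiny hypothetical frame (toy values, not the Zillow
# files), just to illustrate the expected inputs and outputs of get_frequency() and time_data().
_toy = pd.DataFrame({
    'transactiondate': ['2016-01-02', '2016-01-09', '2016-06-15'],
    'regionidzip': [96987.0, 96987.0, 97319.0],
})
print(get_frequency(_toy['regionidzip']))  # per-row frequency of each zip value
print(time_data(_toy.copy()))              # adds day_of_week / month_of_year / quarter / is_weekend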
In [ ]:
models = {}
In [ ]:
# Kaggle Kernel Data Preparation
print('Loading data...')
# Load raw data
properties2016_raw = pd.read_csv('../Data/properties_2016.csv', low_memory = False)
properties2017 = pd.read_csv('../Data/properties_2017.csv', low_memory = False)
train2016 = pd.read_csv('../Data/train_2016_v2.csv')
train2017 = pd.read_csv('../Data/train_2017.csv')
sample_submission = pd.read_csv('../Data/sample_submission.csv', low_memory = False)
# Create a new version of 2016 properties data that takes all non-tax variables from 2017
taxvars = ['structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt', 'taxvaluedollarcnt', 'taxamount']
tax2016 = properties2016_raw[['parcelid']+taxvars]
properties2016 = properties2017.drop(taxvars, axis=1).merge(tax2016,
    how='left', on='parcelid').reindex(columns=properties2017.columns)
# Create a training data set
train2016 = pd.merge(train2016, properties2016, how = 'left', on = 'parcelid')
train2017 = pd.merge(train2017, properties2017, how = 'left', on = 'parcelid')
train = pd.concat([train2016, train2017], axis = 0)
# Create separate test data sets for 2016 and 2017
test2016 = pd.merge(sample_submission[['ParcelId']], properties2016.rename(columns = {'parcelid': 'ParcelId'}),
how = 'left', on = 'ParcelId')
test2017 = pd.merge(sample_submission[['ParcelId']], properties2017.rename(columns = {'parcelid': 'ParcelId'}),
how = 'left', on = 'ParcelId')
del properties2016, properties2017, train2016, train2017
gc.collect();
print('Memory usage reduction...')
train[['latitude', 'longitude']] /= 1e6
train['censustractandblock'] /= 1e12
def preptest(test):
    test[['latitude', 'longitude']] /= 1e6
    test['censustractandblock'] /= 1e12
for column in test.columns:
if test[column].dtype == int:
test[column] = test[column].astype(np.int32)
if test[column].dtype == float:
test[column] = test[column].astype(np.float32)
preptest(test2016)
preptest(test2017)
print('Feature engineering...')
train['month'] = (pd.to_datetime(train['transactiondate']).dt.year - 2016)*12 + pd.to_datetime(train['transactiondate']).dt.month
train = train.drop('transactiondate', axis = 1)
from sklearn.preprocessing import LabelEncoder
non_number_columns = train.dtypes[train.dtypes == object].index.values
for column in non_number_columns:
train_test = pd.concat([train[column], test2016[column], test2017[column]], axis = 0)
encoder = LabelEncoder().fit(train_test.astype(str))
train[column] = encoder.transform(train[column].astype(str)).astype(np.int32)
test2016[column] = encoder.transform(test2016[column].astype(str)).astype(np.int32)
test2017[column] = encoder.transform(test2017[column].astype(str)).astype(np.int32)
feature_names = [feature for feature in train.columns[2:] if feature != 'month']
month_avgs = train.groupby('month').agg('mean')['logerror'].values - train['logerror'].mean()
X_train_all = train[feature_names].fillna(-1)
y_train_all = train['logerror'].fillna(-1)
train = train[np.abs(train['logerror']) < 0.4]
print('Preparing arrays and throwing out outliers...')
X_train = train[feature_names].fillna(-1)#.values
y_train = train['logerror'].fillna(-1)#.values
X_test2016 = test2016[feature_names].fillna(-1)#.values
X_test2017 = test2017[feature_names].fillna(-1)#.values
del test2016, test2017;
gc.collect();
month_values = train['month'].values
month_avg_values = np.array([month_avgs[month - 1] for month in month_values]).reshape(-1, 1)
X_train = np.hstack([X_train, month_avg_values])
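In [ ]:
# Hedged note: the last column hstack-ed onto X_train above is the per-month mean logerror offset
# (month_avgs). A test matrix built the same way would append that constant column for the month
# being predicted, e.g. for a month that appears in the training data:
example_month = 10  # hypothetical choice: October 2016
month_col = np.full((X_test2016.shape[0], 1), month_avgs[example_month - 1])
X_test2016_oct = np.hstack([X_test2016.values, month_col])
print(X_test2016_oct.shape)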
In [ ]:
# OLS
model_lr = LinearRegression()
#model_lr.fit(x_train_data, x_train_label)
#y_pred_lr_valid = model_lr.predict(x_valid)
#y_pred_lr_train = model_lr.predict(x_train_data)
models['LinearRegression'] = model_lr
# Make predictions on both test and validation with OLS and BR
#predicted_mae_lr_valid = mean_absolute_error(y_valid, y_pred_lr_valid)
#predicted_mae_lr_train = mean_absolute_error(x_train_label, y_pred_lr_train)
#print('OLS MAE LR Valid:', predicted_mae_lr_valid, 'Train:', predicted_mae_lr_train)
scores = cross_val_score(model_lr, X_train_all, y_train_all, cv=5, scoring='neg_mean_absolute_error', verbose=1)
print("%s MAE: %0.5f (+/- %0.5f)" % (model_lr.__class__.__name__, -scores.mean(), scores.std() * 2))
#del y_pred_lr_valid
#del y_pred_lr_train
In [ ]:
# BayesianRidge Regression
model_br = BayesianRidge(compute_score=True)
#model_br.fit(x_train, y_train)
#y_pred_br_valid = model_br.predict(x_valid)
#y_pred_br_train = model_br.predict(x_train_data)
models['BayesianRidge'] = model_br
#predicted_mae_br_valid = mean_absolute_error(y_valid, y_pred_br_valid)
#predicted_mae_br_train = mean_absolute_error(x_train_label, y_pred_br_train)
#print('BR MAE BayesianRidge Valid: %s \nTrain: %s' % (predicted_mae_br_valid, predicted_mae_br_train))
scores = cross_val_score(model_br, X_train_all, y_train_all, cv=5, scoring='neg_mean_absolute_error', verbose=1)
print("%s MAE: %0.5f (+/- %0.5f)" % (model_br.__class__.__name__, -scores.mean(), scores.std() * 2))
#del y_pred_br_valid
#del y_pred_br_train
In [ ]:
from sklearn.ensemble import RandomForestRegressor
model_rf = RandomForestRegressor(n_jobs=1, random_state=2016, verbose=1, n_estimators=500, max_features=12)
#model_rf.fit(x_train, y_train)
#y_pred_rf_valid = model_rf.predict(x_valid)
#y_pred_rf_train = model_rf.predict(x_train_data)
models['RandomForest'] = model_rf
#predicted_mae_rf_valid = mean_absolute_error(y_valid, y_pred_rf_valid)
#predicted_mae_rf_train = mean_absolute_error(x_train_label, y_pred_rf_train)
#print('BR MAE RandomForest Valid: %s \nTrain: %s' % (predicted_mae_rf_valid, predicted_mae_rf_train))
scores = cross_val_score(model_rf, X_train_all, y_train_all, cv=5, scoring='neg_mean_absolute_error', verbose=1)
print("%s MAE: %0.5f (+/- %0.5f)" % (model_rf.__class__.__name__, -scores.mean(), scores.std() * 2))
#del y_pred_rf_train
#del y_pred_rf_valid
In [ ]:
from sklearn.ensemble import ExtraTreesRegressor
model_et = ExtraTreesRegressor(
n_jobs=1, random_state=2016, verbose=1,
n_estimators=500, max_features=12)
#model_et.fit(x_train, y_train)
#y_pred_et_valid = model_et.predict(x_valid)
#y_pred_et_train = model_et.predict(x_train_data)
models['ExtraTrees'] = model_et
#predicted_mae_et_valid = mean_absolute_error(y_valid, y_pred_et_valid)
#predicted_mae_et_train = mean_absolute_error(x_train_label, y_pred_et_train)
#print('BR MAE ExtraTrees Valid: %s \nTrain: %s' % (predicted_mae_et_valid, predicted_mae_et_train))
#scores = cross_validation.cross_val_score(model_et, x_train, y_train, cv=5, scoring='neg_mean_absolute_error', verbose=1)
#print("%s MAE: %0.5f (+/- %0.5f)" % (model_et.__class__.__name__, scores.mean(), scores.std() * 2))
#del y_pred_et_valid
#del y_pred_et_train
In [ ]:
from sklearn.ensemble import AdaBoostRegressor
model_ab = AdaBoostRegressor()
#model_ab.fit(x_train, y_train)
#y_pred_ab_valid = model_ab.predict(x_valid)
#y_pred_ab_train = model_ab.predict(x_train_data)
models['AdaBoost'] = model_ab
#predicted_mae_ab_valid = mean_absolute_error(y_valid, y_pred_ab_valid)
#predicted_mae_ab_train = mean_absolute_error(x_train_label, y_pred_ab_train)
#print('BR MAE AdaBoost Valid: %s \nTrain: %s' % (predicted_mae_ab_valid, predicted_mae_ab_train))
#scores = cross_validation.cross_val_score(model_ab, x_train, y_train, cv=5, scoring='neg_mean_absolute_error', verbose=1)
#print("%s MAE: %0.5f (+/- %0.5f)" % (model_ab.__class__.__name__, scores.mean(), scores.std() * 2))
#del y_pred_ab_valid
#del y_pred_ab_train
In [ ]:
def cat_booster(x_train, y_train, x_valid, y_valid, cat_index, loss='MAE'):
# Cat booster train and predict
num_ensembles = 5
y_pred_valid = 0.0
y_pred_train = 0.0
print('Initialising CAT Boost Regression')
for i in tqdm(range(num_ensembles)):
print('Building ensemble', i)
# Use CV, tune hyperparameters
catb = CatBoostRegressor(
iterations=630, learning_rate=0.03,
depth=6, l2_leaf_reg=3,
loss_function=loss,
eval_metric='MAE',
random_seed=i)
catb.fit(x_train, y_train, cat_features=cat_index)
y_pred_valid += catb.predict(x_valid)
y_pred_train += catb.predict(x_train)
y_pred_valid /= num_ensembles
y_pred_train /= num_ensembles
print('Train MAE:', mean_absolute_error(y_train, y_pred_train))
print('Valid MAE:', mean_absolute_error(y_valid, y_pred_valid))
return catb, y_pred_valid
In [ ]:
# Left over from an earlier pipeline: the call below expects a separate validation split
# (x_train/y_train vs. x_train_data/x_train_label) and cat_index from categorical_features(),
# none of which are defined in this notebook, so it is kept only as a usage reference.
#model_cb, preds = cat_booster(x_train, y_train, x_train_data, x_train_label, cat_index)
#print('CatBoost MAE Valid: %s' % (mean_absolute_error(x_train_label, preds)))
In [ ]:
model_cb = CatBoostRegressor(
iterations=630, learning_rate=0.03,
depth=6, l2_leaf_reg=3,
loss_function='MAE',
eval_metric='MAE')
models['CatBoost'] = model_cb
#scores = cross_validation.cross_val_score(model_cb, x_train, y_train, cv=5, scoring='neg_mean_absolute_error', verbose=1)
#print("%s MAE: %0.5f (+/- %0.5f)" % (model_cb.__class__.__name__, scores.mean(), scores.std() * 2))
#del preds
In [ ]:
from sklearn.ensemble import GradientBoostingRegressor
model_gb = GradientBoostingRegressor(
random_state=2016, verbose=1,
n_estimators=500, max_features=12, max_depth=8,
learning_rate=0.05, subsample=0.8)
#model_gb.fit(x_train, y_train)
#y_pred_gb_valid = model_gb.predict(x_valid)
#y_pred_gb_train = model_gb.predict(x_train_data)
models['GradientBoosting'] = model_gb
#predicted_mae_gb_valid = mean_absolute_error(y_valid, y_pred_gb_valid)
#predicted_mae_gb_train = mean_absolute_error(x_train_label, y_pred_gb_train)
#print('BR MAE GradientBoosting Valid: %s \nTrain: %s' % (predicted_mae_gb_valid, predicted_mae_gb_train))
#scores = cross_validation.cross_val_score(model_gb, x_train, y_train, cv=5, scoring='neg_mean_absolute_error', verbose=1)
#print("%s MAE: %0.5f (+/- %0.5f)" % (model_gb.__class__.__name__, scores.mean(), scores.std() * 2))
#del y_pred_gb_valid
#del y_pred_gb_train
In [ ]:
params_xgb = {
    'max_depth': 5,           # Maximum tree depth; typical values are 3-10
    'subsample': 1,           # Fraction of observations sampled for each tree
    'min_child_weight': 10,   # Minimum sum of instance weight in a child; larger values curb overfitting
    'objective': 'reg:linear',
    'n_estimators': 1000,     # Not used by xg.train (rounds come from num_boost_round); kept for the sklearn-style API
    'eta': 0.1,               # Shrinkage / learning rate; typically 0.1-0.2 (XGBoost default 0.3)
    'eval_metric': 'mae'
}
d_train = xg.DMatrix(X_train, label=y_train, missing=-1)
#d_valid = xg.DMatrix(x_valid, label=y_valid, missing=-1)
xgb_gs = xg.train(params_xgb, d_train, num_boost_round=250, verbose_eval=50)
#models['XGB'] = xgb_gs
#del d_train
#del d_valid
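In [ ]:
# Optional, hedged sketch: XGBoost's built-in cross-validation over the params above, to sanity-check
# the fixed 250 boosting rounds (5 folds and 25-round early stopping are arbitrary choices here).
cv_results_xgb = xg.cv(params_xgb, d_train, num_boost_round=500, nfold=5,
                       metrics='mae', early_stopping_rounds=25, seed=2016, verbose_eval=50)
print('Best number of rounds:', len(cv_results_xgb))
print('CV MAE at best round:', cv_results_xgb['test-mae-mean'].iloc[-1])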
In [ ]:
def light_gbm_folds(x_train, x_valid, y_train, y_valid, params, num_ensembles):
    # LightGBM: average the predictions of num_ensembles models trained with different seeds
y_pred_valid = 0.0
y_pred_train = 0.0
d_train = lgb.Dataset(x_train, label=y_train)
print('Initialising Light GBM')
for i in tqdm(range(num_ensembles)):
# Use CV, tune hyperparameters
params['seed'] = i
model_lgb = lgb.train(params, d_train, 430)
        y_pred_valid += model_lgb.predict(x_valid)
        y_pred_train += model_lgb.predict(x_train)
    y_pred_valid /= num_ensembles
    y_pred_train /= num_ensembles
    print('Train MAE:', mean_absolute_error(y_train, y_pred_train))
    print('Valid MAE:', mean_absolute_error(y_valid, y_pred_valid))
return model_lgb
In [ ]:
import random
import lightgbm as lgb
params_lg={
'max_bin' : 10,
'learning_rate' : 0.0021, # shrinkage_rate
'boosting_type' : 'gbdt',
'objective' : 'regression',
'metric' : 'mae',
'sub_feature' : 0.345 ,
'bagging_fraction' : 0.85,
'bagging_freq' : 40,
'num_leaves' : 512, # num_leaf
'min_data' : 500, # min_data_in_leaf
'min_hessian' : 0.05, # min_sum_hessian_in_leaf
'verbose' : 1
}
d_train = lgb.Dataset(X_train, label=y_train)
model_lgb = lgb.train(params_lg, d_train, 430)
#model_lgb = light_gbm_folds(x_train, x_train_data, y_train, x_train_label, params_lg, num_ensembles=5)
models['LightGBM'] = model_lgb
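In [ ]:
# Optional, hedged sketch: LightGBM's built-in CV on params_lg to sanity-check the fixed 430
# boosting rounds (5 folds; stratified=False because this is a regression problem; a fresh
# Dataset is built so the one consumed by lgb.train above is left untouched).
d_train_cv = lgb.Dataset(X_train, label=y_train)
cv_results_lgb = lgb.cv(params_lg, d_train_cv, num_boost_round=430, nfold=5, stratified=False, seed=2016)
for metric_name, values in cv_results_lgb.items():
    print(metric_name, 'after', len(values), 'rounds:', values[-1])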
In [ ]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.noise import GaussianDropout
from keras.optimizers import Adam
from sklearn.impute import SimpleImputer
def larger_model():
# create model
model = Sequential()
model.add(Dense(size, input_dim=size, kernel_initializer='normal', activation='relu'))
model.add(Dense(size*2, kernel_initializer='normal', activation='relu'))
model.add(Dense(size, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
# Compile model
model.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
return model
# define wider model
def wider_model():
# create model
model = Sequential()
model.add(Dense(size*2, input_dim=size, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
# Compile model
model.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
return model
# define base model
def baseline_model():
# create model
model = Sequential()
model.add(Dense(size, input_dim=size, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
# Compile model
model.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
return model
def prebuilt_nn():
nn = Sequential()
nn.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = size))
nn.add(PReLU())
nn.add(Dropout(.4))
nn.add(Dense(units = 160 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(units = 64 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.5))
nn.add(Dense(units = 26, kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(1, kernel_initializer='normal'))
nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
return nn
In [ ]:
## Preprocessing
print("Preprocessing neural network data...")
imputer = SimpleImputer()
imputer.fit(X_train)
x_train_nn = imputer.transform(X_train)
#imputer.fit(x_valid.iloc[:, :])
#x_valid_nn = imputer.transform(x_valid.iloc[:, :])
sc = StandardScaler()
x_train_nn = sc.fit_transform(x_train_nn)
#x_valid_nn = sc.transform(x_valid_nn)
In [ ]:
# fix random seed for reproducibility
seed = 7
size = x_train_nn.shape[1]
# Prebuilt Kaggle kernel network
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=prebuilt_nn, epochs=5, batch_size=50, verbose=0)))
pipeline = Pipeline(estimators)
pipeline.fit(x_train_nn, y_train)
models['DNN'] = pipeline
#print(mean_absolute_error(y_valid, pipeline.predict(x_valid_nn)))
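In [ ]:
# Hedged sketch: how the unused builders above (baseline_model / wider_model / larger_model) could be
# compared against the prebuilt network with plain scikit-learn cross-validation. The 3 folds and
# 5 epochs are arbitrary choices for illustration, not tuned values.
estimator_nn = KerasRegressor(build_fn=baseline_model, epochs=5, batch_size=50, verbose=0)
kfold_nn = KFold(n_splits=3, shuffle=True, random_state=seed)
cv_mae_nn = cross_val_score(estimator_nn, x_train_nn, np.asarray(y_train), cv=kfold_nn, scoring='neg_mean_absolute_error')
print('baseline_model MAE: %0.5f (+/- %0.5f)' % (-cv_mae_nn.mean(), cv_mae_nn.std() * 2))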
In [ ]:
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
#x_train = x_train.values
#x_valid = x_valid.values
# reshape input to be 3D [samples, timesteps, features]
x_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
#x_valid_lstm = x_valid.values.reshape((x_valid.shape[0], 1, x_valid.shape[1]))
# design network
lstm = Sequential()
lstm.add(LSTM(50, input_shape=(x_train_lstm.shape[1], x_train_lstm.shape[2])))
lstm.add(PReLU())
lstm.add(Dropout(.2))
lstm.add(Dense(units = 100 , kernel_initializer = 'normal'))
lstm.add(PReLU())
lstm.add(Dropout(.2))
lstm.add(Dense(units = 50 , kernel_initializer = 'normal'))
lstm.add(PReLU())
lstm.add(Dense(1))
lstm.compile(loss='mae', optimizer='adam')
# fit network
#validation_data=(x_valid_lstm, y_valid)
lstm.fit(x_train_lstm, y_train, epochs=15, batch_size=50, verbose=1, shuffle=False)
# make a prediction
#yhat = lstm.predict(x_valid_lstm)
models['LSTM'] = lstm
#mae = mean_absolute_error(y_valid, yhat)
#print('Test MAE: %.3f' % mae)
In [ ]:
# https://github.com/dnc1994/Kaggle-Playground/blob/master/home-depot/ensemble.py
import time
from sklearn.metrics import mean_absolute_error, make_scorer
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, \
ExtraTreesRegressor, AdaBoostClassifier
def mean_absolute_error_(ground_truth, predictions):
return mean_absolute_error(ground_truth, predictions)
MAE = make_scorer(mean_absolute_error_, greater_is_better=False)
params_xgb = {
    'max_depth': 5,           # Maximum tree depth; typical values are 3-10
    'subsample': 1,           # Fraction of observations sampled for each tree
    'min_child_weight': 10,   # Minimum sum of instance weight in a child; larger values curb overfitting
    'objective': 'reg:linear',
    'n_estimators': 1000,     # Not used by xg.train (rounds come from num_boost_round); kept for the sklearn-style API
    'eta': 0.1,               # Shrinkage / learning rate; typically 0.1-0.2 (XGBoost default 0.3)
    'eval_metric': 'mae'
}
class Ensemble(object):
def __init__(self, n_folds, base_models, floor_models, final_model, include_features, cvgrid, stacker=None):
self.n_folds = n_folds
self.stacker = stacker
self.final_model = final_model
self.base_models = base_models
self.floor_models = floor_models
self.features = include_features
self.param_grid = cvgrid
def fit(self, X, y):
X = np.array(X)
y = np.array(y)
        folds = list(KFold(n_splits=self.n_folds, shuffle=True, random_state=2016).split(X))
S_train = np.zeros((X.shape[0], len(self.base_models)))
start_time = time.time()
for i, c in enumerate(self.base_models):
print('Fitting For Base Model {} ---'.format(c))
clf = self.base_models[c]
for j, (train_idx, test_idx) in enumerate(folds):
                print('--- Fitting For Fold %d / %d ---' % (j + 1, self.n_folds))
X_train = X[train_idx]
y_train = y[train_idx]
X_holdout = X[test_idx]
if c not in ['XGB', 'LightGBM', 'LSTM']:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_holdout)[:]
S_train[test_idx, i] = y_pred
elif c in ['LSTM']:
x_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
clf.fit(x_train_lstm, y_train, epochs=15, batch_size=50, verbose=1, shuffle=False)
y_pred = clf.predict(X_holdout.reshape((X_holdout.shape[0], 1, X_holdout.shape[1])))[:]
S_train[test_idx, i] = [i[0] for i in y_pred]
else:
d_train = xg.DMatrix(X_train, label=y_train, missing=-1)
d_valid = xg.DMatrix(X_holdout, missing=-1)
clf = xg.train(params_xgb, d_train)
y_pred = clf.predict(d_valid)[:]
S_train[test_idx, i] = y_pred
print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))
print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))
print('--- Base Models Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))
if self.features:
S_train = np.append(X, S_train, 1)
#if self.stacker is None:
# self.base_models = self.floor_models
# self.stacker = self.final_model
# self.fit(S_train, y)
d_train = xg.DMatrix(S_train, label=y, missing=-1)
#d_valid = xg.DMatrix(x_valid, label=y_valid, missing=-1)
xgb_gs = xg.train(params_xgb, d_train, num_boost_round=250, verbose_eval=50)
#else:
# grid = grid_search.GridSearchCV(estimator=self.stacker, param_grid=self.param_grid, n_jobs=1, cv=5, verbose=20, scoring=MAE)
# grid.fit(S_train, y)
#try:
# print('Best Params:')
# print(grid.best_params_)
# print('Best CV Score:')
# print(-grid.best_score_)
# print('Best estimator:')
# print(grid.best_estimator_)
#except:
# pass
self.stacker = xgb_gs#grid.best_estimator_
print('--- Stacker Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))
def predict(self, X):
X = np.array(X)
        folds = list(KFold(n_splits=self.n_folds, shuffle=True, random_state=2016).split(X))
if self.features:
S_test = np.append(X, np.zeros((X.shape[0], len(self.base_models))), 1)
print('Using features of shape', S_test.shape)
else:
S_test = np.zeros((X.shape[0], len(self.base_models)))
print('Using features of shape', S_test.shape)
for ind, c in enumerate(self.base_models):
clf = self.base_models[c]
# Uses all features.
if self.features:
i = X.shape[1] + ind
else:
i = ind
S_test_i = np.zeros((X.shape[0], len(folds)))
print('--- Predicting For #{}'.format(c))
# Makes predictions for each model
for j, (train_idx, test_idx) in enumerate(folds):
if c not in ['XGB', 'LSTM']:
S_test_i[:, j] = clf.predict(X)[:]
elif c in ['LSTM']:
                    S_test_i[:, j] = clf.predict(X.reshape((X.shape[0], 1, X.shape[1]))).ravel()
                else:
                    # XGBoost boosters need a DMatrix rather than a raw array
                    S_test_i[:, j] = clf.predict(xg.DMatrix(X, missing=-1))[:]
S_test[:, i] = S_test_i.mean(1)
clf = self.stacker
try:
y_pred = clf.predict(S_test)[:]
except:
y_pred = clf.predict(xg.DMatrix(S_test))
return y_pred
def fit_predict(self, X, y, T):
X = np.array(X)
y = np.array(y)
T = np.array(T)
start_time = time.time()
        folds = list(KFold(n_splits=self.n_folds, shuffle=True, random_state=2016).split(X))
S_train = np.zeros((X.shape[0], len(self.base_models)))
S_test = np.zeros((T.shape[0], len(self.base_models)))
for i, c in enumerate(self.base_models):
print('########## \nFitting For Base Model {} \n##########'.format(c))
clf = self.base_models[c]
S_test_i = np.zeros((T.shape[0], len(folds)))
for j, (train_idx, test_idx) in enumerate(folds):
print('--- Fitting For Fold #{0} / {1} ---'.format(j+1, self.n_folds))
X_train = X[train_idx]
y_train = y[train_idx]
X_holdout = X[test_idx]
if c not in ['XGB', 'LightGBM', 'LSTM']:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_holdout)[:]
S_train[test_idx, i] = y_pred
S_test_i[:, j] = clf.predict(T)[:]
elif c in ['LSTM']:
x_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
clf.fit(x_train_lstm, y_train, epochs=15, batch_size=50, verbose=1, shuffle=False)
y_pred = clf.predict(X_holdout.reshape((X_holdout.shape[0], 1, X_holdout.shape[1])))[:]
S_train[test_idx, i] = [i[0] for i in y_pred]
                    S_test_i[:, j] = clf.predict(T.reshape((T.shape[0], 1, T.shape[1]))).ravel()
else:
d_train = xg.DMatrix(X_train, label=y_train, missing=-1)
d_valid = xg.DMatrix(X_holdout, missing=-1)
clf = xg.train(params_xgb, d_train)
y_pred = clf.predict(d_valid)[:]
S_train[test_idx, i] = y_pred
data_pred = xg.DMatrix(T, missing=-1)
S_test_i[:, j] = clf.predict(data_pred)[:]
print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))
S_test[:, i] = S_test_i.mean(1)
print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))
print('--- Base Models Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))
param_grid = {'n_estimators': [100],
'learning_rate': [0.05],
'subsample': [0.75]}
        grid = GridSearchCV(estimator=self.stacker, param_grid=param_grid, n_jobs=1, cv=5, verbose=20, scoring=MAE)
        grid.fit(S_train, y)
try:
print('Param grid:')
print(param_grid)
print('Best Params:')
print(grid.best_params_)
print('Best CV Score:')
print(-grid.best_score_)
print('Best estimator:')
print(grid.best_estimator_)
except:
pass
print('--- Stacker Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))
y_pred = grid.predict(S_test)[:]
return y_pred
In [ ]:
#del models['CatBoost']
#del models['XGB']
In [ ]:
param_grid = {'n_estimators': [50, 100, 150],
'learning_rate': [0.03, 0.05, 0.07],
'subsample': [0.5, 0.75, 1]
}
stackers = {'GBM' : GradientBoostingRegressor(),}
ensemble = Ensemble(n_folds=5,
base_models=models,
floor_models=stackers,
final_model=models['DNN'],
include_features=False,
cvgrid=param_grid)
#model_ensemble = ensemble.fit_predict(x_train[:100], y_train[:100], x_valid)
# MAE 0.0653212760898 - lr 0.03, nest = 50, subsample: 0.5
ensemble.fit(X_train, y_train)
#final_prediction = ensemble.predict(X_test2016)
#print('MAE', mean_absolute_error(y_valid, final_prediction))
#del final_prediction
In [ ]:
def k_fold_cross_validation(X, K, randomise = False):
"""Generates K (training, validation) pairs from the items in X."""
if randomise: from random import shuffle; X=list(X); shuffle(X)
for k in range(K):
training = [x for i, x in enumerate(X) if i % K != k]
validation = [x for i, x in enumerate(X) if i % K == k]
yield training, validation
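In [ ]:
# Tiny illustration of the fold layout produced by k_fold_cross_validation: item i is assigned to
# the validation set of fold i % K (toy list, K=5).
for fold, (tr, va) in enumerate(k_fold_cross_validation(list(range(10)), K=5)):
    print('fold', fold, 'validation items:', va)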
In [ ]:
# Per-fold MAE of the stacked ensemble on the training data. Fold k of k_fold_cross_validation
# holds the items at positions i with i % K == k, so the matching labels are y_train_values[k::K]
# rather than a contiguous slice.
y_train_values = np.asarray(y_train)
for k, (training, validation) in enumerate(k_fold_cross_validation(X_train, K=5)):
    pred_k = ensemble.predict(validation)
    print('MAE', mean_absolute_error(y_train_values[k::5], pred_k))
In [ ]:
########## LAYER 1 ##########
# Submodel 1 : OLS # Ordinary least squares estimator Sklearn implementation
# Submodel 2 : BR # Bayesian ridge regression - Sklearn implementation
# Submodel 3 : DNN # Dense Neural Network - Keras - Dense layers
# Submodel 4 : LightGBM # Light Gradient Boosting - https://github.com/Microsoft/LightGBM
# Submodel 5 : XGBoost # Extreme Gradient Boosting - http://xgboost.readthedocs.io/en/latest/model.html
# Submodel 6 : CatBoost # Categorical Boosting https://github.com/catboost/catboost
# Submodel 7 : LSTM # Long Short Term Memory Neural Network - Keras implementation
# Submodel 8 : RandomForestRegressor # Sklearn implementation
# Submodel 9 : ExtraTreesRegressor # Sklearn implementation
# Submodel 10 : SVR # Support vector machines for regression - Sklearn implementation
# Submodel 11 : AdaBoost # Adaptive Boosting Sklearn Implementation
########## LAYER 2 ##########
# https://www.kaggle.com/dragost/boosted-trees-lb-0-0643707/edit
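In [ ]:
# Miniature, hedged illustration of the two-layer scheme listed above, on random toy data with
# scikit-learn models only: layer-1 models produce out-of-fold predictions, which then become the
# feature matrix for the layer-2 stacker (the Ensemble class above does the same thing at full scale).
_rng = np.random.RandomState(0)
_X_toy = _rng.rand(200, 5)
_y_toy = 0.3 * _X_toy[:, 0] + 0.1 * _rng.rand(200)
_layer1 = [LinearRegression(), RandomForestRegressor(n_estimators=20, random_state=0)]
_oof = np.zeros((_X_toy.shape[0], len(_layer1)))
_kf = KFold(n_splits=3, shuffle=True, random_state=0)
for _col, _mdl in enumerate(_layer1):
    for _tr_idx, _va_idx in _kf.split(_X_toy):
        _mdl.fit(_X_toy[_tr_idx], _y_toy[_tr_idx])
        _oof[_va_idx, _col] = _mdl.predict(_X_toy[_va_idx])
_stacker = GradientBoostingRegressor(n_estimators=50, random_state=0).fit(_oof, _y_toy)
print('Toy stacked in-sample MAE:', mean_absolute_error(_y_toy, _stacker.predict(_oof)))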
In [ ]:
# Predict on the pre-built 2016 and 2017 test frames
In [ ]:
test_dates = {
'201610': pd.Timestamp('2016-09-30'),
'201611': pd.Timestamp('2016-10-31'),
'201612': pd.Timestamp('2016-11-30'),
'201710': pd.Timestamp('2017-09-30'),
'201711': pd.Timestamp('2017-10-31'),
'201712': pd.Timestamp('2017-11-30')
}
for d in test_dates.keys():
    # Note: the ensemble was fit on X_train whose last column is the month-average logerror offset;
    # appending the raw month number keeps the column count consistent with training, though not the
    # semantics (see the month-average sketch after the data-preparation cell for the alternative).
    if '2016' in d:
        print('Predicting for 2016')
        X_test2016['month'] = int(d[-2:])
        sample_submission[d] = ensemble.predict(X_test2016)
    elif '2017' in d:
        print('Predicting for 2017')
        X_test2017['month'] = int(d[-2:]) + 12
        sample_submission[d] = ensemble.predict(X_test2017)
In [ ]:
#train['month'] = (pd.to_datetime(train['transactiondate']).dt.year - 2016)*12 + pd.to_datetime(train['transactiondate']).dt.month
X_train.shape
In [ ]:
print('Building properties data')
properties2017 = pd.read_csv('../Data/properties_2017.csv', low_memory = False)
sample_prediction = pd.merge(sample_submission['ParcelId'].to_frame(), properties2017, how='left', left_on=['ParcelId'], right_on=['parcelid'])
#transactions[['propertycountylandusecode', 'propertyzoningdesc']] = transactions[['propertycountylandusecode', 'propertyzoningdesc']].astype(str).apply(LabelEncoder().fit_transform)
sample_prediction['taxdelinquencyflag'].replace('Y', 1, inplace=True)
sample_prediction.drop(to_drop, axis=1, inplace=True)  # to_drop is assumed to hold the columns flagged by column_excluder()
sample_prediction = complex_features(sample_prediction)
sample_prediction.drop(['parcelid', 'propertyzoningdesc', 'propertycountylandusecode', 'fireplacecnt'], axis=1, inplace=True)
sample_prediction.fillna(sample_prediction.median(), inplace = True)
del properties2017
gc.collect()
In [ ]:
# https://www.kaggle.com/c/zillow-prize-1/discussion/33899, Oct,Nov,Dec
WEIGHT_XGB = 0.4
WEIGHT_CAT = 0.6
test_dates = {
'201610': pd.Timestamp('2016-09-30'),
'201611': pd.Timestamp('2016-10-31'),
'201612': pd.Timestamp('2016-11-30'),
'201710': pd.Timestamp('2017-09-30'),
'201711': pd.Timestamp('2017-10-31'),
'201712': pd.Timestamp('2017-11-30')
}
for m in test_dates.keys():
print('Processing', m)
sample_prediction['transactiondate'] = test_dates[m]
sample_prediction = time_data(sample_prediction)
print('Ensemble Prediction', m)
sample_prediction['ensemble'] = ensemble.predict(sample_prediction[best_columns])
print('XGB - CatBoost Train', m)
predictions_xgb = xgb_gs.predict(xg.DMatrix(sample_prediction[list(best_columns) + ['ensemble']]))
predictions_cat = get_cat_boost_all(sample_train, sample_label, sample_prediction[list(best_columns) + ['ensemble']])
    sample_submission[m] = (WEIGHT_XGB * predictions_xgb) + (WEIGHT_CAT * predictions_cat)
del predictions_xgb, predictions_cat
gc.collect()
#del x_predict
In [ ]:
train2017 = pd.read_csv('../Data/train_2017.csv', parse_dates=['transactiondate'], low_memory=False)
sample_train = pd.merge(train2017, sample_prediction, how='left', left_on='parcelid' ,right_on='ParcelId')
sample_train['ensemble'] = ensemble.predict(sample_train[best_columns])
sample_label = sample_train['logerror']
sample_train = time_data(sample_train)[list(best_columns) + ['ensemble']]
In [ ]:
del train2017
gc.collect()
In [ ]:
params_xgb = {
    'max_depth': 5,           # Maximum tree depth; typical values are 3-10
    'subsample': 1,           # Fraction of observations sampled for each tree
    'min_child_weight': 10,   # Minimum sum of instance weight in a child; larger values curb overfitting
    'objective': 'reg:linear',
    'n_estimators': 1000,     # Not used by xg.train (rounds come from num_boost_round); kept for the sklearn-style API
    'eta': 0.1,               # Shrinkage / learning rate; typically 0.1-0.2 (XGBoost default 0.3)
    'eval_metric': 'mae'
}
d_train = xg.DMatrix(sample_train, label=sample_label)
xgb_gs = xg.train(params_xgb, d_train, num_boost_round=250, verbose_eval=50)
In [ ]:
def get_cat_boost_all(x_train, y_train, x_valid):
num_ensembles = 5
y_pred_valid = 0.0
print('Initialising CAT Boost Regression')
for i in tqdm(range(num_ensembles)):
print('Building ensemble', i)
# Use CV, tune hyperparameters
catb = CatBoostRegressor(
iterations=600, learning_rate=0.03,
depth=5, l2_leaf_reg=3,
loss_function='MAE',
eval_metric='MAE',
random_seed=i)
        catb.fit(x_train, y_train, cat_features=cat_index)  # cat_index is assumed to come from categorical_features()
y_pred_valid += catb.predict(x_valid)
y_pred_valid /= num_ensembles
return y_pred_valid
In [ ]:
sample_submission.to_csv('submission5.csv', index=False)
sample_submission.head()