In [ ]:
%%javascript
$('<div id="toc"></div>').css({position: 'fixed', top: '120px', left: 0}).appendTo(document.body);
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js');

Libraries


In [ ]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import numpy as np

import gc 

from jupyterthemes import jtplot
jtplot.style()

import xgboost as xg
from xgboost import XGBModel
from xgboost import plot_importance
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, ShuffleSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE

from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import r2_score

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

from catboost import CatBoostRegressor
from tqdm import tqdm

%matplotlib inline
%load_ext autotime
%load_ext line_profiler

Processing


In [ ]:
# Kaggle Kernel Data Preparation & Own Implementations

def plot_data(test, pred, sample, title, width=40, height=10, linewidth=0.5, color1='white', color2='orange'):
    """ Plotting method. """
    fig = plt.figure(figsize=(width, height))
    plt.plot(pred[:sample], color=color1, zorder=4, linewidth=linewidth, label='%s Prediction'%(title))
    plt.plot(test[:sample], color=color2, zorder=3, linewidth=linewidth, label='%s True Data'%(title))
    plt.title(title)
    plt.legend()

# Frequency count
def get_frequency(data):
    # Returns the frequency of each value in the given column. Pass in a pandas Series.
    vals = pd.merge(data.to_frame(), data.value_counts().reset_index(), 
                    how='left', left_on=data.to_frame().columns[0], right_on='index').iloc[:, -1:].values
    return vals
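
# Note: an equivalent, simpler frequency encoding (assuming a plain pandas Series and the
# default value_counts behaviour) is data.map(data.value_counts()).values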
  
def time_data(data):
    data['transactiondate'] = pd.to_datetime(data['transactiondate'])
    data['day_of_week']     = data['transactiondate'].dt.dayofweek
    data['month_of_year']   = data['transactiondate'].dt.month
    data['quarter']         = data['transactiondate'].dt.quarter
    data['is_weekend']      = (data['day_of_week'] >= 5).astype(int)
    data.drop('transactiondate', axis=1, inplace=True)
    
    print('Added time data')
    print('........')
    
    return data


def column_excluder(data, missing_perc_thresh=0.98):
    # Quick clean from https://www.kaggle.com/seesee/concise-catboost-starter-ensemble-plb-0-06435
    
    exclude_missing = []
    exclude_unique = []
    num_rows = data.shape[0]
    for c in data.columns:
        num_missing = data[c].isnull().sum()
        if num_missing > 0:
            missing_frac = num_missing / float(num_rows)
            if missing_frac > missing_perc_thresh:
                exclude_missing.append(c)

        # Also exclude constant columns (NaN is not counted as a distinct value)
        num_uniques = len(data[c].unique())
        if num_missing > 0:
            num_uniques -= 1
        if num_uniques == 1:
            exclude_unique.append(c)
            
    to_exclude = list(set(exclude_missing + exclude_unique))
    
    print('Excluded columns:')
    print(to_exclude)
    print('........')
    
    return to_exclude

def categorical_features(data):
    # Quick categories from https://www.kaggle.com/seesee/concise-catboost-starter-ensemble-plb-0-06435
        
    cat_feature_inds = []
    cat_unique_thresh = 1000
    for i, c in enumerate(data.columns):
        num_uniques = len(data[c].unique())
        if num_uniques < cat_unique_thresh \
            and not 'sqft'   in c \
            and not 'cnt'    in c \
            and not 'nbr'    in c \
            and not 'number' in c:
            cat_feature_inds.append(i)

    print("Categorical features:")
    print([data.columns[ind] for ind in cat_feature_inds])
    print('........')
    
    return cat_feature_inds


def complex_features(data):
    # Gets counts, label encoding and frequency estimates.
    
    # Frequency of occurrences | length of codes | check if * is present
    data['propertyzoningdesc_frq'] = get_frequency(data['propertyzoningdesc'])
    data['propertyzoningdesc_len'] = data['propertyzoningdesc'].apply(lambda x: len(x) if pd.notnull(x) else x)
    #transactions_shuffled['propertyzoningdesc_str'] = transactions_shuffled['propertyzoningdesc'].apply(lambda x: (1 if '*' in str(x) else 0) if pd.notnull(x) else x)

    # Label encoding | length of code
    #transactions_shuffled['propertycountylandusecode_enc'] = transactions_shuffled[['propertycountylandusecode']].astype(str).apply(LabelEncoder().fit_transform)
    #transactions_shuffled['propertycountylandusecode_len'] = transactions_shuffled['propertycountylandusecode'].apply(lambda x: x if pd.isnull(x) else len(x))

    # Zip code area extraction
    data['regionidzip_ab']  = data['regionidzip'].apply(lambda x: x if pd.isnull(x) else str(x)[:2]).astype(float)
    data['regionidzip_abc'] = data['regionidzip'].apply(lambda x: x if pd.isnull(x) else str(x)[:3]).astype(float)

    # Region neighbourhood area extraction
    data['regionidneighborhood_ab'] = data['regionidneighborhood'].apply(lambda x: str(x)[:2] if pd.notnull(x) else x).astype(float)

    # Rawcensustractandblock transformed
    data['code_fips_cnt']  = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[:4]))
    data['code_tract_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[4:11]))
    data['code_block_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[11:]))
    data.drop('rawcensustractandblock', axis=1, inplace=True)
    
    # Encode string values
    data[['propertycountylandusecode', 'propertyzoningdesc']] = data[['propertycountylandusecode', 'propertyzoningdesc']].astype(str).apply(LabelEncoder().fit_transform)
    
    print('Generating complex features')
    print('........')
    
    return data

In [ ]:
models = {}

Data Load

In [ ]:
# Kaggle Kernel Data Preparation
seed = 11
np.random.seed(seed)
drop_tax = False

train2016 = pd.read_csv("../Data/train_2016_v2.csv", parse_dates=["transactiondate"], low_memory=False)
train2017 = pd.read_csv('../Data/train_2017.csv', parse_dates=['transactiondate'], low_memory=False)

if drop_tax: # Avoids external bias
    print('Removing tax features from 2017')
    train2017.iloc[:, train2017.columns.str.startswith('tax')] = np.nan

properties2016 = pd.read_csv('../Data/properties_2016.csv', low_memory=False)
properties2017 = pd.read_csv('../Data/properties_2017.csv', low_memory=False)
sample = pd.read_csv('../Data/sample_submission.csv')

transactions2016 = pd.merge(train2016, properties2016, how='left', on=['parcelid']).sample(frac=1)
transactions2017 = pd.merge(train2017, properties2017, how='left', on=['parcelid']).sample(frac=1)
transactions = pd.concat([transactions2016, transactions2017], axis=0)

#transactions[['propertycountylandusecode', 'propertyzoningdesc']] = transactions[['propertycountylandusecode', 'propertyzoningdesc']].astype(str).apply(LabelEncoder().fit_transform)
transactions['taxdelinquencyflag'].replace('Y', 1, inplace=True)

# Clean columns
to_drop = column_excluder(transactions)
transactions.drop(to_drop, axis=1, inplace=True)

# Time data
transactions = time_data(transactions)
transactions = complex_features(transactions)

x_all = transactions.drop(['parcelid', 'propertyzoningdesc', 'propertycountylandusecode', 'fireplacecnt'], axis=1)
y_all = transactions['logerror']

#x_all.drop(['hashottuborspa' 'taxdelinquencyflag' 'fireplaceflag'], axis=1)
#x_all['hashottuborspa'].astype(float, inplace=True)
#x_all.fillna(-1, inplace=True)#.astype(str)#.apply(LabelEncoder().fit_transform)
x_all.fillna(x_all.median(), inplace=True)

ratio = 0.0
x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=ratio)

x_train_label = x_train['logerror'].copy()
x_train_data  = x_train.drop(['logerror'], axis=1).copy()

# Drop outliers
x_train = x_train[(x_train['logerror'] > -0.4) & (x_train['logerror'] < 0.419)]
y_train = x_train['logerror']
x_train.drop('logerror', axis=1, inplace=True)
x_valid.drop('logerror', axis=1, inplace=True)

cat_index = categorical_features(x_train)
best_columns = x_train.columns
y_mean = np.mean(y_train)

del x_all, y_all, transactions, transactions2016, transactions2017, properties2017, properties2016, train2016, train2017
gc.collect()

In [ ]:
non_number_columns

In [ ]:
# Kaggle Kernel Data Preparation

print('Loading data...')
# Load raw data
properties2016_raw = pd.read_csv('../Data/properties_2016.csv', low_memory = False)
properties2017 = pd.read_csv('../Data/properties_2017.csv', low_memory = False)
train2016 = pd.read_csv('../Data/train_2016_v2.csv')
train2017 = pd.read_csv('../Data/train_2017.csv')
sample_submission = pd.read_csv('../Data/sample_submission.csv', low_memory = False)

# Create a new version of 2016 properties data that takes all non-tax variables from 2017
taxvars = ['structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt', 'taxvaluedollarcnt', 'taxamount']
tax2016 = properties2016_raw[['parcelid']+taxvars]
properties2016 = properties2017.drop(taxvars, axis=1).merge(tax2016,
                 how='left', on='parcelid').reindex(properties2017.columns, axis=1)

# Create a training data set
train2016 = pd.merge(train2016, properties2016, how = 'left', on = 'parcelid')
train2017 = pd.merge(train2017, properties2017, how = 'left', on = 'parcelid')
train = pd.concat([train2016, train2017], axis = 0)

# Create separate test data sets for 2016 and 2017
test2016 = pd.merge(sample_submission[['ParcelId']], properties2016.rename(columns = {'parcelid': 'ParcelId'}), 
                how = 'left', on = 'ParcelId')
test2017 = pd.merge(sample_submission[['ParcelId']], properties2017.rename(columns = {'parcelid': 'ParcelId'}), 
                how = 'left', on = 'ParcelId')
del properties2016, properties2017, train2016, train2017
gc.collect();


print('Memory usage reduction...')


train[['latitude', 'longitude']] /= 1e6
train['censustractandblock'] /= 1e12

def preptest(test):
    # Scale coordinates and the census code once, matching the training data above
    test[['latitude', 'longitude']] /= 1e6
    test['censustractandblock'] /= 1e12

    for column in test.columns:
        if test[column].dtype == int:
            test[column] = test[column].astype(np.int32)
        if test[column].dtype == float:
            test[column] = test[column].astype(np.float32)

preptest(test2016)
preptest(test2017)
        
print('Feature engineering...')
train['month'] = (pd.to_datetime(train['transactiondate']).dt.year - 2016)*12 + pd.to_datetime(train['transactiondate']).dt.month
train = train.drop('transactiondate', axis = 1)


from sklearn.preprocessing import LabelEncoder
non_number_columns = train.dtypes[train.dtypes == object].index.values

for column in non_number_columns:
    train_test = pd.concat([train[column], test2016[column], test2017[column]], axis = 0)
    encoder = LabelEncoder().fit(train_test.astype(str))
    train[column] = encoder.transform(train[column].astype(str)).astype(np.int32)
    test2016[column] = encoder.transform(test2016[column].astype(str)).astype(np.int32)
    test2017[column] = encoder.transform(test2017[column].astype(str)).astype(np.int32)
    
feature_names = [feature for feature in train.columns[2:] if feature != 'month']
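# Seasonal offset feature used below: mean logerror per transaction month, relative to the overall mean.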

month_avgs = train.groupby('month').agg('mean')['logerror'].values - train['logerror'].mean()

X_train_all = train[feature_names].fillna(-1)
y_train_all = train['logerror'].fillna(-1)

train = train[np.abs(train['logerror']) < 0.4]

print('Preparing arrays and throwing out outliers...')
X_train = train[feature_names].fillna(-1)#.values
y_train = train['logerror'].fillna(-1)#.values
X_test2016 = test2016[feature_names].fillna(-1)#.values
X_test2017 = test2017[feature_names].fillna(-1)#.values

del test2016, test2017;
gc.collect();

month_values = train['month'].values
month_avg_values = np.array([month_avgs[month - 1] for month in month_values]).reshape(-1, 1)
X_train = np.hstack([X_train, month_avg_values])
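
The month-average column appended to X_train here also has to be present on any matrix the fitted models predict on. A minimal sketch of one way to mirror it at prediction time, assuming the month encoding used above, (year - 2016) * 12 + month, and a month that occurs in the training data; the helper and usage below are illustrative, not from the original notebook.

In [ ]:
def add_month_avg(X, month, month_avgs):
    # Append the seasonal offset of a single prediction month to every row of X.
    month_avg_col = np.full((np.asarray(X).shape[0], 1), month_avgs[month - 1])
    return np.hstack([np.asarray(X), month_avg_col])

# e.g. X_test2016_oct = add_month_avg(X_test2016, 10, month_avgs)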

OLS


In [ ]:
# OLS
model_lr = LinearRegression()
#model_lr.fit(x_train_data, x_train_label)
#y_pred_lr_valid = model_lr.predict(x_valid)
#y_pred_lr_train = model_lr.predict(x_train_data)
models['LinearRegression'] = model_lr

# Make predictions on both test and validation with OLS and BR
#predicted_mae_lr_valid = mean_absolute_error(y_valid, y_pred_lr_valid)
#predicted_mae_lr_train = mean_absolute_error(x_train_label, y_pred_lr_train)

#print('OLS MAE LR Valid:', predicted_mae_lr_valid, 'Train:', predicted_mae_lr_train)

scores = cross_val_score(model_lr, X_train_all, y_train_all, cv=5, scoring='neg_mean_absolute_error', verbose=1)
print("%s MAE: %0.5f (+/- %0.5f)" % (model_lr.__class__.__name__, -scores.mean(), scores.std() * 2))

#del y_pred_lr_valid
#del y_pred_lr_train

Bayesian Ridge


In [ ]:
# BayesianRidge Regression
model_br = BayesianRidge(compute_score=True)
#model_br.fit(x_train, y_train)
#y_pred_br_valid = model_br.predict(x_valid)
#y_pred_br_train = model_br.predict(x_train_data)
models['BayesianRidge'] = model_br

#predicted_mae_br_valid = mean_absolute_error(y_valid,       y_pred_br_valid)
#predicted_mae_br_train = mean_absolute_error(x_train_label, y_pred_br_train)

#print('BR MAE BayesianRidge Valid: %s \nTrain: %s' % (predicted_mae_br_valid, predicted_mae_br_train))

scores = cross_val_score(model_br, X_train_all, y_train_all, cv=5, scoring='neg_mean_absolute_error', verbose=1)
print("%s MAE: %0.5f (+/- %0.5f)" % (model_br.__class__.__name__, -scores.mean(), scores.std() * 2))


#del y_pred_br_valid
#del y_pred_br_train

Random Forest


In [ ]:
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(n_jobs=1, random_state=2016, verbose=1, n_estimators=500, max_features=12)
#model_rf.fit(x_train, y_train)
#y_pred_rf_valid = model_rf.predict(x_valid)
#y_pred_rf_train = model_rf.predict(x_train_data)
models['RandomForest'] = model_rf

#predicted_mae_rf_valid = mean_absolute_error(y_valid,       y_pred_rf_valid)
#predicted_mae_rf_train = mean_absolute_error(x_train_label, y_pred_rf_train)

#print('BR MAE RandomForest Valid: %s \nTrain: %s' % (predicted_mae_rf_valid, predicted_mae_rf_train))

scores = cross_val_score(model_rf, X_train_all, y_train_all, cv=5, scoring='neg_mean_absolute_error', verbose=1)
print("%s MAE: %0.5f (+/- %0.5f)" % (model_rf.__class__.__name__, -scores.mean(), scores.std() * 2))

#del y_pred_rf_train
#del y_pred_rf_valid

Extra Trees


In [ ]:
from sklearn.ensemble import ExtraTreesRegressor

model_et = ExtraTreesRegressor(
        n_jobs=1, random_state=2016, verbose=1,
        n_estimators=500, max_features=12)

#model_et.fit(x_train, y_train)
#y_pred_et_valid = model_et.predict(x_valid)
#y_pred_et_train = model_et.predict(x_train_data)
models['ExtraTrees'] = model_et

#predicted_mae_et_valid = mean_absolute_error(y_valid,       y_pred_et_valid)
#predicted_mae_et_train = mean_absolute_error(x_train_label, y_pred_et_train)

#print('BR MAE ExtraTrees Valid: %s \nTrain: %s' % (predicted_mae_et_valid, predicted_mae_et_train))

#scores = cross_validation.cross_val_score(model_et, x_train, y_train, cv=5, scoring='neg_mean_absolute_error', verbose=1)
#print("%s MAE: %0.5f (+/- %0.5f)" % (model_et.__class__.__name__, scores.mean(), scores.std() * 2))

#del y_pred_et_valid
#del y_pred_et_train

AdaBoost


In [ ]:
from sklearn.ensemble import AdaBoostRegressor

model_ab = AdaBoostRegressor()
#model_ab.fit(x_train, y_train)
#y_pred_ab_valid = model_ab.predict(x_valid)
#y_pred_ab_train = model_ab.predict(x_train_data)
models['AdaBoost'] = model_ab

#predicted_mae_ab_valid = mean_absolute_error(y_valid,       y_pred_ab_valid)
#predicted_mae_ab_train = mean_absolute_error(x_train_label, y_pred_ab_train)

#print('BR MAE AdaBoost Valid: %s \nTrain: %s' % (predicted_mae_ab_valid, predicted_mae_ab_train))

#scores = cross_validation.cross_val_score(model_ab, x_train, y_train, cv=5, scoring='neg_mean_absolute_error', verbose=1)
#print("%s MAE: %0.5f (+/- %0.5f)" % (model_ab.__class__.__name__, scores.mean(), scores.std() * 2))

#del y_pred_ab_valid
#del y_pred_ab_train

CatBoost


In [ ]:
def cat_booster(x_train, y_train, x_valid, y_valid, cat_index, loss='MAE'):
    # Cat booster train and predict
    num_ensembles = 5
    y_pred_valid = 0.0
    y_pred_train = 0.0
    
    print('Initialising CAT Boost Regression')
    for i in tqdm(range(num_ensembles)):
        print('Building ensemble', i)
        # Use CV, tune hyperparameters
        catb = CatBoostRegressor(
                iterations=630, learning_rate=0.03,
                depth=6, l2_leaf_reg=3,
                loss_function=loss,
                eval_metric='MAE',
                random_seed=i)

        catb.fit(x_train, y_train, cat_features=cat_index)

        y_pred_valid += catb.predict(x_valid)
        y_pred_train += catb.predict(x_train)

    y_pred_valid /= num_ensembles
    y_pred_train /= num_ensembles

    print('Train MAE:', mean_absolute_error(y_train, y_pred_train))
    print('Valid MAE:', mean_absolute_error(y_valid, y_pred_valid))
    
    return catb, y_pred_valid

In [ ]:
model_cb, preds = cat_booster(x_train, y_train, x_train_data, x_train_label, cat_index)

print('CatBoost MAE: %s' % (mean_absolute_error(x_train_label, preds)))

In [ ]:
model_cb = CatBoostRegressor(
            iterations=630, learning_rate=0.03,
            depth=6, l2_leaf_reg=3,
            loss_function='MAE',
            eval_metric='MAE')

models['CatBoost'] = model_cb

#scores = cross_validation.cross_val_score(model_cb, x_train, y_train, cv=5, scoring='neg_mean_absolute_error', verbose=1)
#print("%s MAE: %0.5f (+/- %0.5f)" % (model_cb.__class__.__name__, scores.mean(), scores.std() * 2))

#del preds

GB


In [ ]:
from sklearn.ensemble import GradientBoostingRegressor

model_gb = GradientBoostingRegressor(
             random_state=2016, verbose=1,
             n_estimators=500, max_features=12, max_depth=8,
             learning_rate=0.05, subsample=0.8)

#model_gb.fit(x_train, y_train)
#y_pred_gb_valid = model_gb.predict(x_valid)
#y_pred_gb_train = model_gb.predict(x_train_data)
models['GradientBoosting'] = model_gb

#predicted_mae_gb_valid = mean_absolute_error(y_valid,       y_pred_gb_valid)
#predicted_mae_gb_train = mean_absolute_error(x_train_label, y_pred_gb_train)

#print('BR MAE GradientBoosting Valid: %s \nTrain: %s' % (predicted_mae_gb_valid, predicted_mae_gb_train))

#scores = cross_validation.cross_val_score(model_gb, x_train, y_train, cv=5, scoring='neg_mean_absolute_error', verbose=1)
#print("%s MAE: %0.5f (+/- %0.5f)" % (model_gb.__class__.__name__, scores.mean(), scores.std() * 2))

#del y_pred_gb_valid
#del y_pred_gb_train

XGB


In [ ]:
params_xgb = {
    'max_depth':        5,  # should be 0.5 to 1% of the examples
    'subsample':        1,  # Ratio of observations to be used as samples for each tree
    'min_child_weight': 10, # Deals with imbalanced data and prevents overfitting as the value increases
    'objective':        'reg:linear',
    'n_estimators':     1000, # Sequential trees to be modelled.
    'eta':              0.1,  # Shrinkage. Typically between 0.1 - 0.2 - learning rate for gradient boost (D:0.3)
    'eval_metric':      'mae'
}

d_train = xg.DMatrix(X_train, label=y_train, missing=-1)
#d_valid = xg.DMatrix(x_valid, label=y_valid, missing=-1)
xgb_gs = xg.train(params_xgb, d_train, num_boost_round=250, verbose_eval=50)
#models['XGB'] = xgb_gs

#del d_train
#del d_valid

LightGBM


In [ ]:
def light_gbm_folds(x_train, x_valid, y_train, y_valid, params, num_ensembles):
    # LightGBM: average the predictions of n differently-seeded models

    lg_pred_valid = 0.0
    lg_pred_train = 0.0

    d_train = lgb.Dataset(x_train, label=y_train)

    print('Initialising Light GBM')
    for i in tqdm(range(num_ensembles)):
        # Use CV, tune hyperparameters
        params['seed'] = i
        model_lgb = lgb.train(params, d_train, 430)

        lg_pred_valid += model_lgb.predict(x_valid)
        lg_pred_train += model_lgb.predict(x_train)

    lg_pred_valid /= num_ensembles
    lg_pred_train /= num_ensembles

    print('Train MAE:', mean_absolute_error(y_train, lg_pred_train))
    print('Valid MAE:', mean_absolute_error(y_valid, lg_pred_valid))

    return model_lgb

In [ ]:
import random
import lightgbm as lgb

params_lg={
    'max_bin'          : 10,
    'learning_rate'    : 0.0021, # shrinkage_rate
    'boosting_type'    : 'gbdt',
    'objective'        : 'regression',
    'metric'           : 'mae',      
    'sub_feature'      : 0.345 ,   
    'bagging_fraction' : 0.85, 
    'bagging_freq'     : 40,
    'num_leaves'       : 512,   # num_leaf
    'min_data'         : 500,   # min_data_in_leaf
    'min_hessian'      : 0.05,  # min_sum_hessian_in_leaf
    'verbose'          : 1
}
d_train = lgb.Dataset(X_train, label=y_train)
model_lgb = lgb.train(params_lg, d_train, 430)

#model_lgb = light_gbm_folds(x_train, x_train_data, y_train, x_train_label, params_lg, num_ensembles=5)
models['LightGBM'] = model_lgb

In [ ]:

DNN


In [ ]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.noise import GaussianDropout
from keras.optimizers import Adam
from sklearn.preprocessing import Imputer

def larger_model():
    # create model
    model = Sequential()
    model.add(Dense(size, input_dim=size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(size*2, kernel_initializer='normal', activation='relu'))
    model.add(Dense(size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
    return model

# define wider model
def wider_model():
    # create model
    model = Sequential()
    model.add(Dense(size*2, input_dim=size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
    return model


# define base model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(size, input_dim=size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
    return model

def prebuilt_nn():
    nn = Sequential()
    nn.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = size))
    nn.add(PReLU())
    nn.add(Dropout(.4))
    nn.add(Dense(units = 160 , kernel_initializer = 'normal'))
    nn.add(PReLU())
    nn.add(BatchNormalization())
    nn.add(Dropout(.6))
    nn.add(Dense(units = 64 , kernel_initializer = 'normal'))
    nn.add(PReLU())
    nn.add(BatchNormalization())
    nn.add(Dropout(.5))
    nn.add(Dense(units = 26, kernel_initializer = 'normal'))
    nn.add(PReLU())
    nn.add(BatchNormalization())
    nn.add(Dropout(.6))
    nn.add(Dense(1, kernel_initializer='normal'))
    nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))

    return nn

In [ ]:
## Preprocessing
print("Preprocessing neural network data...")
imputer= Imputer()
imputer.fit(X_train)
x_train_nn = imputer.transform(X_train)

#imputer.fit(x_valid.iloc[:, :])
#x_valid_nn = imputer.transform(x_valid.iloc[:, :])

sc = StandardScaler()
x_train_nn = sc.fit_transform(x_train_nn)
#x_valid_nn = sc.transform(x_valid_nn)

In [ ]:
# fix random seed for reproducibility
seed = 7
size = x_train_nn.shape[1]
# Prebuilt Kaggle kernel network
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=prebuilt_nn, epochs=5, batch_size=50, verbose=0)))
pipeline = Pipeline(estimators)
pipeline.fit(x_train_nn, y_train)
models['DNN'] = pipeline

#print(mean_absolute_error(y_valid, pipeline.predict(x_valid_nn)))

LSTM


In [ ]:
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
 
#x_train = x_train.values
#x_valid = x_valid.values

# reshape input to be 3D [samples, timesteps, features]
x_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
#x_valid_lstm = x_valid.values.reshape((x_valid.shape[0], 1, x_valid.shape[1]))
 
# design network
lstm = Sequential()
lstm.add(LSTM(50, input_shape=(x_train_lstm.shape[1], x_train_lstm.shape[2])))
lstm.add(PReLU())
lstm.add(Dropout(.2))
lstm.add(Dense(units = 100 , kernel_initializer = 'normal'))
lstm.add(PReLU())
lstm.add(Dropout(.2))
lstm.add(Dense(units = 50 , kernel_initializer = 'normal'))
lstm.add(PReLU())
lstm.add(Dense(1))
lstm.compile(loss='mae', optimizer='adam')
# fit network
#validation_data=(x_valid_lstm, y_valid)
lstm.fit(x_train_lstm, y_train, epochs=15, batch_size=50, verbose=1, shuffle=False)
 
# make a prediction
#yhat = lstm.predict(x_valid_lstm)
models['LSTM'] = lstm
#mae = mean_absolute_error(y_valid, yhat)
#print('Test MAE: %.3f' % mae)

Stacking


In [ ]:
# https://github.com/dnc1994/Kaggle-Playground/blob/master/home-depot/ensemble.py
import time
from sklearn.metrics import mean_absolute_error, make_scorer
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, \
        ExtraTreesRegressor, AdaBoostRegressor

def mean_absolute_error_(ground_truth, predictions):
    return mean_absolute_error(ground_truth, predictions)

MAE = make_scorer(mean_absolute_error_, greater_is_better=False)

params_xgb = {
    'max_depth':        5,  # should be 0.5 to 1% of the examples
    'subsample':        1,  # Ratio of observations to be used as samples for each tree
    'min_child_weight': 10, # Deals with imbalanced data and prevents overfitting as the value increases
    'objective':        'reg:linear',
    'n_estimators':     1000, # Sequential trees to be modelled.
    'eta':              0.1,  # Shrinkage. Typically between 0.1 - 0.2 - learning rate for gradient boost (D:0.3)
    'eval_metric':      'mae'
}

class Ensemble(object):
    
    def __init__(self, n_folds, base_models, floor_models, final_model, include_features, cvgrid, stacker=None):
        self.n_folds = n_folds
        self.stacker = stacker
        self.final_model = final_model
        self.base_models = base_models
        self.floor_models = floor_models
        self.features = include_features
        self.param_grid = cvgrid
    

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        
        folds = list(KFold(n_splits=self.n_folds, shuffle=True, random_state=2016).split(X))
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        
        start_time = time.time()
        
        for i, c in enumerate(self.base_models):
            print('Fitting For Base Model {} ---'.format(c))       
            clf = self.base_models[c]
            
            for j, (train_idx, test_idx) in enumerate(folds):
                print('--- Fitting For Fold %d / %d ---' % (j + 1, self.n_folds))
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                
                if c not in ['XGB', 'LightGBM', 'LSTM']:
                    
                    clf.fit(X_train, y_train)
                    y_pred = clf.predict(X_holdout)[:]
                    
                    S_train[test_idx, i] = y_pred
                    
                elif c in ['LSTM']:
                    x_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))

                    clf.fit(x_train_lstm, y_train, epochs=15, batch_size=50, verbose=1, shuffle=False)
                    y_pred = clf.predict(X_holdout.reshape((X_holdout.shape[0], 1, X_holdout.shape[1])))[:]
                    
                    S_train[test_idx, i] = [i[0] for i in y_pred]

                else:
                    d_train = xg.DMatrix(X_train, label=y_train, missing=-1)
                    d_valid = xg.DMatrix(X_holdout, missing=-1)
                    
                    clf = xg.train(params_xgb, d_train)
                    y_pred = clf.predict(d_valid)[:]
                    
                    S_train[test_idx, i] = y_pred
                    
                print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

            print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        print('--- Base Models Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        if self.features:
            S_train = np.append(X, S_train, 1)
        
        #if self.stacker is None:
        #    self.base_models = self.floor_models
        #    self.stacker = self.final_model
        #    self.fit(S_train, y)

        d_train = xg.DMatrix(S_train, label=y, missing=-1)
        #d_valid = xg.DMatrix(x_valid, label=y_valid, missing=-1)
        xgb_gs = xg.train(params_xgb, d_train, num_boost_round=250, verbose_eval=50)

        #else:           
        #    grid = grid_search.GridSearchCV(estimator=self.stacker, param_grid=self.param_grid, n_jobs=1, cv=5, verbose=20, scoring=MAE)
        #    grid.fit(S_train, y)
        
        #try:
        #    print('Best Params:')
        #    print(grid.best_params_)
        #    print('Best CV Score:')
        #    print(-grid.best_score_)
        #    print('Best estimator:')
        #    print(grid.best_estimator_)
        #except:
        #    pass
        
        self.stacker = xgb_gs#grid.best_estimator_
        
        print('--- Stacker Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        
        
    def predict(self, X):
        X = np.array(X)
        folds = list(KFold(n_splits=self.n_folds, shuffle=True, random_state=2016).split(X))
        if self.features:
            S_test = np.append(X, np.zeros((X.shape[0], len(self.base_models))), 1)  
            print('Using features of shape', S_test.shape)
        else:
            S_test = np.zeros((X.shape[0], len(self.base_models)))
            print('Using features of shape', S_test.shape)

        for ind, c in enumerate(self.base_models):
            clf = self.base_models[c]
            
            # Uses all features.
            if self.features:
                i = X.shape[1] + ind
            else:
                i = ind
                
            S_test_i = np.zeros((X.shape[0], len(folds)))
            print('--- Predicting For  #{}'.format(c))
            
            # Makes predictions for each model
            for j, (train_idx, test_idx) in enumerate(folds):    
                if c not in ['XGB', 'LSTM']:
                    S_test_i[:, j] = clf.predict(X)[:]
                    
                elif c in ['LSTM']:
                    S_test_i[:, j] = clf.predict(X.reshape((X.shape[0], 1, X.shape[1]))).flatten()
                    
                else:
                    S_test_i[:, j] = clf.predict(X)[:]
                
            S_test[:, i] = S_test_i.mean(1)

        clf = self.stacker
        try:
            y_pred = clf.predict(S_test)[:]
        except:
            y_pred = clf.predict(xg.DMatrix(S_test))
        
        return y_pred

    
    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        start_time = time.time()
        folds = list(KFold(n_splits=self.n_folds, shuffle=True, random_state=2016).split(X))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test  = np.zeros((T.shape[0], len(self.base_models)))

        for i, c in enumerate(self.base_models):
            print('########## \nFitting For Base Model {} \n##########'.format(c))
            clf = self.base_models[c]
            S_test_i = np.zeros((T.shape[0], len(folds)))

            for j, (train_idx, test_idx) in enumerate(folds):
                print('--- Fitting For Fold #{0} / {1} ---'.format(j+1, self.n_folds))
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                
                if c not in ['XGB', 'LightGBM', 'LSTM']:
                    clf.fit(X_train, y_train)
                    y_pred = clf.predict(X_holdout)[:]
                    
                    S_train[test_idx, i] = y_pred
                    S_test_i[:, j] = clf.predict(T)[:]
                    
                elif c in ['LSTM']:
                    x_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
                    
                    clf.fit(x_train_lstm, y_train, epochs=15, batch_size=50, verbose=1, shuffle=False)
                    y_pred = clf.predict(X_holdout.reshape((X_holdout.shape[0], 1, X_holdout.shape[1])))[:]
                    
                    S_train[test_idx, i] = [i[0] for i in y_pred]
                    S_test_i[:, j] = clf.predict(T.reshape((T.shape[0], 1, T.shape[1]))).flatten()
                    
                else:
                    d_train = xg.DMatrix(X_train, label=y_train, missing=-1)
                    d_valid = xg.DMatrix(X_holdout, missing=-1)
                    
                    clf = xg.train(params_xgb, d_train)
                    y_pred = clf.predict(d_valid)[:]
                    
                    S_train[test_idx, i] = y_pred
                    data_pred = xg.DMatrix(T, missing=-1)
                    S_test_i[:, j] = clf.predict(data_pred)[:]

                print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

            S_test[:, i] = S_test_i.mean(1)
            print('Elapsed: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        print('--- Base Models Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))

        param_grid = {'n_estimators':  [100],
                      'learning_rate': [0.05],
                      'subsample':     [0.75]}

        grid = GridSearchCV(estimator=self.stacker, param_grid=param_grid, n_jobs=1, cv=5, verbose=20, scoring=MAE)

        grid.fit(S_train, y)

        try:
            print('Param grid:')
            print(param_grid)
            print('Best Params:')
            print(grid.best_params_)
            print('Best CV Score:')
            print(-grid.best_score_)
            print('Best estimator:')
            print(grid.best_estimator_)
        except:
            pass

        print('--- Stacker Trained: %s minutes ---' % round(((time.time() - start_time) / 60), 2))
        y_pred = grid.predict(S_test)[:]

        return y_pred

In [ ]:
#del models['CatBoost']
#del models['XGB']

In [ ]:
param_grid = {'n_estimators':  [50, 100, 150],
              'learning_rate': [0.03, 0.05, 0.07],
              'subsample':     [0.5, 0.75, 1]
             }

stackers = {'GBM' : GradientBoostingRegressor(),}

ensemble = Ensemble(n_folds=5,
                    base_models=models, 
                    floor_models=stackers, 
                    final_model=models['DNN'],
                    include_features=False, 
                    cvgrid=param_grid)
                    
#model_ensemble = ensemble.fit_predict(x_train[:100], y_train[:100], x_valid)
# MAE 0.0653212760898 - lr 0.03, nest = 50, subsample: 0.5
ensemble.fit(X_train, y_train)
#final_prediction = ensemble.predict(X_test2016)
#print('MAE', mean_absolute_error(y_valid, final_prediction))

#del final_prediction

In [ ]:
def k_fold_cross_validation(X, K, randomise = False):
    """Generates K (training, validation) pairs from the items in X."""
    
    if randomise: from random import shuffle; X=list(X); shuffle(X)
    for k in range(K):
        training   = [x for i, x in enumerate(X) if i % K != k]
        validation = [x for i, x in enumerate(X) if i % K == k]
        
        yield training, validation

In [ ]:
# Evaluate the ensemble fold by fold; iterate over row indices so predictions stay aligned with labels
for _, valid_idx in k_fold_cross_validation(range(len(x_train_data)), K=5):
    pred_k = ensemble.predict(x_train_data.values[valid_idx])
    print('MAE', mean_absolute_error(x_train_label.values[valid_idx], pred_k))

In [ ]:
########## LAYER 1 ##########
# Submodel  1 : OLS                      # Ordinary least squares estimator Sklearn implementation
# Submodel  2 : BR                       # Bayesian ridge regression - Sklearn implementation
# Submodel  3 : DNN                      # Dense Neural Network - Keras - Dense layers 
# Submodel  4 : LightGBM                 # Light Gradient Boosting - https://github.com/Microsoft/LightGBM
# Submodel  5 : XGBoost                  # Extreme Gradient Boosting - http://xgboost.readthedocs.io/en/latest/model.html
# Submodel  6 : CatBoost                 # Categorical Boosting https://github.com/catboost/catboost
# Submodel  7 : LSTM                     # Long Short Term Memory Neural Network - Keras implementation
# Submodel  8 : RandomForestRegressor    # Sklearn implementation
# Submodel  9 : ExtraTreesRegressor      # Sklearn implementation
# Submodel 10 : SVR                      # Support vector machines for regression - Sklearn implementation
# Submodel 11 : AdaBoost                 # Adaptive Boosting Sklearn Implementation

########## LAYER 2 ##########
# https://www.kaggle.com/dragost/boosted-trees-lb-0-0643707/edit
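
A minimal toy sketch (not part of the original pipeline) of the layer-1 to layer-2 data flow that Ensemble.fit implements above: each base model contributes one column of out-of-fold predictions, and the layer-2 stacker is trained on that matrix rather than on the raw features. All names below (X_toy, y_toy, S_train_toy) are illustrative.

In [ ]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, BayesianRidge

X_toy = np.random.rand(100, 5)
y_toy = np.random.rand(100)
base = [LinearRegression(), BayesianRidge()]

# Column i of S_train_toy holds model i's out-of-fold predictions (layer-1 output).
S_train_toy = np.zeros((X_toy.shape[0], len(base)))
for i, clf in enumerate(base):
    for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=2016).split(X_toy):
        clf.fit(X_toy[train_idx], y_toy[train_idx])
        S_train_toy[test_idx, i] = clf.predict(X_toy[test_idx])

# Layer 2: the stacker learns how to combine the base models' predictions.
stacker = LinearRegression().fit(S_train_toy, y_toy)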

Save data


In [ ]:
# Predicting on the already-built test framework for 2016 and 2017

In [ ]:
test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
    '201710': pd.Timestamp('2017-09-30'),
    '201711': pd.Timestamp('2017-10-31'),
    '201712': pd.Timestamp('2017-11-30')
}

for d in test_dates.keys():
    if '2016' in d:
        print('Predicting for 2016')
        X_test2016['month'] = int(d[-2:])
        sample_submission[d] = ensemble.predict(X_test2016)
    elif '2017' in d:
        print('Predicting for 2017')
        X_test2017['month'] = int(d[-2:]) + 12
        sample_submission[d] = ensemble.predict(X_test2017)

In [ ]:
#train['month'] = (pd.to_datetime(train['transactiondate']).dt.year - 2016)*12 + pd.to_datetime(train['transactiondate']).dt.month
X_train.shape

In [ ]:
print('Building properties data')
properties2017 = pd.read_csv('../Data/properties_2017.csv', low_memory = False)
sample_prediction = pd.merge(sample['ParcelId'].to_frame(), properties2017, how='left', left_on=['ParcelId'], right_on=['parcelid'])
#transactions[['propertycountylandusecode', 'propertyzoningdesc']] = transactions[['propertycountylandusecode', 'propertyzoningdesc']].astype(str).apply(LabelEncoder().fit_transform)
sample_prediction['taxdelinquencyflag'].replace('Y', 1, inplace=True)
sample_prediction.drop(to_drop, axis=1, inplace=True)
sample_prediction = complex_features(sample_prediction)
sample_prediction.drop(['parcelid', 'propertyzoningdesc', 'propertycountylandusecode', 'fireplacecnt'], axis=1, inplace=True)
sample_prediction.fillna(sample_prediction.median(), inplace = True)

del properties2017
gc.collect()

In [ ]:
# https://www.kaggle.com/c/zillow-prize-1/discussion/33899, Oct,Nov,Dec

WEIGHT_XGB = 0.4
WEIGHT_CAT = 0.6

test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
    '201710': pd.Timestamp('2017-09-30'),
    '201711': pd.Timestamp('2017-10-31'),
    '201712': pd.Timestamp('2017-11-30')
}

for m in test_dates.keys():
    
    print('Processing', m)
    sample_prediction['transactiondate'] = test_dates[m]
    sample_prediction = time_data(sample_prediction)
    
    print('Ensemble Prediction', m)
    sample_prediction['ensemble'] = ensemble.predict(sample_prediction[best_columns])
    
    print('XGB - CatBoost Train', m)
    predictions_xgb = xgb_gs.predict(xg.DMatrix(sample_prediction[list(best_columns) + ['ensemble']]))
    predictions_cat = get_cat_boost_all(sample_train, sample_label, sample_prediction[list(best_columns) + ['ensemble']])

    sample[m] = (WEIGHT_XGB * predictions_xgb) + (WEIGHT_CAT * predictions_cat)
    
    del predictions_xgb, predictions_cat
    gc.collect()
    
#del x_predict

In [ ]:


In [ ]:
train2017 = pd.read_csv('../Data/train_2017.csv', parse_dates=['transactiondate'], low_memory=False)
sample_train = pd.merge(train2017, sample_prediction, how='left', left_on='parcelid' ,right_on='ParcelId')
sample_train['ensemble'] = ensemble.predict(sample_train[best_columns])
sample_label = sample_train['logerror']
sample_train = time_data(sample_train)[list(best_columns) + ['ensemble']]

In [ ]:
del train2017
gc.collect()

In [ ]:
params_xgb = {
    'max_depth':        5,  # should be 0.5 to 1% of the examples
    'subsample':        1,  # Ratio of observations to be used as samples for each tree
    'min_child_weight': 10, # Deals with imbalanced data and prevents overfitting as the value increases
    'objective':        'reg:linear',
    'n_estimators':     1000, # Sequential trees to be modelled.
    'eta':              0.1,  # Shrinkage. Typically between 0.1 - 0.2 - learning rate for gradient boost (D:0.3)
    'eval_metric':      'mae'
}

d_train = xg.DMatrix(sample_train, label=sample_label)
xgb_gs = xg.train(params_xgb, d_train, num_boost_round=250, verbose_eval=50)

In [ ]:
def get_cat_boost_all(x_train, y_train, x_valid):
    num_ensembles = 5
    y_pred_valid = 0.0

    print('Initialising CAT Boost Regression')
    for i in tqdm(range(num_ensembles)):
        print('Building ensemble', i)
        # Use CV, tune hyperparameters
        catb = CatBoostRegressor(
                iterations=600, learning_rate=0.03,
                depth=5, l2_leaf_reg=3,
                loss_function='MAE',
                eval_metric='MAE',
                random_seed=i)

        catb.fit(x_train, y_train, cat_features=cat_index)

        y_pred_valid += catb.predict(x_valid)
        
    y_pred_valid /= num_ensembles
    
    return y_pred_valid

In [ ]:
sample.to_csv('submission5.csv', index=False)
sample.head()

RFE

In [ ]:
model_lr  = LinearRegression()
model_xgb = xg.XGBRegressor()

selector = RFE(model_xgb, 100, step=500)
model = selector.estimator.fit(x_train, y_train)

dict_features = plot_best_features(model, data=x_all, num_features=100, figsize=(5, 15))
best_columns = list(dict_features.keys())

new_sparse_columns = x_all.columns
x_train = pd.DataFrame(x_train, columns=x_all.columns)[best_columns].values
x_test  = pd.DataFrame(x_test, columns=x_all.columns)[best_columns].values
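
plot_best_features is called above but never defined in this notebook. A hedged sketch of what such a helper might look like, assuming it simply plots the fitted model's feature_importances_ and returns the top features as a dict keyed by column name; the actual helper may differ.

In [ ]:
# Hypothetical reconstruction of the missing helper; assumes `data` has the same
# columns, in the same order, as the matrix the model was fitted on.
def plot_best_features(model, data, num_features=100, figsize=(5, 15)):
    importances = pd.Series(model.feature_importances_, index=data.columns)
    top = importances.sort_values(ascending=False).head(num_features)
    top.sort_values().plot(kind='barh', figsize=figsize)
    return top.to_dict()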

In [ ]: