In [18]:
# https://www.kaggle.com/dragost/boosted-trees-lb-0-0643707/edit
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from sklearn.linear_model import LinearRegression
import random
import datetime as dt

from catboost import CatBoostRegressor
from tqdm import tqdm

################
################
##  LightGBM changes ##
# V42 - sub_feature: 0.3 -> 0.35 : LB = 0.0643759
# V34 - sub_feature: 0.5 -> 0.42
# V33 - sub_feature: 0.5 -> 0.45 : LB = 0.0643866
# - sub_feature: 0.45 -> 0.3 : LB = 0.0643811 / 0.0643814 
################
################ 

##### READ IN RAW DATA
print( "\nReading data from disk ...")
prop = pd.read_csv('../Data/properties_2017.csv')
train = pd.read_csv("../Data/train_2017.csv")


Reading data from disk ...
/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (49) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [19]:
# Parameters
XGB_WEIGHT = 0.6500
BASELINE_WEIGHT = 0.0056
OLS_WEIGHT = 0.0828

XGB1_WEIGHT = 0.8083  # Weight of first in combination of two XGB models

BASELINE_PRED = 0.0115   # Baseline based on mean of training data, per Oleg

In [20]:
################
################
##  LightGBM  ##
################
################

##### PROCESS DATA FOR LIGHTGBM
print( "\nProcessing data for LightGBM ..." )
for c, dtype in zip(prop.columns, prop.dtypes):	
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

df_train = train.merge(prop, how='left', on='parcelid')
df_train.fillna(df_train.median(),inplace = True)

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
                         'propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'], axis=1)
#x_train['Ratio_1'] = x_train['taxvaluedollarcnt']/x_train['taxamount']
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)


train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train; gc.collect()

x_train = x_train.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)



##### RUN LIGHTGBM
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'          # or 'mae'
params['sub_feature'] = 0.345    
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512        # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0
#params['feature_fraction_seed'] = 2
#params['bagging_seed'] = 3

print("\nFitting LightGBM model ...")
clf = lgb.train(params, d_train, 430)

del d_train; gc.collect()
del x_train; gc.collect()

print("\nPrepare for LightGBM prediction ...")
print("   Read sample file ...")
sample = pd.read_csv('../Data/sample_submission.csv')
print("   ...")
sample['parcelid'] = sample['ParcelId']
print("   Merge with property data ...")
df_test = sample.merge(prop, on='parcelid', how='left')
print("   ...")
del sample, prop; gc.collect()
print("   ...")
#df_test['Ratio_1'] = df_test['taxvaluedollarcnt']/df_test['taxamount']
x_test = df_test[train_columns]
print("   ...")
del df_test; gc.collect()
print("   Preparing x_test...")
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
print("   ...")
x_test = x_test.values.astype(np.float32, copy=False)
print("Test shape :", x_test.shape)

print("\nStart LightGBM prediction ...")
p_test = clf.predict(x_test)

del x_test; gc.collect()

print( "\nUnadjusted LightGBM predictions:" )
print( pd.DataFrame(p_test).head() )


Processing data for LightGBM ...
(77613, 53) (77613,)

Fitting LightGBM model ...

Prepare for LightGBM prediction ...
   Read sample file ...
   ...
   Merge with property data ...
   ...
   ...
   ...
   Preparing x_test...
   ...
Test shape : (2985217, 53)

Start LightGBM prediction ...

Unadjusted LightGBM predictions:
          0
0  0.025868
1  0.026412
2  0.020930
3  0.027978
4  0.020549

In [21]:
################
################
##  XGBoost   ##
################
################

##### RE-READ PROPERTIES FILE
##### (I tried keeping a copy, but the program crashed.)

print( "\nRe-reading properties file ...")
properties = pd.read_csv('../Data/properties_2017.csv')

##### PROCESS DATA FOR XGBOOST
print( "\nProcessing data for XGBoost ...")
for c in properties.columns:
    properties[c]=properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))

train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)
# shape        
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))

# drop out ouliers
train_df=train_df[ train_df.logerror > -0.4 ]
train_df=train_df[ train_df.logerror < 0.419 ]
x_train=train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)

print('After removing outliers:')     
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))




##### RUN XGBOOST
print("\nSetting up data for XGBoost ...")
# xgboost params
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'base_score': y_mean,
    'silent': 1
}

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

num_boost_rounds = 250
print("num_boost_rounds="+str(num_boost_rounds))

# train model
print( "\nTraining XGBoost ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)

print( "\nPredicting with XGBoost ...")
xgb_pred1 = model.predict(dtest)

print( "\nFirst XGBoost predictions:" )
print( pd.DataFrame(xgb_pred1).head() )



##### RUN XGBOOST AGAIN
print("\nSetting up data for XGBoost ...")
# xgboost params
xgb_params = {
    'eta': 0.033,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'silent': 1
}

num_boost_rounds = 150
print("num_boost_rounds="+str(num_boost_rounds))

print( "\nTraining XGBoost again ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)

print( "\nPredicting with XGBoost again ...")
xgb_pred2 = model.predict(dtest)

print( "\nSecond XGBoost predictions:" )
print( pd.DataFrame(xgb_pred2).head() )



##### COMBINE XGBOOST RESULTS

xgb_pred = XGB1_WEIGHT*xgb_pred1 + (1-XGB1_WEIGHT)*xgb_pred2
#xgb_pred = xgb_pred1

print( "\nCombined XGBoost predictions:" )
print( pd.DataFrame(xgb_pred).head() )

del train_df
del x_train
del x_test
del properties
del dtest
del dtrain
del xgb_pred1
del xgb_pred2 
gc.collect()


Re-reading properties file ...
/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (49) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Processing data for XGBoost ...
Shape train: (77613, 57)
Shape test: (2985217, 57)
After removing outliers:
Shape train: (75949, 57)
Shape test: (2985217, 57)

Setting up data for XGBoost ...
num_boost_rounds=250

Training XGBoost ...

Predicting with XGBoost ...

First XGBoost predictions:
          0
0  0.064053
1  0.035960
2 -0.075041
3 -0.003773
4 -0.001748

Setting up data for XGBoost ...
num_boost_rounds=150

Training XGBoost again ...

Predicting with XGBoost again ...

Second XGBoost predictions:
          0
0  0.052026
1  0.041535
2 -0.086148
3 -0.024098
4 -0.000185

Combined XGBoost predictions:
          0
0  0.061747
1  0.037029
2 -0.077170
3 -0.007669
4 -0.001449
Out[21]:
194

In [22]:
################
################
##    OLS     ##
################
################

# This section is derived from the1owl's notebook:
#    https://www.kaggle.com/the1owl/primer-for-the-zillow-pred-approach
# which I (Andy Harless) updated and made into a script:
#    https://www.kaggle.com/aharless/updated-script-version-of-the1owl-s-basic-ols
np.random.seed(17)
random.seed(17)

train = pd.read_csv("../Data/train_2017.csv", parse_dates=["transactiondate"])
properties = pd.read_csv("../Data/properties_2017.csv")
submission = pd.read_csv("../Data/sample_submission.csv")
print(len(train),len(properties),len(submission))

def get_features(df):
    df["transactiondate"] = pd.to_datetime(df["transactiondate"])
    df["transactiondate_year"] = df["transactiondate"].dt.year
    df["transactiondate_month"] = df["transactiondate"].dt.month
    df['transactiondate'] = df['transactiondate'].dt.quarter
    df = df.fillna(-1.0)
    return df

def MAE(y, ypred):
    #logerror=log(Zestimate)−log(SalePrice)
    return np.sum([abs(y[i]-ypred[i]) for i in range(len(y))]) / len(y)

train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
test = pd.merge(submission, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = [] #memory

exc = [train.columns[c] for c in range(len(train.columns)) if train.dtypes[c] == 'O'] + ['logerror','parcelid']
col = [c for c in train.columns if c not in exc]

train = get_features(train[col])
test['transactiondate'] = '2016-01-01' #should use the most common training date
test = get_features(test[col])

reg = LinearRegression(n_jobs=-1)
reg.fit(train, y); print('fit...')
print(MAE(y, reg.predict(train)))
train = [];  y = [] #memory

test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']


/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (49) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
77613 2985217 2985217
/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:22: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
fit...
0.0709592335615

In [23]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.noise import GaussianDropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer


######################
######################
##  Neural Network  ##
######################
######################

# Neural network copied from this script:
#   https://www.kaggle.com/aharless/keras-neural-network-lb-06492 (version 20)
# which was built on the skeleton in this notebook:
#   https://www.kaggle.com/prasunmishra/ann-using-keras


# Read in data for neural network
print( "\n\nProcessing data for Neural Network ...")
print('\nLoading train, prop and sample data...')
train = pd.read_csv("../Data/train_2017.csv", parse_dates=["transactiondate"])
prop = pd.read_csv('../Data/properties_2017.csv')
sample = pd.read_csv('../Data/sample_submission.csv')
 
print('Fitting Label Encoder on properties...')
for c in prop.columns:
    prop[c]=prop[c].fillna(-1)
    if prop[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(prop[c].values))
        prop[c] = lbl.transform(list(prop[c].values))
        
print('Creating training set...')
df_train = train.merge(prop, how='left', on='parcelid')

df_train["transactiondate"] = pd.to_datetime(df_train["transactiondate"])
df_train["transactiondate_year"] = df_train["transactiondate"].dt.year
df_train["transactiondate_month"] = df_train["transactiondate"].dt.month
df_train['transactiondate_quarter'] = df_train['transactiondate'].dt.quarter
df_train["transactiondate"] = df_train["transactiondate"].dt.day

print('Filling NA/NaN values...' )
df_train.fillna(-1.0)

print('Creating x_train and y_train from df_train...' )
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode','fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train["logerror"]

y_mean = np.mean(y_train)
print(x_train.shape, y_train.shape)
train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

print('Creating df_test...')
sample['parcelid'] = sample['ParcelId']

print("Merging Sample with property data...")
df_test = sample.merge(prop, on='parcelid', how='left')

df_test["transactiondate"] = pd.to_datetime('2016-11-15')  # placeholder value for preliminary version
df_test["transactiondate_year"] = df_test["transactiondate"].dt.year
df_test["transactiondate_month"] = df_test["transactiondate"].dt.month
df_test['transactiondate_quarter'] = df_test['transactiondate'].dt.quarter
df_test["transactiondate"] = df_test["transactiondate"].dt.day     
x_test = df_test[train_columns]

print('Shape of x_test:', x_test.shape)
print("Preparing x_test...")
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

    
## Preprocessing
print("\nPreprocessing neural network data...")
imputer= Imputer()
imputer.fit(x_train.iloc[:, :])
x_train = imputer.transform(x_train.iloc[:, :])
imputer.fit(x_test.iloc[:, :])
x_test = imputer.transform(x_test.iloc[:, :])

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

len_x=int(x_train.shape[1])
print("len_x is:",len_x)


# Neural Network
print("\nSetting up neural network model...")
nn = Sequential()
nn.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = len_x))
nn.add(PReLU())
nn.add(Dropout(.4))
nn.add(Dense(units = 160 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(units = 64 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.5))
nn.add(Dense(units = 26, kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(1, kernel_initializer='normal'))
nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))

print("\nFitting neural network model...")
nn.fit(np.array(x_train), np.array(y_train), batch_size = 32, epochs = 70, verbose=2)

print("\nPredicting with neural network model...")
#print("x_test.shape:",x_test.shape)
y_pred_ann = nn.predict(x_test)

print( "\nPreparing results for write..." )
nn_pred = y_pred_ann.flatten()
print( "Type of nn_pred is ", type(nn_pred) )
print( "Shape of nn_pred is ", nn_pred.shape )

print( "\nNeural Network predictions:" )
print( pd.DataFrame(nn_pred).head() )


Using TensorFlow backend.

Processing data for Neural Network ...

Loading train, prop and sample data...
/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (49) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Fitting Label Encoder on properties...
Creating training set...
Filling NA/NaN values...
Creating x_train and y_train from df_train...
(77613, 56) (77613,)
Creating df_test...
Merging Sample with property data...
Shape of x_test: (2985217, 56)
Preparing x_test...

Preprocessing neural network data...
len_x is: 56

Setting up neural network model...

Fitting neural network model...
Epoch 1/70
17s - loss: 0.0731
Epoch 2/70
15s - loss: 0.0698
Epoch 3/70
14s - loss: 0.0697
Epoch 4/70
14s - loss: 0.0697
Epoch 5/70
15s - loss: 0.0697
Epoch 6/70
17s - loss: 0.0696
Epoch 7/70
17s - loss: 0.0696
Epoch 8/70
17s - loss: 0.0696
Epoch 9/70
18s - loss: 0.0695
Epoch 10/70
18s - loss: 0.0695
Epoch 11/70
18s - loss: 0.0695
Epoch 12/70
17s - loss: 0.0694
Epoch 13/70
18s - loss: 0.0694
Epoch 14/70
18s - loss: 0.0694
Epoch 15/70
17s - loss: 0.0694
Epoch 16/70
16s - loss: 0.0693
Epoch 17/70
17s - loss: 0.0693
Epoch 18/70
16s - loss: 0.0693
Epoch 19/70
16s - loss: 0.0693
Epoch 20/70
17s - loss: 0.0693
Epoch 21/70
17s - loss: 0.0693
Epoch 22/70
18s - loss: 0.0693
Epoch 23/70
18s - loss: 0.0692
Epoch 24/70
18s - loss: 0.0693
Epoch 25/70
17s - loss: 0.0693
Epoch 26/70
19s - loss: 0.0693
Epoch 27/70
23s - loss: 0.0692
Epoch 28/70
24s - loss: 0.0692
Epoch 29/70
21s - loss: 0.0692
Epoch 30/70
21s - loss: 0.0692
Epoch 31/70
24s - loss: 0.0692
Epoch 32/70
23s - loss: 0.0692
Epoch 33/70
22s - loss: 0.0692
Epoch 34/70
24s - loss: 0.0692
Epoch 35/70
20s - loss: 0.0692
Epoch 36/70
23s - loss: 0.0691
Epoch 37/70
19s - loss: 0.0692
Epoch 38/70
17s - loss: 0.0691
Epoch 39/70
18s - loss: 0.0692
Epoch 40/70
18s - loss: 0.0692
Epoch 41/70
16s - loss: 0.0691
Epoch 42/70
16s - loss: 0.0691
Epoch 43/70
16s - loss: 0.0691
Epoch 44/70
16s - loss: 0.0691
Epoch 45/70
19s - loss: 0.0691
Epoch 46/70
22s - loss: 0.0691
Epoch 47/70
24s - loss: 0.0691
Epoch 48/70
22s - loss: 0.0691
Epoch 49/70
20s - loss: 0.0691
Epoch 50/70
21s - loss: 0.0691
Epoch 51/70
23s - loss: 0.0691
Epoch 52/70
22s - loss: 0.0691
Epoch 53/70
22s - loss: 0.0691
Epoch 54/70
21s - loss: 0.0691
Epoch 55/70
20s - loss: 0.0691
Epoch 56/70
20s - loss: 0.0691
Epoch 57/70
18s - loss: 0.0691
Epoch 58/70
17s - loss: 0.0691
Epoch 59/70
17s - loss: 0.0690
Epoch 60/70
18s - loss: 0.0690
Epoch 61/70
17s - loss: 0.0691
Epoch 62/70
18s - loss: 0.0690
Epoch 63/70
17s - loss: 0.0690
Epoch 64/70
16s - loss: 0.0690
Epoch 65/70
17s - loss: 0.0690
Epoch 66/70
17s - loss: 0.0690
Epoch 67/70
18s - loss: 0.0690
Epoch 68/70
26s - loss: 0.0690
Epoch 69/70
21s - loss: 0.0690
Epoch 70/70
28s - loss: 0.0690

Predicting with neural network model...

Preparing results for write...
Type of nn_pred is  <class 'numpy.ndarray'>
Shape of nn_pred is  (2985217,)

Neural Network predictions:
          0
0  0.011157
1  0.008576
2  0.326974
3 -0.092062
4 -0.074120

In [24]:
########################
########################
##  Combine and Save  ##
########################
########################

##### COMBINE PREDICTIONS
print( "\nCombining XGBoost, LightGBM, and baseline predicitons ..." )
lgb_weight = (1 - XGB_WEIGHT - BASELINE_WEIGHT) / (1 - OLS_WEIGHT)
xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT)
baseline_weight0 =  BASELINE_WEIGHT / (1 - OLS_WEIGHT)
pred0 = xgb_weight0*xgb_pred + baseline_weight0*BASELINE_PRED + lgb_weight*p_test

print( "\nCombined XGB/LGB/baseline predictions:" )
print( pd.DataFrame(pred0).head() )

print( "\nPredicting with OLS and combining with XGB/LGB/baseline predicitons: ..." )
for i in range(len(test_dates)):
    test['transactiondate'] = test_dates[i]
    pred = OLS_WEIGHT*reg.predict(get_features(test)) + (1-OLS_WEIGHT)*pred0
    submission[test_columns[i]] = [float(format(x, '.4f')) for x in pred]
    print('predict...', i)

print( "\nCombined XGB/LGB/baseline/OLS predictions:" )
print( submission.head() )



##### WRITE THE RESULTS
from datetime import datetime
print( "\nWriting results to disk ..." )
submission.to_csv('sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False)
print( "\nFinished ...")


Combining XGBoost, LightGBM, and baseline predicitons ...

Combined XGB/LGB/baseline predictions:
          0
0  0.053542
1  0.036229
2 -0.046760
3  0.005141
4  0.006760

Predicting with OLS and combining with XGB/LGB/baseline predicitons: ...
predict... 0
predict... 1
predict... 2
predict... 3
predict... 4
predict... 5

Combined XGB/LGB/baseline/OLS predictions:
   ParcelId  201610  201611  201612  201710  201711  201712
0  10754147  0.0552  0.0552  0.0553  0.0552  0.0552  0.0553
1  10759547  0.0389  0.0389  0.0389  0.0389  0.0389  0.0389
2  10843547 -0.0141 -0.0141 -0.0141 -0.0141 -0.0141 -0.0141
3  10859147  0.0085  0.0085  0.0086  0.0085  0.0085  0.0086
4  10879947  0.0120  0.0120  0.0120  0.0120  0.0120  0.0120

Writing results to disk ...

Finished ...

In [25]:
catboostx = pd.read_csv('Only_CatBoost.csv')

In [28]:
new_prediction = catboostx.copy()

In [34]:
genetic = pd.read_csv('xxx.csv.gz', compression='gzip')

In [40]:
categories_x = ['201610','201611','201612','201710','201711','201712']
new_prediction[categories_x] = genetic[categories_x] * 0.3 + (catboostx[categories_x]) * 0.5 + (submission[categories_x] * 0.2)

In [41]:
new_prediction.to_csv('genetic_catboost_ensemble_3_5_2.csv', index=False, float_format='%.6f')

In [ ]:


In [3]:
def plot_data(test, pred, sample, title, width=40, height=10, linewidth=0.5, color1='white', color2='orange'):
    """ Plotting method. """
    fig = plt.figure(figsize=(width, height))
    plt.plot(pred[:sample], color=color1, zorder=4, linewidth=linewidth, label='%s Prediction'%(title))
    plt.plot(test[:sample], color=color2, zorder=3, linewidth=linewidth, label='%s True Data'%(title))
    plt.title = title
    plt.legend()

# Frequency count
def get_frequency(data):
    # Gets the frequency of a column's values in 'data'. Pass on a series.
    vals = pd.merge(data.to_frame(), data.value_counts().reset_index(), 
                    how='left', left_on=data.to_frame().columns[0], right_on='index').iloc[:, -1:].values
    return vals
  
def time_data(data):
    data['transactiondate'] = pd.to_datetime(data['transactiondate'])
    data['day_of_week']     = data['transactiondate'].dt.dayofweek
    data['month_of_year']   = data['transactiondate'].dt.month
    data['quarter']         = data['transactiondate'].dt.quarter
    data['is_weekend']      = (data['day_of_week'] < 5).astype(int)
    data.drop('transactiondate', axis=1, inplace=True)
    
    print('Added time data')
    print('........')
    
    return data


def column_excluder(data, missing_perc_thresh=0.98):
    # Quick clean from https://www.kaggle.com/seesee/concise-catboost-starter-ensemble-plb-0-06435
    
    exclude_missing = []
    exclude_unique = []
    num_rows = data.shape[0]
    for c in data.columns:
        num_missing = data[c].isnull().sum()
        if num_missing == 0:
            continue
        missing_frac = num_missing / float(num_rows)
        if missing_frac > missing_perc_thresh:
            exclude_missing.append(c)

        num_uniques = len(data[c].unique())
        if data[c].isnull().sum() != 0:
            num_uniques -= 1
        if num_uniques == 1:
            exclude_unique.append(c)
            
    to_exclude = list(set(exclude_missing + exclude_unique))
    
    print('Excluded columns:')
    print(to_exclude)
    print('........')
    
    return to_exclude

def categorical_features(data):
    # Quick categories from https://www.kaggle.com/seesee/concise-catboost-starter-ensemble-plb-0-06435
        
    cat_feature_inds = []
    cat_unique_thresh = 1000
    for i, c in enumerate(data.columns):
        num_uniques = len(data[c].unique())
        if num_uniques < cat_unique_thresh \
            and not 'sqft'   in c \
            and not 'cnt'    in c \
            and not 'nbr'    in c \
            and not 'number' in c:
            cat_feature_inds.append(i)

    print("Categorical features:")
    print([data.columns[ind] for ind in cat_feature_inds])
    print('........')
    
    return cat_feature_inds


def complex_features(data):
    # Gets counts, label encoding and frequency estimates.
    
    # Frequency of occurances | length of codes | check if * is present
    data['propertyzoningdesc_frq'] = get_frequency(data['propertyzoningdesc'])
    data['propertyzoningdesc_len'] = data['propertyzoningdesc'].apply(lambda x: len(str(x)) if pd.notnull(x) else x)
    #transactions_shuffled['propertyzoningdesc_str'] = transactions_shuffled['propertyzoningdesc'].apply(lambda x: (1 if '*' in str(x) else 0) if pd.notnull(x) else x)

    # Label encoding | length of code
    #transactions_shuffled['propertycountylandusecode_enc'] = transactions_shuffled[['propertycountylandusecode']].astype(str).apply(LabelEncoder().fit_transform)
    #transactions_shuffled['propertycountylandusecode_len'] = transactions_shuffled['propertycountylandusecode'].apply(lambda x: x if pd.isnull(x) else len(x))

    # Zip code area extraction
    data['regionidzip_ab']  = data['regionidzip'].apply(lambda x: x if pd.isnull(x) else str(x)[:2]).astype(float)
    data['regionidzip_abc'] = data['regionidzip'].apply(lambda x: x if pd.isnull(x) else str(x)[:3]).astype(float)

    # Region neighbourhood area extraction
    data['regionidneighborhood_ab'] = data['regionidneighborhood'].apply(lambda x: str(x)[:2] if pd.notnull(x) else x).astype(float)

    # Rawcensustractandblock transformed
    data['code_fips_cnt']  = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[:4]))
    data['code_tract_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[4:11]))
    data['code_block_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[11:]))
    data.drop('rawcensustractandblock', axis=1, inplace=True)
    
    # Encode string values
    data[['propertycountylandusecode', 'propertyzoningdesc']] = data[['propertycountylandusecode', 'propertyzoningdesc']].astype(str).apply(LabelEncoder().fit_transform)
    
    print('Generating complex features')
    print('........')
    
    return data

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostRegressor
from tqdm import tqdm
import gc
import datetime as dt

print('Loading Properties ...')
properties2016 = pd.read_csv('../Data/properties_2016.csv', low_memory = False)
properties2017 = pd.read_csv('../Data/properties_2017.csv', low_memory = False)

print('Loading Train ...')
train2016 = pd.read_csv('../Data/train_2016_v2.csv', parse_dates=['transactiondate'], low_memory=False)
train2017 = pd.read_csv('../Data/train_2017.csv', parse_dates=['transactiondate'], low_memory=False)

def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = (df["transactiondate"].dt.year - 2016)*12 + df["transactiondate"].dt.month
    df["transaction_day"] = df["transactiondate"].dt.day
    df["transaction_quarter"] = (df["transactiondate"].dt.year - 2016)*4 +df["transactiondate"].dt.quarter
    df.drop(["transactiondate"], inplace=True, axis=1)
    return df

#train2016 = add_date_features(train2016)
#train2017 = add_date_features(train2017)


print('Loading Sample ...')
sample_submission = pd.read_csv('../Data/sample_submission.csv', low_memory = False)

print('Merge Train with Properties ...')
train2016 = pd.merge(train2016, properties2016, how = 'left', on = 'parcelid')
train2017 = pd.merge(train2017, properties2017, how = 'left', on = 'parcelid')

# Datetime transformation
train2017 = time_data(train2017)
train2017 = complex_features(train2017)

train2016 = time_data(train2016)
train2016 = complex_features(train2016)

print('Tax Features 2017  ...')
#train2017.iloc[:, train2017.columns.str.startswith('tax')] = np.nan

print('Concat Train 2016 & 2017 ...')
train_df = pd.concat([train2016, train2017], axis = 0)
test_df = pd.merge(sample_submission[['ParcelId']], properties2016.rename(columns = {'parcelid': 'ParcelId'}), how = 'left', on = 'ParcelId')

del properties2016, properties2017, train2016, train2017
gc.collect();

print('Remove missing data fields ...')

missing_perc_thresh = 0.98
exclude_missing = []
num_rows = train_df.shape[0]
for c in train_df.columns:
    num_missing = train_df[c].isnull().sum()
    if num_missing == 0:
        continue
    missing_frac = num_missing / float(num_rows)
    if missing_frac > missing_perc_thresh:
        exclude_missing.append(c)
print("We exclude: %s" % len(exclude_missing))

del num_rows, missing_perc_thresh
gc.collect();

print ("Remove features with one unique value !!")
exclude_unique = []
for c in train_df.columns:
    num_uniques = len(train_df[c].unique())
    if train_df[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1:
        exclude_unique.append(c)
print("We exclude: %s" % len(exclude_unique))

print ("Define training features !!")
exclude_other = ['parcelid', 'logerror','propertyzoningdesc']
train_features = []
for c in train_df.columns:
    if c not in exclude_missing \
       and c not in exclude_other and c not in exclude_unique:
        train_features.append(c)
print("We use these for training: %s" % len(train_features))

print ("Define categorial features !!")
cat_feature_inds = []
cat_unique_thresh = 1000
for i, c in enumerate(train_features):
    num_uniques = len(train_df[c].unique())
    if num_uniques < cat_unique_thresh \
        and not 'sqft' in c \
        and not 'cnt' in c \
        and not 'nbr' in c \
        and not 'number' in c:
        cat_feature_inds.append(i)
        
print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

print ("Replacing NaN values by -999 !!")
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

print ("Training time !!")
X_train = train_df[train_features]
y_train = train_df.logerror
print(X_train.shape, y_train.shape)

submission = pd.DataFrame({
    'ParcelId': test_df['ParcelId'],
})
test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
    '201710': pd.Timestamp('2017-09-30'),
    '201711': pd.Timestamp('2017-10-31'),
    '201712': pd.Timestamp('2017-11-30')
}

test_df = complex_features(test_df)


Loading Properties ...
Loading Train ...
Loading Sample ...
Merge Train with Properties ...
Added time data
........
Generating complex features
........
Added time data
........
Generating complex features
........
Tax Features 2017  ...
Concat Train 2016 & 2017 ...
Remove missing data fields ...
We exclude: 13
Remove features with one unique value !!
We exclude: 9
Define training features !!
We use these for training: 50
Define categorial features !!
Cat features are: ['airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'assessmentyear', 'taxdelinquencyyear', 'day_of_week', 'month_of_year', 'quarter', 'is_weekend', 'propertyzoningdesc_frq', 'propertyzoningdesc_len', 'regionidzip_ab', 'regionidzip_abc', 'regionidneighborhood_ab']
Replacing NaN values by -999 !!
Training time !!
(167888, 50) (167888,)
Generating complex features
........

In [5]:
num_ensembles = 5
for label, test_date in test_dates.items():
    submission[label] = 0

for i in tqdm(range(num_ensembles)):
    print('Training model', i)
    model = CatBoostRegressor(
        iterations=630, learning_rate=0.03,
        depth=6, l2_leaf_reg=3,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=i)
    
    model.fit(
        X_train, y_train,
        cat_features=cat_feature_inds)

    for label, test_date in test_dates.items():
        print("Predicting for: %s ... " % (label))
        test_df['transactiondate'] = test_date 
        test_df = time_data(test_df)
        X_test = test_df[train_features]

        submission[label] += model.predict(X_test)
    
#submission.to_csv('catboost_sample.csv', float_format='%.6f',index=False)


  0%|          | 0/5 [00:00<?, ?it/s]
Training model 0
Predicting for: 201610 ... 
Added time data
........
Predicting for: 201611 ... 
Added time data
........
Predicting for: 201612 ... 
Added time data
........
Predicting for: 201710 ... 
Added time data
........
Predicting for: 201711 ... 
Added time data
........
Predicting for: 201712 ... 
Added time data
........
 20%|██        | 1/5 [20:59<1:23:57, 1259.33s/it]
Training model 1
Predicting for: 201610 ... 
Added time data
........
Predicting for: 201611 ... 
Added time data
........
Predicting for: 201612 ... 
Added time data
........
Predicting for: 201710 ... 
Added time data
........
Predicting for: 201711 ... 
Added time data
........
Predicting for: 201712 ... 
Added time data
........
 40%|████      | 2/5 [41:13<1:01:50, 1236.92s/it]
Training model 2
Predicting for: 201610 ... 
Added time data
........
Predicting for: 201611 ... 
Added time data
........
Predicting for: 201612 ... 
Added time data
........
Predicting for: 201710 ... 
Added time data
........
Predicting for: 201711 ... 
Added time data
........
Predicting for: 201712 ... 
Added time data
........
 60%|██████    | 3/5 [1:04:21<42:54, 1287.08s/it]
Training model 3
Predicting for: 201610 ... 
Added time data
........
Predicting for: 201611 ... 
Added time data
........
Predicting for: 201612 ... 
Added time data
........
Predicting for: 201710 ... 
Added time data
........
Predicting for: 201711 ... 
Added time data
........
Predicting for: 201712 ... 
Added time data
........
 80%|████████  | 4/5 [1:25:22<21:20, 1280.68s/it]
Training model 4
Predicting for: 201610 ... 
Added time data
........
Predicting for: 201611 ... 
Added time data
........
Predicting for: 201612 ... 
Added time data
........
Predicting for: 201710 ... 
Added time data
........
Predicting for: 201711 ... 
Added time data
........
Predicting for: 201712 ... 
Added time data
100%|██████████| 5/5 [1:45:54<00:00, 1270.95s/it]
........


In [ ]:
submission.to_csv('catboost_sample.csv', float_format='%.6f',index=False)

In [9]:
(submission[list(test_dates.keys())]/5).to_csv('catboost_sample.csv', float_format='%.6f',index=False)

In [15]:
new = pd.DataFrame()
new['ParcelId'] = submission['ParcelId']
new[list(test_dates.keys())] = (submission[list(test_dates.keys())]/5)

In [17]:
new.to_csv('catboost_sample.csv', float_format='%.6f',index=False)

In [ ]: