In [ ]:
# submit history
# initial submit : 0.0642702
In [3]:
# imports
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from sklearn.linear_model import LinearRegression
import random
import datetime as dt
################
################
## LightGBM changes ##
# V42 - sub_feature: 0.3 -> 0.35 : LB = 0.0643759
# V34 - sub_feature: 0.5 -> 0.42
# V33 - sub_feature: 0.5 -> 0.45 : LB = 0.0643866
# - sub_feature: 0.45 -> 0.3 : LB = 0.0643811 / 0.0643814
################
################
# Parameters
XGB_WEIGHT = 0.6515
BASELINE_WEIGHT = 0.0056
OLS_WEIGHT = 0.0828
XGB1_WEIGHT = 0.8083 # Weight of first in combination of two XGB models
BASELINE_PRED = 0.0115 # Baseline based on mean of training data, per Oleg
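# Sanity check (derived from the blending code below): LightGBM gets whatever
# weight remains after XGB, the baseline, and OLS take their shares.
print("Implied LightGBM weight: %.4f" % (1 - XGB_WEIGHT - BASELINE_WEIGHT - OLS_WEIGHT))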
##### READ IN RAW DATA
print( "\nReading data from disk ...")
prop = pd.read_csv('input/properties_2016.csv')
train = pd.read_csv("input/train_2016_v2.csv")
from six.moves import cPickle as pickle
def make_pickle(file_name, data, force=False):
    import os
    if not os.path.exists("pickle"):
        os.makedirs("pickle")
    if os.path.exists(file_name) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping pickling.' % file_name)
    else:
        print('Pickling %s.' % file_name)
        try:
            with open(file_name, 'wb') as f:
                pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', file_name, ':', e)
    return file_name
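# A minimal load counterpart to make_pickle, assuming the same pickle files;
# this helper is hypothetical (the cells below open the files inline instead).
def load_pickle(file_name):
    # Read back an object written by make_pickle.
    with open(file_name, 'rb') as f:
        return pickle.load(f)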
lgb_xgb_combined_pred_pickle_file_name = "pickle/zillowAbhishek_lgb_xgb_combined_pred.pickle"
catboost_pred_pickle_file_name = "pickle/zillowAbhishek_y_pred.pickle"
In [4]:
################
################
## LightGBM ##
################
################
##### PROCESS DATA FOR LIGHTGBM
print( "\nProcessing data for LightGBM ..." )
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)
df_train = train.merge(prop, how='left', on='parcelid')
df_train.fillna(df_train.median(), inplace=True)
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc',
'propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'], axis=1)
#x_train['Ratio_1'] = x_train['taxvaluedollarcnt']/x_train['taxamount']
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
train_columns = x_train.columns
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
del df_train; gc.collect()
x_train = x_train.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)
##### RUN LIGHTGBM
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1' # or 'mae'
params['sub_feature'] = 0.345
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512 # num_leaf
params['min_data'] = 500 # min_data_in_leaf
params['min_hessian'] = 0.05 # min_sum_hessian_in_leaf
params['verbose'] = 0
params['feature_fraction_seed'] = 2
params['bagging_seed'] = 3
print("\nFitting LightGBM model ...")
clf = lgb.train(params, d_train, 430)
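# A hedged sketch (not part of the original run): the fixed round count of 430
# could instead be validated on a holdout split with early stopping, e.g.:
# from sklearn.model_selection import train_test_split
# x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=0)
# d_tr = lgb.Dataset(x_tr, label=y_tr)
# d_val = lgb.Dataset(x_val, label=y_val, reference=d_tr)
# clf = lgb.train(params, d_tr, 2000, valid_sets=[d_val], early_stopping_rounds=50)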
del d_train; gc.collect()
del x_train; gc.collect()
print("\nPrepare for LightGBM prediction ...")
print(" Read sample file ...")
sample = pd.read_csv('input/sample_submission.csv')
print(" ...")
sample['parcelid'] = sample['ParcelId']
print(" Merge with property data ...")
df_test = sample.merge(prop, on='parcelid', how='left')
print(" ...")
del sample, prop; gc.collect()
print(" ...")
#df_test['Ratio_1'] = df_test['taxvaluedollarcnt']/df_test['taxamount']
x_test = df_test[train_columns]
print(" ...")
del df_test; gc.collect()
print(" Preparing x_test...")
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
print(" ...")
x_test = x_test.values.astype(np.float32, copy=False)
print("Test shape :", x_test.shape)
print("\nStart LightGBM prediction ...")
p_test = clf.predict(x_test)
del x_test; gc.collect()
print( "\nUnadjusted LightGBM predictions:" )
print( pd.DataFrame(p_test).head() )
In [5]:
################
################
## XGBoost ##
################
################
##### RE-READ PROPERTIES FILE
##### (I tried keeping a copy, but the program crashed.)
print( "\nRe-reading properties file ...")
properties = pd.read_csv('input/properties_2016.csv')
##### PROCESS DATA FOR XGBOOST
print( "\nProcessing data for XGBoost ...")
for c in properties.columns:
    properties[c] = properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))
train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)
# shape
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))
# drop outliers
train_df = train_df[train_df.logerror > -0.4]
train_df = train_df[train_df.logerror < 0.419]
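# A hedged alternative (assumption, not the original choice): the fixed
# -0.4/0.419 clip thresholds could be derived from the logerror distribution:
# lo, hi = train_df.logerror.quantile([0.005, 0.995])
# train_df = train_df[(train_df.logerror > lo) & (train_df.logerror < hi)]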
x_train=train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)
print('After removing outliers:')
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))
##### RUN XGBOOST
print("\nSetting up data for XGBoost ...")
# xgboost params
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,
    'alpha': 0.4,
    'base_score': y_mean,
    'silent': 1
}
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)
num_boost_rounds = 250
print("num_boost_rounds="+str(num_boost_rounds))
# train model
print( "\nTraining XGBoost ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)
print( "\nPredicting with XGBoost ...")
xgb_pred1 = model.predict(dtest)
print( "\nFirst XGBoost predictions:" )
print( pd.DataFrame(xgb_pred1).head() )
##### RUN XGBOOST AGAIN
print("\nSetting up data for XGBoost ...")
# xgboost params
xgb_params = {
    'eta': 0.033,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'silent': 1
}
num_boost_rounds = 150
print("num_boost_rounds="+str(num_boost_rounds))
print( "\nTraining XGBoost again ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)
print( "\nPredicting with XGBoost again ...")
xgb_pred2 = model.predict(dtest)
print( "\nSecond XGBoost predictions:" )
print( pd.DataFrame(xgb_pred2).head() )
##### COMBINE XGBOOST RESULTS
xgb_pred = XGB1_WEIGHT*xgb_pred1 + (1-XGB1_WEIGHT)*xgb_pred2
#xgb_pred = xgb_pred1
print( "\nCombined XGBoost predictions:" )
print( pd.DataFrame(xgb_pred).head() )
del train_df
del x_train
del x_test
del properties
del dtest
del dtrain
del xgb_pred1
del xgb_pred2
gc.collect()
Out[5]:
In [6]:
################
################
## OLS ##
################
################
# This section is derived from the1owl's notebook:
# https://www.kaggle.com/the1owl/primer-for-the-zillow-pred-approach
# which I (Andy Harless) updated and made into a script:
# https://www.kaggle.com/aharless/updated-script-version-of-the1owl-s-basic-ols
np.random.seed(17)
random.seed(17)
train = pd.read_csv("input/train_2016_v2.csv", parse_dates=["transactiondate"])
properties = pd.read_csv("input/properties_2016.csv")
submission = pd.read_csv("input/sample_submission.csv")
print(len(train),len(properties),len(submission))
def get_features(df):
    df["transactiondate"] = pd.to_datetime(df["transactiondate"])
    df["transactiondate_year"] = df["transactiondate"].dt.year
    df["transactiondate_month"] = df["transactiondate"].dt.month
    df['transactiondate'] = df['transactiondate'].dt.quarter
    df = df.fillna(-1.0)
    return df
def MAE(y, ypred):
    return np.sum([abs(y[i]-ypred[i]) for i in range(len(y))]) / len(y)
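# Note: an equivalent vectorized form of MAE (same result, faster) would be
# np.mean(np.abs(np.asarray(y) - np.asarray(ypred))).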
train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
test = pd.merge(submission, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = [] #memory
exc = [train.columns[c] for c in range(len(train.columns)) if train.dtypes[c] == 'O'] + ['logerror','parcelid']
col = [c for c in train.columns if c not in exc]
train = get_features(train[col])
test['transactiondate'] = '2016-01-01' #should use the most common training date
test = get_features(test[col])
reg = LinearRegression(n_jobs=-1)
reg.fit(train, y); print('fit...')
print(MAE(y, reg.predict(train)))
train = []; y = [] #memory
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']
In [7]:
########################
########################
## Combine and Save ##
########################
########################
##### COMBINE PREDICTIONS
print( "\nCombining XGBoost, LightGBM, and baseline predicitons ..." )
lgb_weight = (1 - XGB_WEIGHT - BASELINE_WEIGHT) / (1 - OLS_WEIGHT)
xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT)
baseline_weight0 = BASELINE_WEIGHT / (1 - OLS_WEIGHT)
pred0 = xgb_weight0*xgb_pred + baseline_weight0*BASELINE_PRED + lgb_weight*p_test
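# Why divide by (1 - OLS_WEIGHT) above: the OLS blend below computes
# OLS_WEIGHT*ols + (1-OLS_WEIGHT)*pred0, so scaling each weight by
# 1/(1-OLS_WEIGHT) here makes the effective final weights come out to exactly
# XGB_WEIGHT, BASELINE_WEIGHT, OLS_WEIGHT, and the LightGBM remainder.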
print( "\nCombined XGB/LGB/baseline predictions:" )
print( pd.DataFrame(pred0).head() )
print( "\nPredicting with OLS and combining with XGB/LGB/baseline predicitons: ..." )
# Note: with the submission assignment commented out, pred is overwritten each
# iteration, so only the last test date's blend survives this loop (and is
# what gets pickled below).
for i in range(len(test_dates)):
    test['transactiondate'] = test_dates[i]
    pred = OLS_WEIGHT*reg.predict(get_features(test)) + (1-OLS_WEIGHT)*pred0
    #submission[test_columns[i]] = [float(format(x, '.4f')) for x in pred]
    print('predict...', i)
#print( "\nCombined XGB/LGB/baseline/OLS predictions:" )
#print( submission.head() )
In [11]:
print(pred[:10])
lgb_xgb_combined_pred_pickle_file_name = "pickle/zillowAbhishek_lgb_xgb_combined_pred.pickle"
make_pickle(lgb_xgb_combined_pred_pickle_file_name, pred)
Out[11]:
In [12]:
with open(lgb_xgb_combined_pred_pickle_file_name, 'rb') as f:
    pred = pickle.load(f)
print(pred[:10])
In [15]:
################
################
##  CatBoost  ##
################
################
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from tqdm import tqdm
train_df = pd.read_csv('input/train_2016_v2.csv', parse_dates=['transactiondate'], low_memory=False)
test_df = pd.read_csv('input/sample_submission.csv', low_memory=False)
properties = pd.read_csv('input/properties_2016.csv', low_memory=False)
# field is named differently in submission
test_df['parcelid'] = test_df['ParcelId']
# similar to the1owl
def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = df["transactiondate"].dt.month
    df["transaction_day"] = df["transactiondate"].dt.day
    df["transaction_quarter"] = df["transactiondate"].dt.quarter
    df.drop(["transactiondate"], inplace=True, axis=1)
    return df
train_df = add_date_features(train_df)
train_df = train_df.merge(properties, how='left', on='parcelid')
test_df = test_df.merge(properties, how='left', on='parcelid')
print("Train: ", train_df.shape)
print("Test: ", test_df.shape)
missing_perc_thresh = 0.98
exclude_missing = []
num_rows = train_df.shape[0]
for c in train_df.columns:
    num_missing = train_df[c].isnull().sum()
    if num_missing == 0:
        continue
    missing_frac = num_missing / float(num_rows)
    if missing_frac > missing_perc_thresh:
        exclude_missing.append(c)
print("We exclude: %s" % exclude_missing)
print(len(exclude_missing))
# exclude where we only have one unique value :D
exclude_unique = []
for c in train_df.columns:
    num_uniques = len(train_df[c].unique())
    if train_df[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1:
        exclude_unique.append(c)
print("We exclude: %s" % exclude_unique)
print(len(exclude_unique))
exclude_other = ['parcelid', 'logerror'] # for indexing/training only
# do not know what this is LARS, 'SHCG' 'COR2YY' 'LNR2RPD-R3' ?!?
exclude_other.append('propertyzoningdesc')
train_features = []
for c in train_df.columns:
    if c not in exclude_missing \
            and c not in exclude_other and c not in exclude_unique:
        train_features.append(c)
print("We use these for training: %s" % train_features)
print(len(train_features))
cat_feature_inds = []
cat_unique_thresh = 1000
for i, c in enumerate(train_features):
    num_uniques = len(train_df[c].unique())
    if num_uniques < cat_unique_thresh \
            and 'sqft' not in c \
            and 'cnt' not in c \
            and 'nbr' not in c \
            and 'number' not in c:
        cat_feature_inds.append(i)
print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])
# some out of range int is a good choice
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)
X_train = train_df[train_features]
y_train = train_df.logerror
print(X_train.shape, y_train.shape)
test_df['transactiondate'] = pd.Timestamp('2016-12-01') # Dummy
test_df = add_date_features(test_df)
X_test = test_df[train_features]
print(X_test.shape)
In [2]:
import gc
num_ensembles = 8
y_pred = 0.0
for i in range(num_ensembles):
    print('start ' + str(i))
    # TODO(you): Use CV, tune hyperparameters
    model = CatBoostRegressor(
        iterations=200,
        learning_rate=0.03,
        depth=6,
        l2_leaf_reg=3,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=i,
        thread_count=7
    )
    print('model created')
    model.fit(
        X_train, y_train,
        cat_features=cat_feature_inds)
    print('fit finished')
    y_pred += model.predict(X_test)
    print('predict finished')
    del model
    gc.collect()
    print('gc finished')
y_pred /= num_ensembles
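# Averaging predictions from num_ensembles models that differ only in
# random_seed smooths out seed-to-seed variance; the per-model hyperparameters
# are identical, so this is a variance-reduction trick rather than stacking.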
In [4]:
make_pickle(catboost_pred_pickle_file_name, y_pred)
Out[4]:
In [13]:
with open(catboost_pred_pickle_file_name, 'rb') as f:
    y_pred = pickle.load(f)
print(y_pred[:10])
In [16]:
submission = pd.DataFrame({
    'ParcelId': test_df['parcelid'],
})
# https://www.kaggle.com/c/zillow-prize-1/discussion/33899, Oct,Nov,Dec
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']
catboost_weight = 0.7
lgb_xgb_weight = 0.3  # the two weights sum to 1
# Note: with the transactiondate line commented out, pred_final is identical
# across all six submission columns.
for i in range(len(test_dates)):
    # test['transactiondate'] = test_dates[i]
    pred_final = catboost_weight*y_pred + lgb_xgb_weight*pred
    submission[test_columns[i]] = [float(format(x, '.4f')) for x in pred_final]
    print('predict...', i)
print( "\nCombined XGB/LGB/baseline/OLS and catboost predictions:" )
print( submission.head() )
##### WRITE THE RESULTS
from datetime import datetime
print( "\nWriting results to disk ..." )
submission.to_csv('output/sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False)
print( "\nFinished ...")
In [ ]: