In [18]:
# https://www.kaggle.com/dragost/boosted-trees-lb-0-0643707/edit
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from sklearn.linear_model import LinearRegression
import random
import datetime as dt
from catboost import CatBoostRegressor
from tqdm import tqdm
################
################
## LightGBM changes ##
# V42 - sub_feature: 0.3 -> 0.35 : LB = 0.0643759
# V34 - sub_feature: 0.5 -> 0.42
# V33 - sub_feature: 0.5 -> 0.45 : LB = 0.0643866
# - sub_feature: 0.45 -> 0.3 : LB = 0.0643811 / 0.0643814
################
################
##### READ IN RAW DATA
print( "\nReading data from disk ...")
prop = pd.read_csv('../Data/properties_2017.csv')
train = pd.read_csv("../Data/train_2017.csv")
In [19]:
# Parameters
XGB_WEIGHT = 0.6500
BASELINE_WEIGHT = 0.0056
OLS_WEIGHT = 0.0828
XGB1_WEIGHT = 0.8083 # Weight of first in combination of two XGB models
BASELINE_PRED = 0.0115 # Baseline based on mean of training data, per Oleg
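For orientation (my own illustration, not part of the original kernel): in the "Combine and Save" cell near the end, the XGB, LightGBM and baseline weights are each divided by (1 - OLS_WEIGHT) inside pred0 and then scaled back by (1 - OLS_WEIGHT) when the OLS prediction is blended in, so the net contribution of each model works out to:
# Net weights implied by the combination formulas used later (illustrative only):
#   combined XGBoost:  XGB_WEIGHT                       = 0.6500
#   LightGBM:          1 - XGB_WEIGHT - BASELINE_WEIGHT = 0.3444
#   baseline constant: BASELINE_WEIGHT                  = 0.0056
#   OLS:               OLS_WEIGHT                       = 0.0828
# Within the combined XGBoost term, xgb_pred1 carries XGB1_WEIGHT = 0.8083 and xgb_pred2 the remainder.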
In [20]:
################
################
## LightGBM ##
################
################
##### PROCESS DATA FOR LIGHTGBM
print( "\nProcessing data for LightGBM ..." )
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)
df_train = train.merge(prop, how='left', on='parcelid')
df_train.fillna(df_train.median(),inplace = True)
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc',
'propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'], axis=1)
#x_train['Ratio_1'] = x_train['taxvaluedollarcnt']/x_train['taxamount']
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
train_columns = x_train.columns
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
del df_train; gc.collect()
x_train = x_train.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)
##### RUN LIGHTGBM
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1' # or 'mae'
params['sub_feature'] = 0.345
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512 # num_leaf
params['min_data'] = 500 # min_data_in_leaf
params['min_hessian'] = 0.05 # min_sum_hessian_in_leaf
params['verbose'] = 0
#params['feature_fraction_seed'] = 2
#params['bagging_seed'] = 3
print("\nFitting LightGBM model ...")
clf = lgb.train(params, d_train, 430)
del d_train; gc.collect()
del x_train; gc.collect()
print("\nPrepare for LightGBM prediction ...")
print(" Read sample file ...")
sample = pd.read_csv('../Data/sample_submission.csv')
print(" ...")
sample['parcelid'] = sample['ParcelId']
print(" Merge with property data ...")
df_test = sample.merge(prop, on='parcelid', how='left')
print(" ...")
del sample, prop; gc.collect()
print(" ...")
#df_test['Ratio_1'] = df_test['taxvaluedollarcnt']/df_test['taxamount']
x_test = df_test[train_columns]
print(" ...")
del df_test; gc.collect()
print(" Preparing x_test...")
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
print(" ...")
x_test = x_test.values.astype(np.float32, copy=False)
print("Test shape :", x_test.shape)
print("\nStart LightGBM prediction ...")
p_test = clf.predict(x_test)
del x_test; gc.collect()
print( "\nUnadjusted LightGBM predictions:" )
print( pd.DataFrame(p_test).head() )
In [21]:
################
################
## XGBoost ##
################
################
##### RE-READ PROPERTIES FILE
##### (I tried keeping a copy, but the program crashed.)
print( "\nRe-reading properties file ...")
properties = pd.read_csv('../Data/properties_2017.csv')
##### PROCESS DATA FOR XGBOOST
print( "\nProcessing data for XGBoost ...")
for c in properties.columns:
    properties[c] = properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))
train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)
# shape
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))
# drop outliers
train_df=train_df[ train_df.logerror > -0.4 ]
train_df=train_df[ train_df.logerror < 0.419 ]
x_train=train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)
print('After removing outliers:')
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))
##### RUN XGBOOST
print("\nSetting up data for XGBoost ...")
# xgboost params
xgb_params = {
'eta': 0.037,
'max_depth': 5,
'subsample': 0.80,
'objective': 'reg:linear',
'eval_metric': 'mae',
'lambda': 0.8,
'alpha': 0.4,
'base_score': y_mean,
'silent': 1
}
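# (Note: 'reg:linear' and 'silent' follow the older XGBoost API this kernel targets;
#  newer releases rename them 'reg:squarederror' and 'verbosity'.)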
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)
num_boost_rounds = 250
print("num_boost_rounds="+str(num_boost_rounds))
# train model
print( "\nTraining XGBoost ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)
print( "\nPredicting with XGBoost ...")
xgb_pred1 = model.predict(dtest)
print( "\nFirst XGBoost predictions:" )
print( pd.DataFrame(xgb_pred1).head() )
##### RUN XGBOOST AGAIN
print("\nSetting up data for XGBoost ...")
# xgboost params
xgb_params = {
'eta': 0.033,
'max_depth': 6,
'subsample': 0.80,
'objective': 'reg:linear',
'eval_metric': 'mae',
'base_score': y_mean,
'silent': 1
}
num_boost_rounds = 150
print("num_boost_rounds="+str(num_boost_rounds))
print( "\nTraining XGBoost again ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)
print( "\nPredicting with XGBoost again ...")
xgb_pred2 = model.predict(dtest)
print( "\nSecond XGBoost predictions:" )
print( pd.DataFrame(xgb_pred2).head() )
##### COMBINE XGBOOST RESULTS
xgb_pred = XGB1_WEIGHT*xgb_pred1 + (1-XGB1_WEIGHT)*xgb_pred2
#xgb_pred = xgb_pred1
print( "\nCombined XGBoost predictions:" )
print( pd.DataFrame(xgb_pred).head() )
del train_df
del x_train
del x_test
del properties
del dtest
del dtrain
del xgb_pred1
del xgb_pred2
gc.collect()
Out[21]:
In [22]:
################
################
## OLS ##
################
################
# This section is derived from the1owl's notebook:
# https://www.kaggle.com/the1owl/primer-for-the-zillow-pred-approach
# which I (Andy Harless) updated and made into a script:
# https://www.kaggle.com/aharless/updated-script-version-of-the1owl-s-basic-ols
np.random.seed(17)
random.seed(17)
train = pd.read_csv("../Data/train_2017.csv", parse_dates=["transactiondate"])
properties = pd.read_csv("../Data/properties_2017.csv")
submission = pd.read_csv("../Data/sample_submission.csv")
print(len(train),len(properties),len(submission))
def get_features(df):
    df["transactiondate"] = pd.to_datetime(df["transactiondate"])
    df["transactiondate_year"] = df["transactiondate"].dt.year
    df["transactiondate_month"] = df["transactiondate"].dt.month
    df['transactiondate'] = df['transactiondate'].dt.quarter
    df = df.fillna(-1.0)
    return df
def MAE(y, ypred):
    # logerror = log(Zestimate) - log(SalePrice)
    # (equivalent to the vectorized np.mean(np.abs(np.asarray(y) - np.asarray(ypred))))
    return np.sum([abs(y[i] - ypred[i]) for i in range(len(y))]) / len(y)
train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
test = pd.merge(submission, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = [] #memory
exc = [train.columns[c] for c in range(len(train.columns)) if train.dtypes[c] == 'O'] + ['logerror','parcelid']
col = [c for c in train.columns if c not in exc]
train = get_features(train[col])
test['transactiondate'] = '2016-01-01' #should use the most common training date
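# (Hypothetical sketch of the suggestion above: the most common training month could be read off
#  before get_features() overwrites 'transactiondate', e.g.
#      train_raw['transactiondate'].dt.to_period('M').value_counts().idxmax()
#  where train_raw would be a copy of the raw training frame; the hard-coded date is kept here.)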
test = get_features(test[col])
reg = LinearRegression(n_jobs=-1)
reg.fit(train, y); print('fit...')
print(MAE(y, reg.predict(train)))
train = []; y = [] #memory
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']
In [23]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.noise import GaussianDropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
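# (Note: sklearn.preprocessing.Imputer reflects the older scikit-learn API this kernel was written
#  against; scikit-learn >= 0.22 removes it in favour of sklearn.impute.SimpleImputer. The keras.*
#  paths above likewise assume the standalone Keras 2.x API of that era.)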
######################
######################
## Neural Network ##
######################
######################
# Neural network copied from this script:
# https://www.kaggle.com/aharless/keras-neural-network-lb-06492 (version 20)
# which was built on the skeleton in this notebook:
# https://www.kaggle.com/prasunmishra/ann-using-keras
# Read in data for neural network
print( "\n\nProcessing data for Neural Network ...")
print('\nLoading train, prop and sample data...')
train = pd.read_csv("../Data/train_2017.csv", parse_dates=["transactiondate"])
prop = pd.read_csv('../Data/properties_2017.csv')
sample = pd.read_csv('../Data/sample_submission.csv')
print('Fitting Label Encoder on properties...')
for c in prop.columns:
    prop[c] = prop[c].fillna(-1)
    if prop[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(prop[c].values))
        prop[c] = lbl.transform(list(prop[c].values))
print('Creating training set...')
df_train = train.merge(prop, how='left', on='parcelid')
df_train["transactiondate"] = pd.to_datetime(df_train["transactiondate"])
df_train["transactiondate_year"] = df_train["transactiondate"].dt.year
df_train["transactiondate_month"] = df_train["transactiondate"].dt.month
df_train['transactiondate_quarter'] = df_train['transactiondate'].dt.quarter
df_train["transactiondate"] = df_train["transactiondate"].dt.day
print('Filling NA/NaN values...' )
df_train = df_train.fillna(-1.0)
print('Creating x_train and y_train from df_train...' )
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode','fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train["logerror"]
y_mean = np.mean(y_train)
print(x_train.shape, y_train.shape)
train_columns = x_train.columns
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
print('Creating df_test...')
sample['parcelid'] = sample['ParcelId']
print("Merging Sample with property data...")
df_test = sample.merge(prop, on='parcelid', how='left')
df_test["transactiondate"] = pd.to_datetime('2016-11-15') # placeholder value for preliminary version
df_test["transactiondate_year"] = df_test["transactiondate"].dt.year
df_test["transactiondate_month"] = df_test["transactiondate"].dt.month
df_test['transactiondate_quarter'] = df_test['transactiondate'].dt.quarter
df_test["transactiondate"] = df_test["transactiondate"].dt.day
x_test = df_test[train_columns]
print('Shape of x_test:', x_test.shape)
print("Preparing x_test...")
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
## Preprocessing
print("\nPreprocessing neural network data...")
imputer= Imputer()
imputer.fit(x_train.iloc[:, :])
x_train = imputer.transform(x_train.iloc[:, :])
imputer.fit(x_test.iloc[:, :])
x_test = imputer.transform(x_test.iloc[:, :])
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
len_x=int(x_train.shape[1])
print("len_x is:",len_x)
# Neural Network
print("\nSetting up neural network model...")
nn = Sequential()
nn.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = len_x))
nn.add(PReLU())
nn.add(Dropout(.4))
nn.add(Dense(units = 160 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(units = 64 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.5))
nn.add(Dense(units = 26, kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(1, kernel_initializer='normal'))
nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
print("\nFitting neural network model...")
nn.fit(np.array(x_train), np.array(y_train), batch_size = 32, epochs = 70, verbose=2)
print("\nPredicting with neural network model...")
#print("x_test.shape:",x_test.shape)
y_pred_ann = nn.predict(x_test)
print( "\nPreparing results for write..." )
nn_pred = y_pred_ann.flatten()
print( "Type of nn_pred is ", type(nn_pred) )
print( "Shape of nn_pred is ", nn_pred.shape )
print( "\nNeural Network predictions:" )
print( pd.DataFrame(nn_pred).head() )
In [24]:
########################
########################
## Combine and Save ##
########################
########################
##### COMBINE PREDICTIONS
print( "\nCombining XGBoost, LightGBM, and baseline predicitons ..." )
lgb_weight = (1 - XGB_WEIGHT - BASELINE_WEIGHT) / (1 - OLS_WEIGHT)
xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT)
baseline_weight0 = BASELINE_WEIGHT / (1 - OLS_WEIGHT)
pred0 = xgb_weight0*xgb_pred + baseline_weight0*BASELINE_PRED + lgb_weight*p_test
print( "\nCombined XGB/LGB/baseline predictions:" )
print( pd.DataFrame(pred0).head() )
print( "\nPredicting with OLS and combining with XGB/LGB/baseline predicitons: ..." )
for i in range(len(test_dates)):
    test['transactiondate'] = test_dates[i]
    pred = OLS_WEIGHT*reg.predict(get_features(test)) + (1-OLS_WEIGHT)*pred0
    submission[test_columns[i]] = [float(format(x, '.4f')) for x in pred]
    print('predict...', i)
print( "\nCombined XGB/LGB/baseline/OLS predictions:" )
print( submission.head() )
##### WRITE THE RESULTS
from datetime import datetime
print( "\nWriting results to disk ..." )
submission.to_csv('sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False)
print( "\nFinished ...")
In [25]:
catboostx = pd.read_csv('Only_CatBoost.csv')
In [28]:
new_prediction = catboostx.copy()
In [34]:
genetic = pd.read_csv('xxx.csv.gz', compression='gzip')
In [40]:
categories_x = ['201610','201611','201612','201710','201711','201712']
new_prediction[categories_x] = genetic[categories_x] * 0.3 + (catboostx[categories_x]) * 0.5 + (submission[categories_x] * 0.2)
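A small sanity check (my addition, assuming the 0.3/0.5/0.2 split above is meant as a convex blend of the genetic, CatBoost-only and XGB/LGB/OLS submissions):
blend_weights = (0.3, 0.5, 0.2)  # genetic / CatBoost-only / XGB+LGB+OLS stack
assert abs(sum(blend_weights) - 1.0) < 1e-9  # coefficients sum to one, keeping predictions on the same scale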
In [41]:
new_prediction.to_csv('genetic_catboost_ensemble_3_5_2.csv', index=False, float_format='%.6f')
In [ ]:
In [3]:
import matplotlib.pyplot as plt

def plot_data(test, pred, sample, title, width=40, height=10, linewidth=0.5, color1='white', color2='orange'):
    """ Plotting method. """
    fig = plt.figure(figsize=(width, height))
    plt.plot(pred[:sample], color=color1, zorder=4, linewidth=linewidth, label='%s Prediction' % (title))
    plt.plot(test[:sample], color=color2, zorder=3, linewidth=linewidth, label='%s True Data' % (title))
    plt.title(title)
    plt.legend()
# Frequency count
def get_frequency(data):
    # Gets the frequency of each value in a column. Pass in a Series.
    # (The 'index' column name assumes the older pandas behaviour of
    #  Series.value_counts().reset_index(); pandas 2.x names these columns differently.)
    vals = pd.merge(data.to_frame(), data.value_counts().reset_index(),
                    how='left', left_on=data.to_frame().columns[0], right_on='index').iloc[:, -1:].values
    return vals
def time_data(data):
    data['transactiondate'] = pd.to_datetime(data['transactiondate'])
    data['day_of_week'] = data['transactiondate'].dt.dayofweek
    data['month_of_year'] = data['transactiondate'].dt.month
    data['quarter'] = data['transactiondate'].dt.quarter
    data['is_weekend'] = (data['day_of_week'] >= 5).astype(int)  # Saturday=5, Sunday=6
    data.drop('transactiondate', axis=1, inplace=True)
    print('Added time data')
    print('........')
    return data
def column_excluder(data, missing_perc_thresh=0.98):
    # Quick clean from https://www.kaggle.com/seesee/concise-catboost-starter-ensemble-plb-0-06435
    exclude_missing = []
    exclude_unique = []
    num_rows = data.shape[0]
    for c in data.columns:
        num_missing = data[c].isnull().sum()
        if num_missing == 0:
            continue
        missing_frac = num_missing / float(num_rows)
        if missing_frac > missing_perc_thresh:
            exclude_missing.append(c)
        num_uniques = len(data[c].unique())
        if data[c].isnull().sum() != 0:
            num_uniques -= 1
        if num_uniques == 1:
            exclude_unique.append(c)
    to_exclude = list(set(exclude_missing + exclude_unique))
    print('Excluded columns:')
    print(to_exclude)
    print('........')
    return to_exclude
def categorical_features(data):
    # Quick categories from https://www.kaggle.com/seesee/concise-catboost-starter-ensemble-plb-0-06435
    cat_feature_inds = []
    cat_unique_thresh = 1000
    for i, c in enumerate(data.columns):
        num_uniques = len(data[c].unique())
        if num_uniques < cat_unique_thresh \
                and not 'sqft' in c \
                and not 'cnt' in c \
                and not 'nbr' in c \
                and not 'number' in c:
            cat_feature_inds.append(i)
    print("Categorical features:")
    print([data.columns[ind] for ind in cat_feature_inds])
    print('........')
    return cat_feature_inds
def complex_features(data):
    # Gets counts, label encoding and frequency estimates.
    # Frequency of occurrences | length of codes | check if * is present
    data['propertyzoningdesc_frq'] = get_frequency(data['propertyzoningdesc'])
    data['propertyzoningdesc_len'] = data['propertyzoningdesc'].apply(lambda x: len(str(x)) if pd.notnull(x) else x)
    #transactions_shuffled['propertyzoningdesc_str'] = transactions_shuffled['propertyzoningdesc'].apply(lambda x: (1 if '*' in str(x) else 0) if pd.notnull(x) else x)
    # Label encoding | length of code
    #transactions_shuffled['propertycountylandusecode_enc'] = transactions_shuffled[['propertycountylandusecode']].astype(str).apply(LabelEncoder().fit_transform)
    #transactions_shuffled['propertycountylandusecode_len'] = transactions_shuffled['propertycountylandusecode'].apply(lambda x: x if pd.isnull(x) else len(x))
    # Zip code area extraction
    data['regionidzip_ab'] = data['regionidzip'].apply(lambda x: x if pd.isnull(x) else str(x)[:2]).astype(float)
    data['regionidzip_abc'] = data['regionidzip'].apply(lambda x: x if pd.isnull(x) else str(x)[:3]).astype(float)
    # Region neighbourhood area extraction
    data['regionidneighborhood_ab'] = data['regionidneighborhood'].apply(lambda x: str(x)[:2] if pd.notnull(x) else x).astype(float)
    # Rawcensustractandblock transformed
    data['code_fips_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[:4]))
    data['code_tract_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[4:11]))
    data['code_block_cnt'] = get_frequency(data['rawcensustractandblock'].apply(lambda x: str(x)[11:]))
    data.drop('rawcensustractandblock', axis=1, inplace=True)
    # Encode string values
    data[['propertycountylandusecode', 'propertyzoningdesc']] = data[['propertycountylandusecode', 'propertyzoningdesc']].astype(str).apply(LabelEncoder().fit_transform)
    print('Generating complex features')
    print('........')
    return data
In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostRegressor
from tqdm import tqdm
import gc
import datetime as dt
print('Loading Properties ...')
properties2016 = pd.read_csv('../Data/properties_2016.csv', low_memory = False)
properties2017 = pd.read_csv('../Data/properties_2017.csv', low_memory = False)
print('Loading Train ...')
train2016 = pd.read_csv('../Data/train_2016_v2.csv', parse_dates=['transactiondate'], low_memory=False)
train2017 = pd.read_csv('../Data/train_2017.csv', parse_dates=['transactiondate'], low_memory=False)
def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = (df["transactiondate"].dt.year - 2016)*12 + df["transactiondate"].dt.month
    df["transaction_day"] = df["transactiondate"].dt.day
    df["transaction_quarter"] = (df["transactiondate"].dt.year - 2016)*4 + df["transactiondate"].dt.quarter
    df.drop(["transactiondate"], inplace=True, axis=1)
    return df
#train2016 = add_date_features(train2016)
#train2017 = add_date_features(train2017)
print('Loading Sample ...')
sample_submission = pd.read_csv('../Data/sample_submission.csv', low_memory = False)
print('Merge Train with Properties ...')
train2016 = pd.merge(train2016, properties2016, how = 'left', on = 'parcelid')
train2017 = pd.merge(train2017, properties2017, how = 'left', on = 'parcelid')
# Datetime transformation
train2017 = time_data(train2017)
train2017 = complex_features(train2017)
train2016 = time_data(train2016)
train2016 = complex_features(train2016)
print('Tax Features 2017 ...')
#train2017.iloc[:, train2017.columns.str.startswith('tax')] = np.nan
print('Concat Train 2016 & 2017 ...')
train_df = pd.concat([train2016, train2017], axis = 0)
test_df = pd.merge(sample_submission[['ParcelId']], properties2016.rename(columns = {'parcelid': 'ParcelId'}), how = 'left', on = 'ParcelId')
del properties2016, properties2017, train2016, train2017
gc.collect();
print('Remove missing data fields ...')
missing_perc_thresh = 0.98
exclude_missing = []
num_rows = train_df.shape[0]
for c in train_df.columns:
    num_missing = train_df[c].isnull().sum()
    if num_missing == 0:
        continue
    missing_frac = num_missing / float(num_rows)
    if missing_frac > missing_perc_thresh:
        exclude_missing.append(c)
print("We exclude: %s" % len(exclude_missing))
del num_rows, missing_perc_thresh
gc.collect();
print ("Remove features with one unique value !!")
exclude_unique = []
for c in train_df.columns:
    num_uniques = len(train_df[c].unique())
    if train_df[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1:
        exclude_unique.append(c)
print("We exclude: %s" % len(exclude_unique))
print ("Define training features !!")
exclude_other = ['parcelid', 'logerror','propertyzoningdesc']
train_features = []
for c in train_df.columns:
    if c not in exclude_missing \
            and c not in exclude_other and c not in exclude_unique:
        train_features.append(c)
print("We use these for training: %s" % len(train_features))
print ("Define categorial features !!")
cat_feature_inds = []
cat_unique_thresh = 1000
for i, c in enumerate(train_features):
    num_uniques = len(train_df[c].unique())
    if num_uniques < cat_unique_thresh \
            and not 'sqft' in c \
            and not 'cnt' in c \
            and not 'nbr' in c \
            and not 'number' in c:
        cat_feature_inds.append(i)
print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])
print ("Replacing NaN values by -999 !!")
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)
print ("Training time !!")
X_train = train_df[train_features]
y_train = train_df.logerror
print(X_train.shape, y_train.shape)
submission = pd.DataFrame({
'ParcelId': test_df['ParcelId'],
})
test_dates = {
'201610': pd.Timestamp('2016-09-30'),
'201611': pd.Timestamp('2016-10-31'),
'201612': pd.Timestamp('2016-11-30'),
'201710': pd.Timestamp('2017-09-30'),
'201711': pd.Timestamp('2017-10-31'),
'201712': pd.Timestamp('2017-11-30')
}
test_df = complex_features(test_df)
In [5]:
num_ensembles = 5
# Initialise each prediction column before accumulating the ensemble members.
for label, test_date in test_dates.items():
    submission[label] = 0
for i in tqdm(range(num_ensembles)):
    print('Training model', i)
    model = CatBoostRegressor(
        iterations=630, learning_rate=0.03,
        depth=6, l2_leaf_reg=3,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=i)
    model.fit(
        X_train, y_train,
        cat_features=cat_feature_inds)
    for label, test_date in test_dates.items():
        print("Predicting for: %s ... " % (label))
        test_df['transactiondate'] = test_date
        test_df = time_data(test_df)
        X_test = test_df[train_features]
        submission[label] += model.predict(X_test)
#submission.to_csv('catboost_sample.csv', float_format='%.6f',index=False)
In [ ]:
submission.to_csv('catboost_sample.csv', float_format='%.6f',index=False)
In [9]:
(submission[list(test_dates.keys())]/5).to_csv('catboost_sample.csv', float_format='%.6f',index=False)
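Each of the five CatBoost runs added its predictions into the same submission columns, so dividing by 5 recovers the ensemble mean. An equivalent, illustrative form that avoids the hard-coded divisor:
# Illustrative only: same result as the /5 above, since num_ensembles == 5.
ensemble_mean = submission[list(test_dates.keys())] / num_ensembles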
In [15]:
new = pd.DataFrame()
new['ParcelId'] = submission['ParcelId']
new[list(test_dates.keys())] = (submission[list(test_dates.keys())]/5)
In [17]:
new.to_csv('catboost_sample.csv', float_format='%.6f',index=False)
In [ ]: