In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostRegressor
from tqdm import tqdm
import gc
import datetime as dt

print('Loading Properties ...')
properties2016 = pd.read_csv('../Data/properties_2016.csv', low_memory = False)
properties2017 = pd.read_csv('../Data/properties_2017.csv', low_memory = False)

print('Loading Train ...')
train2016 = pd.read_csv('../Data/train_2016_v2.csv', parse_dates=['transactiondate'], low_memory=False)
train2017 = pd.read_csv('../Data/train_2017.csv', parse_dates=['transactiondate'], low_memory=False)

def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = (df["transactiondate"].dt.year - 2016)*12 + df["transactiondate"].dt.month
    df["transaction_day"] = df["transactiondate"].dt.day
    df["transaction_quarter"] = (df["transactiondate"].dt.year - 2016)*4 +df["transactiondate"].dt.quarter
    df.drop(["transactiondate"], inplace=True, axis=1)
    return df

train2016 = add_date_features(train2016)
train2017 = add_date_features(train2017)
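
# A quick sanity check of the encoding above (illustrative; not in the original
# kernel): January 2017 should map to transaction_month 13 and transaction_quarter 5.
_check = add_date_features(pd.DataFrame({'transactiondate': pd.to_datetime(['2016-01-15', '2017-01-15'])}))
assert _check['transaction_month'].tolist() == [1, 13]
assert _check['transaction_quarter'].tolist() == [1, 5]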

print('Loading Sample ...')
sample_submission = pd.read_csv('../Data/sample_submission.csv', low_memory = False)

print('Merge Train with Properties ...')
train2016 = pd.merge(train2016, properties2016, how = 'left', on = 'parcelid')
train2017 = pd.merge(train2017, properties2017, how = 'left', on = 'parcelid')

print('Tax Features 2017  ...')
# The leak-avoidance step of blanking the 2017 tax columns is left disabled here:
#train2017.iloc[:, train2017.columns.str.startswith('tax')] = np.nan

print('Concat Train 2016 & 2017 ...')
train_df = train2017.copy()  # concat disabled; training on 2017 only: pd.concat([train2016, train2017], axis = 0)
test_df = pd.merge(sample_submission[['ParcelId']], properties2016.rename(columns = {'parcelid': 'ParcelId'}), how = 'left', on = 'ParcelId')

del properties2016, properties2017, train2016, train2017
gc.collect();

print('Remove missing data fields ...')

missing_perc_thresh = 0.98
exclude_missing = []
num_rows = train_df.shape[0]
for c in train_df.columns:
    num_missing = train_df[c].isnull().sum()
    if num_missing == 0:
        continue
    missing_frac = num_missing / float(num_rows)
    if missing_frac > missing_perc_thresh:
        exclude_missing.append(c)
print("We exclude: %s" % len(exclude_missing))

del num_rows, missing_perc_thresh
gc.collect();

print ("Remove features with one unique value !!")
exclude_unique = []
for c in train_df.columns:
    num_uniques = len(train_df[c].unique())
    if train_df[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1:
        exclude_unique.append(c)
print("We exclude: %s" % len(exclude_unique))

print ("Define training features !!")
exclude_other = ['parcelid', 'logerror','propertyzoningdesc']
train_features = []
for c in train_df.columns:
    if c not in exclude_missing \
       and c not in exclude_other and c not in exclude_unique:
        train_features.append(c)
print("We use these for training: %s" % len(train_features))

print ("Define categorial features !!")
cat_feature_inds = []
cat_unique_thresh = 1000
for i, c in enumerate(train_features):
    num_uniques = len(train_df[c].unique())
    # Heuristic: treat low-cardinality columns as categorical, unless the name
    # suggests a count or an area (sqft/cnt/nbr/number).
    if num_uniques < cat_unique_thresh \
        and 'sqft' not in c \
        and 'cnt' not in c \
        and 'nbr' not in c \
        and 'number' not in c:
        cat_feature_inds.append(i)
        
print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

print ("Replacing NaN values by -999 !!")
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

print ("Training time !!")
X_train = train_df[train_features]
y_train = train_df.logerror
print(X_train.shape, y_train.shape)

# All test predictions use a single fixed transaction date.
test_df['transactiondate'] = pd.Timestamp('2016-12-01')
test_df = add_date_features(test_df)
X_test = test_df[train_features]
print(X_test.shape)

# Average predictions over several CatBoost models that differ only in their seed.
num_ensembles = 5
y_pred = 0.0
for i in tqdm(range(num_ensembles)):
    model = CatBoostRegressor(
        iterations=630, learning_rate=0.03,
        depth=6, l2_leaf_reg=3,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=i)
    model.fit(
        X_train, y_train,
        cat_features=cat_feature_inds)
    y_pred += model.predict(X_test)
y_pred /= num_ensembles
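
# Optional holdout check (a sketch, not part of the original run; assumes
# scikit-learn is available): train one ensemble member on 80% of the data
# and report validation MAE before committing to 630 iterations.
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_absolute_error
# X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
# m = CatBoostRegressor(iterations=630, learning_rate=0.03, depth=6, l2_leaf_reg=3,
#                       loss_function='MAE', random_seed=0)
# m.fit(X_tr, y_tr, cat_features=cat_feature_inds, verbose=False)
# print('Holdout MAE:', mean_absolute_error(y_val, m.predict(X_val)))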

submission = pd.DataFrame({
    'ParcelId': test_df['ParcelId'],
})
test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
    '201710': pd.Timestamp('2017-09-30'),
    '201711': pd.Timestamp('2017-10-31'),
    '201712': pd.Timestamp('2017-11-30')
}
# The model was trained with a fixed transactiondate, so the same prediction
# vector is written to every submission column; test_date itself is unused here.
for label, test_date in test_dates.items():
    print("Predicting for: %s ... " % (label))
    submission[label] = y_pred
    
submission.to_csv('Only_CatBoost.csv', float_format='%.6f', index=False)


Loading Properties ...
Loading Train ...
Loading Sample ...
Merge Train with Properties ...
Tax Features 2017  ...
Concat Train 2016 & 2017 ...
Remove missing data fields ...
We exclude: 14
Remove features with one unique value !!
We exclude: 11
Define training features !!
We use these for training: 41
Define categorical features !!
Cat features are: ['transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'taxdelinquencyyear']
Replacing NaN values by -999 !!
Training time !!
(77613, 41) (77613,)
  0%|          | 0/5 [00:00<?, ?it/s]
(2985217, 41)
100%|██████████| 5/5 [20:43<00:00, 248.64s/it]
Predicting for: 201610 ... 
Predicting for: 201611 ... 
Predicting for: 201612 ... 
Predicting for: 201710 ... 
Predicting for: 201711 ... 
Predicting for: 201712 ... 

In [ ]:
#  This is forked from Kamil Kaczmarek's Simple LGBM model (the only kernel I've found so far
#  that makes a serious attempt to combine 2016 and 2017 data while accounting for the leak),
#  but I don't think Kamil handles the leak optimally.

#  1. We should use 2017 properties data for all non-tax variables in the 2016 train set.
#     (It's more complete, and there is no known leakage outside the tax variables.)
#  2. We should use 2016 properties data for the tax variables in the 2016 train set.
#  3. We should use 2017 properties data for all variables in the 2017 train set.
#  4. Predictions for 2016 should use the same properties data as the 2016 train set.
#     (Otherwise they would be using tax information not available in 2016.)
#  5. Predictions for 2017 should use the same properties data as the 2017 train set.
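
#  In table form (a sketch of what the code below implements):
#
#      data set     non-tax variables   tax variables
#      train 2016   2017 properties     2016 properties
#      train 2017   2017 properties     2017 properties
#      test  2016   2017 properties     2016 properties
#      test  2017   2017 properties     2017 properties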

# In Kamil's version, he doesn't use the 2017 tax variables for training at all.
# (They are set to NA in the 2017 train data and not merged with 2017 train data.)
# That procedure does avoid the leak, 
# but I think it also throws away potentially useful information,
# so in this version I've tried to use all the information we can without leaking.

# (Note:  I'm not entirely comfortable with using future data to predict the past,
#   even when there isn't an identified leak, but even Kamil's version does that,
#   since it trains on the full training set that includes sales from 2017.
#   It's probably OK here, since we're only using 2016 as a sanity check.)


# Anyhow, there is a significant chance that I got the logic wrong or introduced bugs,
# so if anyone thinks this is wrong, please let me know.


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
import gc
import datetime as dt



print('Loading data...')
# Load raw data
properties2016_raw = pd.read_csv('../input/properties_2016.csv', low_memory = False)
properties2017 = pd.read_csv('../input/properties_2017.csv', low_memory = False)
train2016 = pd.read_csv('../input/train_2016_v2.csv')
train2017 = pd.read_csv('../input/train_2017.csv')
sample_submission = pd.read_csv('../input/sample_submission.csv', low_memory = False)

# Create a new version of 2016 properties data that takes all non-tax variables from 2017
taxvars = ['structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt', 'taxvaluedollarcnt', 'taxamount']
tax2016 = properties2016_raw[['parcelid']+taxvars]
properties2016 = properties2017.drop(taxvars, axis=1).merge(tax2016,
                 how='left', on='parcelid').reindex(columns=properties2017.columns)
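
# Sanity check (illustrative): the rebuilt 2016 table keeps the 2017 column layout.
assert list(properties2016.columns) == list(properties2017.columns)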

# Create a training data set
train2016 = pd.merge(train2016, properties2016, how = 'left', on = 'parcelid')
train2017 = pd.merge(train2017, properties2017, how = 'left', on = 'parcelid')
train = pd.concat([train2016, train2017], axis = 0)

# Create separate test data sets for 2016 and 2017
test2016 = pd.merge(sample_submission[['ParcelId']], properties2016.rename(columns = {'parcelid': 'ParcelId'}), 
                how = 'left', on = 'ParcelId')
test2017 = pd.merge(sample_submission[['ParcelId']], properties2017.rename(columns = {'parcelid': 'ParcelId'}), 
                how = 'left', on = 'ParcelId')
del properties2016, properties2017, train2016, train2017
gc.collect();


print('Memory usage reduction...')

# Rescale the large geographic identifiers so train and test share the same units.
train[['latitude', 'longitude']] /= 1e6
train['censustractandblock'] /= 1e12

def preptest(test):
    # Apply the same scaling as for the training data.
    test[['latitude', 'longitude']] /= 1e6
    test['censustractandblock'] /= 1e12

    # Downcast to 32-bit types to roughly halve memory usage.
    for column in test.columns:
        if test[column].dtype == int:
            test[column] = test[column].astype(np.int32)
        if test[column].dtype == float:
            test[column] = test[column].astype(np.float32)

preptest(test2016)
preptest(test2017)
        
print('Feature engineering...')
train['transactiondate'] = pd.to_datetime(train['transactiondate'])
train['month'] = (train['transactiondate'].dt.year - 2016)*12 + train['transactiondate'].dt.month
train = train.drop('transactiondate', axis = 1)
from sklearn.preprocessing import LabelEncoder
non_number_columns = train.dtypes[train.dtypes == object].index.values

for column in non_number_columns:
    # Fit each encoder on train and both test sets together, so categories that
    # appear only in the test files still get a consistent integer code.
    train_test = pd.concat([train[column], test2016[column], test2017[column]], axis = 0)
    encoder = LabelEncoder().fit(train_test.astype(str))
    train[column] = encoder.transform(train[column].astype(str)).astype(np.int32)
    test2016[column] = encoder.transform(test2016[column].astype(str)).astype(np.int32)
    test2017[column] = encoder.transform(test2017[column].astype(str)).astype(np.int32)
    
# Skip parcelid and logerror (the first two columns) and the month feature.
feature_names = [feature for feature in train.columns[2:] if feature != 'month']

# Average logerror per month, expressed as an offset from the overall mean.
month_avgs = train.groupby('month').agg('mean')['logerror'].values - train['logerror'].mean()
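# month_avgs is ordered by month because groupby sorts its keys, so the offset
# for month m sits at index m-1; months run 1 (Jan 2016) through 21 (Sep 2017).
assert month_avgs.shape[0] == train['month'].nunique()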
                             
print('Preparing arrays and throwing out outliers...')
X_train = train[feature_names].values
y_train = train['logerror'].values
X_test2016 = test2016[feature_names].values
X_test2017 = test2017[feature_names].values

del test2016, test2017;
gc.collect();

# Append each transaction's month-average offset as an extra feature.
month_values = train['month'].values
month_avg_values = np.array([month_avgs[month - 1] for month in month_values]).reshape(-1, 1)
X_train = np.hstack([X_train, month_avg_values])

# Drop extreme logerror outliers (|logerror| >= 0.4) before training.
X_train = X_train[np.abs(y_train) < 0.4, :]
y_train = y_train[np.abs(y_train) < 0.4]


print('Training LGBM model...')
ltrain = lgb.Dataset(X_train, label = y_train)

params = {}
params['metric'] = 'mae'
params['max_depth'] = 100
params['num_leaves'] = 32
params['feature_fraction'] = .85
params['bagging_fraction'] = .95
params['bagging_freq'] = 8
params['learning_rate'] = 0.0025
params['verbosity'] = 0

lgb_model = lgb.train(params, ltrain, valid_sets = [ltrain], verbose_eval=200, num_boost_round=2930)
                  
                  
print('Making predictions and praying for good results...')
X_test2016 = np.hstack([X_test2016, np.zeros((X_test2016.shape[0], 1))])
X_test2017 = np.hstack([X_test2017, np.zeros((X_test2017.shape[0], 1))])
# Predict in chunks to keep peak memory manageable on ~3M parcels.
folds = 20
n = int(X_test2016.shape[0] / folds)

for j in range(folds):
    results = pd.DataFrame()

    if j < folds - 1:
        X_test2016_ = X_test2016[j*n: (j+1)*n, :]
        X_test2017_ = X_test2017[j*n: (j+1)*n, :]
        results['ParcelId'] = sample_submission['ParcelId'].iloc[j*n: (j+1)*n]
    else:
        X_test2016_ = X_test2016[j*n: , :]
        X_test2017_ = X_test2017[j*n: , :]
        results['ParcelId'] = sample_submission['ParcelId'].iloc[j*n: ]

    for month in [10, 11, 12]:
        # Set the month-average feature to the target month's value.
        X_test2016_[:, -1] = month_avgs[month - 1]
        assert X_test2016_.shape[1] == X_test2016.shape[1]
        y_pred = lgb_model.predict(X_test2016_)
        results['2016' + str(month)] = y_pred
        
    # Month 21 (September 2017) is the last month seen in training; its average
    # is reused for all three Q4 2017 prediction columns.
    X_test2017_[:, -1] = month_avgs[20]
    assert X_test2017_.shape[1] == X_test2017.shape[1]
    y_pred = lgb_model.predict(X_test2017_)
    results['201710'] = y_pred
    results['201711'] = y_pred
    results['201712'] = y_pred
    
    if j == 0:
        results_ = results.copy()
    else:
        results_ = pd.concat([results_, results], axis = 0)
    print('{}% completed'.format(round(100*(j+1)/folds)))
    
    
print('Saving predictions...')
results = results_[sample_submission.columns]
assert results.shape == sample_submission.shape
results.to_csv('submission.csv', index = False, float_format = '%.5f')
print('Done!')