In [1]:
# Parameters
FUDGE_FACTOR = 1.1200  # Multiply forecasts by this

XGB_WEIGHT = 0.6200
BASELINE_WEIGHT = 0.0100
OLS_WEIGHT = 0.0620
NN_WEIGHT = 0.0800

XGB1_WEIGHT = 0.8000  # Weight of first in combination of two XGB models

BASELINE_PRED = 0.0115   # Baseline based on mean of training data, per Oleg

In [2]:
#Import Lib#
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as  xgb 
import random 
import lightgbm as lgb
import datetime as dt 
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [3]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.noise import GaussianDropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer


Using TensorFlow backend.
/opt/conda/lib/python3.6/site-packages/tensorflow/python/util/tf_inspect.py:45: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()
  if d.decorator_argspec is not None), _inspect.getargspec(target))
/opt/conda/lib/python3.6/site-packages/tensorflow/python/util/tf_inspect.py:45: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()
  if d.decorator_argspec is not None), _inspect.getargspec(target))
/opt/conda/lib/python3.6/site-packages/tensorflow/python/util/tf_inspect.py:45: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()
  if d.decorator_argspec is not None), _inspect.getargspec(target))
/opt/conda/lib/python3.6/site-packages/tensorflow/python/util/tf_inspect.py:45: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()
  if d.decorator_argspec is not None), _inspect.getargspec(target))

In [4]:
#Load data#
train = pd.read_csv('../input/train_2017.csv',parse_dates = ["transactiondate"])
properties = pd.read_csv('../input/properties_2017.csv')
test = pd.read_csv('../input/sample_submission.csv') 
test = test.rename(columns = {'ParcelId': 'parcelid'})


/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (49) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [5]:
print("Training Size:" + str(train.shape))
print("Property Size:" + str(properties.shape))
print("Sample Size:" + str(test.shape))


Training Size:(77613, 3)
Property Size:(2985217, 58)
Sample Size:(2985217, 7)

In [6]:
################
################
##  LightGBM  ##
################
################

# This section is (I think) originally derived from SIDHARTH's script:
#   https://www.kaggle.com/sidharthkumar/trying-lightgbm
# which was forked and tuned by Yuqing Xue:
#   https://www.kaggle.com/yuqingxue/lightgbm-85-97
# and updated by Andy Harless:
#   https://www.kaggle.com/aharless/lightgbm-with-outliers-remaining
# and a lot of additional changes have happened since then

In [7]:
##### PROCESS DATA FOR LIGHTGBM
print( "\nProcessing data for LightGBM ..." )
for c, dtype in zip(properties.columns, properties.dtypes):
    if dtype == np.float64:
        properties[c] = properties[c].astype(np.float32)
        
df_train = train.merge(properties, how='left', on='parcelid')
df_train.fillna(df_train.median(),inplace = True)

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
                         'propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)


Processing data for LightGBM ...
(77613, 53) (77613,)

In [8]:
train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train; gc.collect()

x_train = x_train.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)

In [9]:
##### RUN LIGHTGBM

params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'          # or 'mae'
params['sub_feature'] = 0.345    # feature_fraction (small values => use very different submodels)
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512        # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0
params['feature_fraction_seed'] = 2
params['bagging_seed'] = 3

np.random.seed(0)
random.seed(0)

In [10]:
print("\nFitting LightGBM model ...")
clf = lgb.train(params, d_train, 430)


Fitting LightGBM model ...

In [11]:
del d_train; gc.collect()
del x_train; gc.collect()


Out[11]:
0

In [12]:
print("\nPrepare for LightGBM prediction ...")
print("   Read sample file ...")
sample = pd.read_csv('../input/sample_submission.csv')
print("   ...")
sample['parcelid'] = sample['ParcelId']
print("   Merge with property data ...")
df_test = sample.merge(properties, on='parcelid', how='left')
print("   ...")
del sample, properties; gc.collect()
print("   ...")
#df_test['Ratio_1'] = df_test['taxvaluedollarcnt']/df_test['taxamount']
x_test = df_test[train_columns]
print("   ...")
del df_test; gc.collect()
print("   Preparing x_test...")
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
print("   ...")
x_test = x_test.values.astype(np.float32, copy=False)

print("\nStart LightGBM prediction ...")
p_test = clf.predict(x_test)

del x_test; gc.collect()

print( "\nUnadjusted LightGBM predictions:" )
print( pd.DataFrame(p_test).head() )


Prepare for LightGBM prediction ...
   Read sample file ...
   ...
   Merge with property data ...
   ...
   ...
   ...
   Preparing x_test...
   ...

Start LightGBM prediction ...

Unadjusted LightGBM predictions:
          0
0  0.025981
1  0.026097
2  0.022967
3  0.028343
4  0.021492

In [13]:
################
################
##  XGBoost   ##
################
################

# This section is (I think) originally derived from Infinite Wing's script:
#   https://www.kaggle.com/infinitewing/xgboost-without-outliers-lb-0-06463
# inspired by this thread:
#   https://www.kaggle.com/c/zillow-prize-1/discussion/33710
# but the code has gone through a lot of changes since then

In [14]:
print( "\nRe-reading properties file ...")
properties = pd.read_csv('../input/properties_2017.csv')


Re-reading properties file ...
/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (49) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [15]:
##### PROCESS DATA FOR XGBOOST

print( "\nProcessing data for XGBoost ...")
for c in properties.columns:
    properties[c]=properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))

train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)


Processing data for XGBoost ...

In [16]:
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))


Shape train: (77613, 57)
Shape test: (2985217, 57)

In [17]:
# drop out ouliers
train_df=train_df[ train_df.logerror > -0.4 ]
train_df=train_df[ train_df.logerror < 0.419 ]
x_train=train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)

print('After removing outliers:')     
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))


After removing outliers:
Shape train: (75949, 57)
Shape test: (2985217, 57)

In [18]:
##### RUN XGBOOST

print("\nSetting up data for XGBoost ...")
# xgboost params
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'base_score': y_mean,
    'silent': 1
}


Setting up data for XGBoost ...

In [19]:
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

num_boost_rounds = 250
print("num_boost_rounds="+str(num_boost_rounds))

# train model
print( "\nTraining XGBoost ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)

print( "\nPredicting with XGBoost ...")
xgb_pred1 = model.predict(dtest)

print( "\nFirst XGBoost predictions:" )
print( pd.DataFrame(xgb_pred1).head() )


num_boost_rounds=250

Training XGBoost ...

Predicting with XGBoost ...

First XGBoost predictions:
          0
0  0.076013
1  0.056619
2 -0.064734
3  0.003192
4  0.001247

In [20]:
print("\nSetting up data for XGBoost ...")
# xgboost params
xgb_params = {
    'eta': 0.033,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'silent': 1
}


num_boost_rounds = 150
print("num_boost_rounds="+str(num_boost_rounds))

print( "\nTraining XGBoost again ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)

print( "\nPredicting with XGBoost again ...")
xgb_pred2 = model.predict(dtest)

print( "\nSecond XGBoost predictions:" )
print( pd.DataFrame(xgb_pred2).head() )


Setting up data for XGBoost ...
num_boost_rounds=150

Training XGBoost again ...

Predicting with XGBoost again ...

Second XGBoost predictions:
          0
0  0.070272
1  0.053413
2 -0.100315
3 -0.003527
4 -0.000545

In [21]:
##### COMBINE XGBOOST RESULTS
xgb_pred = XGB1_WEIGHT*xgb_pred1 + (1-XGB1_WEIGHT)*xgb_pred2
#xgb_pred = xgb_pred1

print( "\nCombined XGBoost predictions:" )
print( pd.DataFrame(xgb_pred).head() )

del train_df
del x_train
del x_test
del properties
del dtest
del dtrain
del xgb_pred1
del xgb_pred2 
gc.collect()


Combined XGBoost predictions:
          0
0  0.074865
1  0.055978
2 -0.071850
3  0.001848
4  0.000889
Out[21]:
150

In [22]:
######################
######################
##  Neural Network  ##
######################
######################

# Neural network copied from this script:
#   https://www.kaggle.com/aharless/keras-neural-network-lb-06492 (version 20)
# which was built on the skeleton in this notebook:
#   https://www.kaggle.com/prasunmishra/ann-using-keras

In [23]:
# Read in data for neural network
print( "\n\nProcessing data for Neural Network ...")
print('\nLoading train, prop and sample data...')
train = pd.read_csv("../input/train_2017.csv", parse_dates=["transactiondate"])
prop = pd.read_csv('../input/properties_2017.csv')
sample = pd.read_csv('../input/sample_submission.csv')



Processing data for Neural Network ...

Loading train, prop and sample data...
/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (49) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [24]:
print('Fitting Label Encoder on properties...')
for c in prop.columns:
    prop[c]=prop[c].fillna(-1)
    if prop[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(prop[c].values))
        prop[c] = lbl.transform(list(prop[c].values))


Fitting Label Encoder on properties...

In [25]:
print('Creating training set...')
df_train = train.merge(prop, how='left', on='parcelid')

df_train["transactiondate"] = pd.to_datetime(df_train["transactiondate"])
df_train["transactiondate_year"] = df_train["transactiondate"].dt.year
df_train["transactiondate_month"] = df_train["transactiondate"].dt.month
df_train['transactiondate_quarter'] = df_train['transactiondate'].dt.quarter
df_train["transactiondate"] = df_train["transactiondate"].dt.day


Creating training set...

In [26]:
print('Filling NA/NaN values...' )
df_train.fillna(-1.0)


Filling NA/NaN values...
Out[26]:
parcelid logerror transactiondate airconditioningtypeid architecturalstyletypeid basementsqft bathroomcnt bedroomcnt buildingclasstypeid buildingqualitytypeid ... taxvaluedollarcnt assessmentyear landtaxvaluedollarcnt taxamount taxdelinquencyflag taxdelinquencyyear censustractandblock transactiondate_year transactiondate_month transactiondate_quarter
0 14297519 0.025595 1 -1.0 -1.0 -1.0 3.5 4.0 -1.0 -1.0 ... 1023282.0 2016.0 537569.0 11013.72 0 -1.0 6.059063e+13 2017 1 1
1 17052889 0.055619 1 -1.0 -1.0 -1.0 1.0 2.0 -1.0 -1.0 ... 464000.0 2016.0 376000.0 5672.48 0 -1.0 6.111001e+13 2017 1 1
2 14186244 0.005383 1 -1.0 -1.0 -1.0 2.0 3.0 -1.0 -1.0 ... 564778.0 2016.0 479489.0 6488.30 0 -1.0 6.059022e+13 2017 1 1
3 12177905 -0.103410 1 -1.0 -1.0 -1.0 3.0 4.0 -1.0 8.0 ... 145143.0 2016.0 36225.0 1777.51 0 -1.0 6.037300e+13 2017 1 1
4 10887214 0.006940 1 1.0 -1.0 -1.0 3.0 3.0 -1.0 8.0 ... 119407.0 2016.0 45726.0 1533.89 0 -1.0 6.037124e+13 2017 1 1
5 17143294 -0.020526 1 -1.0 -1.0 -1.0 2.0 3.0 -1.0 -1.0 ... 331064.0 2016.0 132424.0 3508.10 0 -1.0 6.111005e+13 2017 1 1
6 12095076 -0.001011 1 1.0 -1.0 -1.0 3.0 4.0 -1.0 9.0 ... 773303.0 2016.0 496619.0 9516.26 0 -1.0 6.037461e+13 2017 1 1
7 12069064 0.101723 1 -1.0 -1.0 -1.0 1.0 2.0 -1.0 5.0 ... 218552.0 2016.0 199662.0 2366.08 0 -1.0 6.037302e+13 2017 1 1
8 12790562 -0.040966 2 -1.0 -1.0 -1.0 3.0 4.0 -1.0 9.0 ... 220583.0 2016.0 43056.0 3104.19 0 -1.0 6.037500e+13 2017 1 1
9 11542646 -0.036763 2 -1.0 -1.0 -1.0 3.0 2.0 -1.0 8.0 ... 371361.0 2016.0 220058.0 4557.32 0 -1.0 6.037275e+13 2017 1 1
10 11104527 0.005963 2 1.0 -1.0 -1.0 3.0 4.0 -1.0 8.0 ... 430108.0 2016.0 158393.0 6103.36 0 -1.0 6.037920e+13 2017 1 1
11 13984101 -0.147997 2 -1.0 -1.0 -1.0 0.0 0.0 -1.0 -1.0 ... 471590.0 2016.0 309919.0 6151.38 0 -1.0 6.059042e+13 2017 1 1
12 14705431 -0.028835 2 -1.0 -1.0 -1.0 2.5 3.0 -1.0 -1.0 ... 582011.0 2016.0 463974.0 6675.04 0 -1.0 6.059099e+13 2017 1 1
13 13944538 0.045602 2 13.0 -1.0 -1.0 2.5 3.0 -1.0 -1.0 ... 319668.0 2016.0 223051.0 4078.08 0 -1.0 6.059089e+13 2017 1 1
14 14731691 -0.008935 2 -1.0 -1.0 -1.0 2.0 2.0 -1.0 -1.0 ... 132425.0 2016.0 15362.0 1771.34 0 -1.0 6.059099e+13 2017 1 1
15 17110996 0.008669 2 -1.0 -1.0 -1.0 2.5 3.0 -1.0 -1.0 ... 198054.0 2016.0 99028.0 2204.84 0 -1.0 6.111005e+13 2017 1 1
16 14375300 -0.021896 2 13.0 -1.0 -1.0 2.0 4.0 -1.0 -1.0 ... 528847.0 2016.0 359729.0 5496.44 0 -1.0 6.059052e+13 2017 1 1
17 12045625 -0.017167 2 1.0 -1.0 -1.0 3.0 3.0 -1.0 8.0 ... 259197.0 2016.0 89470.0 2831.48 0 -1.0 6.037301e+13 2017 1 1
18 11830315 -0.035085 2 -1.0 -1.0 -1.0 1.0 2.0 -1.0 4.0 ... 273745.0 2016.0 145785.0 3739.02 0 -1.0 6.037532e+13 2017 1 1
19 14730402 -0.012041 2 -1.0 -1.0 -1.0 2.0 2.0 -1.0 -1.0 ... 245910.0 2016.0 170974.0 2548.58 1 15.0 6.059032e+13 2017 1 1
20 12325145 0.042463 2 1.0 -1.0 -1.0 2.0 3.0 -1.0 6.0 ... 178646.0 2016.0 38065.0 2391.69 1 15.0 6.037554e+13 2017 1 1
21 14387959 -0.006653 2 -1.0 -1.0 -1.0 2.5 3.0 -1.0 -1.0 ... 597213.0 2016.0 423944.0 6172.28 0 -1.0 6.059042e+13 2017 1 1
22 14349322 0.044309 2 -1.0 -1.0 -1.0 2.5 3.0 -1.0 -1.0 ... 534612.0 2016.0 313179.0 7141.56 0 -1.0 6.059052e+13 2017 1 1
23 11706737 -0.120976 2 -1.0 -1.0 -1.0 1.0 2.0 -1.0 5.0 ... 171838.0 2016.0 134175.0 2220.23 1 14.0 6.037232e+13 2017 1 1
24 11671335 -0.000546 2 1.0 -1.0 -1.0 3.0 2.0 -1.0 9.0 ... 573247.0 2016.0 296606.0 6964.34 0 -1.0 6.037263e+13 2017 1 1
25 12531488 0.042041 2 -1.0 -1.0 -1.0 1.0 3.0 -1.0 4.0 ... 432754.0 2016.0 334716.0 5212.75 0 -1.0 6.037578e+13 2017 1 1
26 14314879 -0.001476 2 -1.0 -1.0 -1.0 4.5 6.0 -1.0 -1.0 ... 2040000.0 2016.0 1741737.0 21758.26 0 -1.0 6.059063e+13 2017 1 1
27 11130689 0.020110 2 1.0 -1.0 -1.0 2.0 3.0 -1.0 6.0 ... 401399.0 2016.0 179161.0 5438.41 0 -1.0 6.037920e+13 2017 1 1
28 12036177 0.206470 2 -1.0 -1.0 -1.0 2.0 3.0 -1.0 4.0 ... 85035.0 2016.0 52771.0 1232.08 0 -1.0 6.037189e+13 2017 1 1
29 14333888 -0.025392 2 -1.0 -1.0 -1.0 3.0 2.0 -1.0 -1.0 ... 954106.0 2016.0 638423.0 10696.14 0 -1.0 6.059076e+13 2017 1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
77583 12651460 0.094750 19 -1.0 -1.0 -1.0 1.0 2.0 -1.0 5.0 ... 443726.0 2016.0 354981.0 5423.02 0 -1.0 6.037297e+13 2017 9 3
77584 10962691 -0.064765 19 -1.0 -1.0 -1.0 2.0 3.0 -1.0 4.0 ... 316229.0 2016.0 138246.0 3980.97 0 -1.0 6.037107e+13 2017 9 3
77585 11487400 0.046755 19 -1.0 -1.0 -1.0 2.0 2.0 -1.0 7.0 ... 213230.0 2016.0 77126.0 2940.30 0 -1.0 6.037621e+13 2017 9 3
77586 12433997 0.018458 19 -1.0 -1.0 -1.0 1.0 2.0 -1.0 4.0 ... 143238.0 2016.0 95811.0 1865.65 1 12.0 6.037555e+13 2017 9 3
77587 10855745 0.002138 19 1.0 -1.0 -1.0 3.0 3.0 -1.0 6.0 ... 796324.0 2016.0 391845.0 9703.53 0 -1.0 6.037129e+13 2017 9 3
77588 12883894 1.285713 19 -1.0 -1.0 -1.0 1.0 3.0 -1.0 4.0 ... 15870.0 2016.0 6143.0 500.68 0 -1.0 6.037402e+13 2017 9 3
77589 12885530 0.003457 19 -1.0 -1.0 -1.0 1.0 2.0 -1.0 4.0 ... 60841.0 2016.0 11774.0 904.27 0 -1.0 6.037403e+13 2017 9 3
77590 11490983 -0.036067 19 -1.0 -1.0 -1.0 1.0 2.0 -1.0 8.0 ... 242126.0 2016.0 109506.0 3018.98 0 -1.0 6.037621e+13 2017 9 3
77591 12401161 -0.232215 19 -1.0 -1.0 -1.0 3.0 3.0 -1.0 9.0 ... 887197.0 2016.0 293396.0 10781.20 0 -1.0 6.037551e+13 2017 9 3
77592 11068972 0.015806 19 1.0 -1.0 -1.0 5.0 4.0 -1.0 11.0 ... 1316000.0 2016.0 526000.0 16032.16 0 -1.0 6.037108e+13 2017 9 3
77593 11490089 0.068838 19 -1.0 -1.0 -1.0 2.0 2.0 -1.0 8.0 ... 302341.0 2016.0 145026.0 3774.07 0 -1.0 6.037621e+13 2017 9 3
77594 12406123 0.000099 19 -1.0 -1.0 -1.0 2.0 2.0 -1.0 4.0 ... 108344.0 2016.0 41307.0 1580.48 0 -1.0 6.037555e+13 2017 9 3
77595 12669704 0.047467 19 -1.0 -1.0 -1.0 2.0 4.0 -1.0 8.0 ... 139547.0 2016.0 73500.0 1770.95 0 -1.0 6.037651e+13 2017 9 3
77596 12410050 0.000749 19 -1.0 -1.0 -1.0 1.0 2.0 -1.0 4.0 ... 222739.0 2016.0 157067.0 2759.92 0 -1.0 6.037553e+13 2017 9 3
77597 11953526 -0.159471 19 -1.0 -1.0 -1.0 0.0 0.0 -1.0 -1.0 ... 34124.0 2016.0 27730.0 490.51 0 -1.0 6.037187e+13 2017 9 3
77598 11608641 -0.816510 19 -1.0 -1.0 -1.0 2.0 2.0 -1.0 6.0 ... 97499.0 2016.0 72878.0 1391.79 1 14.0 6.037269e+13 2017 9 3
77599 11978003 -0.062404 19 1.0 -1.0 -1.0 2.0 3.0 -1.0 6.0 ... 603333.0 2016.0 422334.0 7312.73 0 -1.0 6.037183e+13 2017 9 3
77600 11318911 0.013268 19 -1.0 -1.0 -1.0 1.0 1.0 -1.0 5.0 ... 81706.0 2016.0 32682.0 1131.24 0 -1.0 6.037901e+13 2017 9 3
77601 10722237 0.052630 19 1.0 -1.0 -1.0 1.0 2.0 -1.0 8.0 ... 259738.0 2016.0 144804.0 3225.40 0 -1.0 6.037134e+13 2017 9 3
77602 12055795 -0.140881 19 -1.0 -1.0 -1.0 1.0 2.0 -1.0 4.0 ... 312183.0 2016.0 189703.0 3465.38 0 -1.0 6.037302e+13 2017 9 3
77603 12892446 0.319408 19 -1.0 -1.0 -1.0 3.0 4.0 -1.0 8.0 ... 332251.0 2016.0 121211.0 4225.84 0 -1.0 6.037402e+13 2017 9 3
77604 12666457 0.028733 19 -1.0 -1.0 -1.0 3.0 3.0 -1.0 10.0 ... 1414710.0 2016.0 1188359.0 16388.30 0 -1.0 6.037621e+13 2017 9 3
77605 10858613 -0.075465 19 1.0 -1.0 -1.0 2.0 2.0 -1.0 8.0 ... 346788.0 2016.0 214889.0 4231.76 0 -1.0 6.037141e+13 2017 9 3
77606 10722691 0.081196 19 1.0 -1.0 -1.0 2.0 3.0 -1.0 6.0 ... 72026.0 2016.0 25242.0 1000.70 0 -1.0 6.037135e+13 2017 9 3
77607 12412492 0.001082 19 -1.0 -1.0 -1.0 2.0 4.0 -1.0 6.0 ... 346534.0 2016.0 221068.0 4175.08 0 -1.0 6.037555e+13 2017 9 3
77608 10833991 -0.002245 20 1.0 -1.0 -1.0 3.0 3.0 -1.0 8.0 ... 379000.0 2016.0 114000.0 4685.34 0 -1.0 6.037132e+13 2017 9 3
77609 11000655 0.020615 20 -1.0 -1.0 -1.0 2.0 2.0 -1.0 6.0 ... 354621.0 2016.0 283704.0 4478.43 0 -1.0 6.037101e+13 2017 9 3
77610 17239384 0.013209 21 -1.0 -1.0 -1.0 2.0 4.0 -1.0 -1.0 ... 67205.0 2016.0 16522.0 1107.48 0 -1.0 6.111008e+13 2017 9 3
77611 12773139 0.037129 21 1.0 -1.0 -1.0 1.0 3.0 -1.0 4.0 ... 49546.0 2016.0 16749.0 876.43 0 -1.0 6.037434e+13 2017 9 3
77612 12826780 0.007204 25 -1.0 -1.0 -1.0 2.0 3.0 -1.0 6.0 ... 522000.0 2016.0 382000.0 6317.15 0 -1.0 6.037503e+13 2017 9 3

77613 rows × 63 columns


In [27]:
print('Creating x_train and y_train from df_train...' )
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode','fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train["logerror"]


Creating x_train and y_train from df_train...

In [28]:
y_mean = np.mean(y_train)
print(x_train.shape, y_train.shape)
train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)


(77613, 56) (77613,)

In [29]:
print('Creating df_test...')
sample['parcelid'] = sample['ParcelId']

print("Merging Sample with property data...")
df_test = sample.merge(prop, on='parcelid', how='left')

df_test["transactiondate"] = pd.to_datetime('2016-11-15')  # placeholder value for preliminary version
df_test["transactiondate_year"] = df_test["transactiondate"].dt.year
df_test["transactiondate_month"] = df_test["transactiondate"].dt.month
df_test['transactiondate_quarter'] = df_test['transactiondate'].dt.quarter
df_test["transactiondate"] = df_test["transactiondate"].dt.day     
x_test = df_test[train_columns]


Creating df_test...
Merging Sample with property data...

In [30]:
print('Shape of x_test:', x_test.shape)
print("Preparing x_test...")
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)


Shape of x_test: (2985217, 56)
Preparing x_test...

In [31]:
## Preprocessing
print("\nPreprocessing neural network data...")
imputer= Imputer()
imputer.fit(x_train.iloc[:, :])
x_train = imputer.transform(x_train.iloc[:, :])
imputer.fit(x_test.iloc[:, :])
x_test = imputer.transform(x_test.iloc[:, :])

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

len_x=int(x_train.shape[1])
print("len_x is:",len_x)


Preprocessing neural network data...
len_x is: 56

In [32]:
print("\nSetting up neural network model...")
nn = Sequential()
nn.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = len_x))
nn.add(PReLU())
nn.add(Dropout(.4))
nn.add(Dense(units = 160 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(units = 64 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.5))
nn.add(Dense(units = 26, kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(1, kernel_initializer='normal'))
nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))


Setting up neural network model...
/opt/conda/lib/python3.6/site-packages/tensorflow/python/util/tf_inspect.py:45: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()
  if d.decorator_argspec is not None), _inspect.getargspec(target))
/opt/conda/lib/python3.6/site-packages/tensorflow/python/util/tf_inspect.py:45: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()
  if d.decorator_argspec is not None), _inspect.getargspec(target))
/opt/conda/lib/python3.6/site-packages/tensorflow/python/util/tf_inspect.py:45: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()
  if d.decorator_argspec is not None), _inspect.getargspec(target))
/opt/conda/lib/python3.6/site-packages/tensorflow/python/util/tf_inspect.py:45: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()
  if d.decorator_argspec is not None), _inspect.getargspec(target))
/opt/conda/lib/python3.6/site-packages/tensorflow/python/util/tf_inspect.py:45: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()
  if d.decorator_argspec is not None), _inspect.getargspec(target))
/opt/conda/lib/python3.6/site-packages/tensorflow/python/util/tf_inspect.py:45: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()
  if d.decorator_argspec is not None), _inspect.getargspec(target))
/opt/conda/lib/python3.6/site-packages/tensorflow/python/util/tf_inspect.py:45: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()
  if d.decorator_argspec is not None), _inspect.getargspec(target))

In [33]:
print("\nFitting neural network model...")
nn.fit(np.array(x_train), np.array(y_train), batch_size = 32, epochs = 70, verbose=2)

print("\nPredicting with neural network model...")
#print("x_test.shape:",x_test.shape)
y_pred_ann = nn.predict(x_test)


Fitting neural network model...
Epoch 1/70
14s - loss: 0.0732
Epoch 2/70
14s - loss: 0.0698
Epoch 3/70
14s - loss: 0.0698
Epoch 4/70
14s - loss: 0.0697
Epoch 5/70
14s - loss: 0.0697
Epoch 6/70
14s - loss: 0.0696
Epoch 7/70
14s - loss: 0.0696
Epoch 8/70
14s - loss: 0.0695
Epoch 9/70
14s - loss: 0.0695
Epoch 10/70
13s - loss: 0.0695
Epoch 11/70
14s - loss: 0.0694
Epoch 12/70
14s - loss: 0.0694
Epoch 13/70
14s - loss: 0.0694
Epoch 14/70
14s - loss: 0.0694
Epoch 15/70
14s - loss: 0.0693
Epoch 16/70
14s - loss: 0.0694
Epoch 17/70
14s - loss: 0.0693
Epoch 18/70
14s - loss: 0.0693
Epoch 19/70
14s - loss: 0.0693
Epoch 20/70
14s - loss: 0.0693
Epoch 21/70
14s - loss: 0.0693
Epoch 22/70
14s - loss: 0.0693
Epoch 23/70
14s - loss: 0.0693
Epoch 24/70
14s - loss: 0.0692
Epoch 25/70
14s - loss: 0.0693
Epoch 26/70
14s - loss: 0.0693
Epoch 27/70
14s - loss: 0.0692
Epoch 28/70
14s - loss: 0.0692
Epoch 29/70
15s - loss: 0.0692
Epoch 30/70
14s - loss: 0.0692
Epoch 31/70
16s - loss: 0.0692
Epoch 32/70
14s - loss: 0.0692
Epoch 33/70
14s - loss: 0.0692
Epoch 34/70
14s - loss: 0.0692
Epoch 35/70
14s - loss: 0.0692
Epoch 36/70
14s - loss: 0.0691
Epoch 37/70
14s - loss: 0.0692
Epoch 38/70
14s - loss: 0.0692
Epoch 39/70
14s - loss: 0.0692
Epoch 40/70
14s - loss: 0.0691
Epoch 41/70
14s - loss: 0.0691
Epoch 42/70
14s - loss: 0.0691
Epoch 43/70
14s - loss: 0.0691
Epoch 44/70
14s - loss: 0.0691
Epoch 45/70
14s - loss: 0.0691
Epoch 46/70
14s - loss: 0.0691
Epoch 47/70
14s - loss: 0.0691
Epoch 48/70
14s - loss: 0.0691
Epoch 49/70
14s - loss: 0.0691
Epoch 50/70
14s - loss: 0.0691
Epoch 51/70
14s - loss: 0.0691
Epoch 52/70
14s - loss: 0.0691
Epoch 53/70
14s - loss: 0.0691
Epoch 54/70
14s - loss: 0.0691
Epoch 55/70
14s - loss: 0.0691
Epoch 56/70
14s - loss: 0.0690
Epoch 57/70
14s - loss: 0.0691
Epoch 58/70
14s - loss: 0.0691
Epoch 59/70
14s - loss: 0.0690
Epoch 60/70
14s - loss: 0.0691
Epoch 61/70
14s - loss: 0.0691
Epoch 62/70
14s - loss: 0.0691
Epoch 63/70
14s - loss: 0.0690
Epoch 64/70
14s - loss: 0.0690
Epoch 65/70
14s - loss: 0.0690
Epoch 66/70
15s - loss: 0.0690
Epoch 67/70
14s - loss: 0.0691
Epoch 68/70
14s - loss: 0.0691
Epoch 69/70
14s - loss: 0.0690
Epoch 70/70
14s - loss: 0.0690

Predicting with neural network model...

In [34]:
#transaction dateprint( "\nPreparing results for write..." )
nn_pred = y_pred_ann.flatten()
print( "Type of nn_pred is ", type(nn_pred) )
print( "Shape of nn_pred is ", nn_pred.shape )


Type of nn_pred is  <class 'numpy.ndarray'>
Shape of nn_pred is  (2985217,)

In [35]:
# Cleanup
del train
del prop
del sample
del x_train
del x_test
del df_train
del df_test
del y_pred_ann
gc.collect()


Out[35]:
14254

In [36]:
################
################
##    OLS     ##
################
################

# This section is derived from the1owl's notebook:
#    https://www.kaggle.com/the1owl/primer-for-the-zillow-pred-approach
# which I (Andy Harless) updated and made into a script:
#    https://www.kaggle.com/aharless/updated-script-version-of-the1owl-s-basic-ols

In [37]:
np.random.seed(17)
random.seed(17)

print( "\n\nProcessing data for OLS ...")

train = pd.read_csv("../input/train_2017.csv", parse_dates=["transactiondate"])
properties = pd.read_csv("../input/properties_2017.csv")
submission = pd.read_csv("../input/sample_submission.csv")
print(len(train),len(properties),len(submission))

def get_features(df):
    df["transactiondate"] = pd.to_datetime(df["transactiondate"])
    df["transactiondate_year"] = df["transactiondate"].dt.year
    df["transactiondate_month"] = df["transactiondate"].dt.month
    df['transactiondate'] = df['transactiondate'].dt.quarter
    df = df.fillna(-1.0)
    return df



Processing data for OLS ...
/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (49) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
77613 2985217 2985217

In [38]:
def MAE(y, ypred):
    #logerror=log(Zestimate)−log(SalePrice)
    return np.sum([abs(y[i]-ypred[i]) for i in range(len(y))]) / len(y)

In [39]:
train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
test = pd.merge(submission, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = [] #memory

exc = [train.columns[c] for c in range(len(train.columns)) if train.dtypes[c] == 'O'] + ['logerror','parcelid']
col = [c for c in train.columns if c not in exc]

In [40]:
train = get_features(train[col])
test['transactiondate'] = '2016-01-01' #should use the most common training date
test = get_features(test[col])


/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

In [41]:
print("\nFitting OLS...")
reg = LinearRegression(n_jobs=-1)
reg.fit(train, y); print('fit...')
print(MAE(y, reg.predict(train)))
train = [];  y = [] #memory

test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']


Fitting OLS...
fit...
0.0709592203753

In [42]:
########################
########################
##  Combine and Save  ##
########################
########################


##### COMBINE PREDICTIONS

print( "\nCombining XGBoost, LightGBM, NN, and baseline predicitons ..." )
lgb_weight = 1 - XGB_WEIGHT - BASELINE_WEIGHT - NN_WEIGHT - OLS_WEIGHT 
lgb_weight0 = lgb_weight / (1 - OLS_WEIGHT)
xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT)
baseline_weight0 =  BASELINE_WEIGHT / (1 - OLS_WEIGHT)
nn_weight0 = NN_WEIGHT / (1 - OLS_WEIGHT)
pred0 = 0
pred0 += xgb_weight0*xgb_pred
pred0 += baseline_weight0*BASELINE_PRED
pred0 += lgb_weight0*p_test
pred0 += nn_weight0*nn_pred


Combining XGBoost, LightGBM, NN, and baseline predicitons ...

In [43]:
print( "\nPredicting with OLS and combining with XGB/LGB/NN/baseline predicitons: ..." )
for i in range(len(test_dates)):
    test['transactiondate'] = test_dates[i]
    pred = FUDGE_FACTOR * ( OLS_WEIGHT*reg.predict(get_features(test)) + (1-OLS_WEIGHT)*pred0 )
    submission[test_columns[i]] = [float(format(x, '.4f')) for x in pred]
    print('predict...', i)
    
print( "\nWriting results to disk ..." )
submission.to_csv('1003_try.csv', index=False , float_format='%.4f')
print( "\nFinished ...")


Predicting with OLS and combining with XGB/LGB/NN/baseline predicitons: ...
predict... 0
predict... 1
predict... 2
predict... 3
predict... 4
predict... 5

Writing results to disk ...

Finished ...

In [44]:
train_c = train.copy()
train_c['trans_month'] = train_c['transactiondate'].dt.month
cnt_srs = train_c['trans_month'].value_counts()
plt.figure(figsize=(12,6))
sns.barplot(cnt_srs.index, cnt_srs.values,alpha = 0.8)
plt.xticks(rotation='vertical')
plt.xlabel('Month of transaction', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-44-aa1bb3b2923c> in <module>()
      1 train_c = train.copy()
----> 2 train_c['trans_month'] = train_c['transactiondate'].dt.month
      3 cnt_srs = train_c['trans_month'].value_counts()
      4 plt.figure(figsize=(12,6))
      5 sns.barplot(cnt_srs.index, cnt_srs.values,alpha = 0.8)

TypeError: list indices must be integers or slices, not str

In [45]:
#check number of Nulls in property dataset
missing_df = properties.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name','missing_count']
missing_df['missing_ratio'] = missing_df['missing_count'] / properties.shape[0]
missing_filtered = missing_df.loc[missing_df['missing_ratio']>0.99]
#eliminate columns that has too many null
eliminate_list = missing_filtered['column_name'].values
properties = properties.drop(eliminate_list,axis=1)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-45-afc9c8ad2429> in <module>()
      1 #check number of Nulls in property dataset
----> 2 missing_df = properties.isnull().sum(axis=0).reset_index()
      3 missing_df.columns = ['column_name','missing_count']
      4 missing_df['missing_ratio'] = missing_df['missing_count'] / properties.shape[0]
      5 missing_filtered = missing_df.loc[missing_df['missing_ratio']>0.99]

AttributeError: 'list' object has no attribute 'isnull'

In [46]:
# for c in properties.columns.values:
#     plt.figure(figsize=(12,6))
#     sns.countplot(x=c, data=properties)
#     plt.ylabel('Count', fontsize=12)
#     plt.xlabel(c, fontsize=12)
#     plt.xticks(rotation='vertical')
#     plt.show()
plt.figure(figsize=(12,6))
sns.countplot(x='finishedsquarefeet12', data=properties)
plt.ylabel('Count', fontsize=12)
plt.xlabel("finishedsquarefeet12", fontsize=12)
plt.xticks(rotation='vertical')
plt.show()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-46-ff72ff22c48d> in <module>()
      7 #     plt.show()
      8 plt.figure(figsize=(12,6))
----> 9 sns.countplot(x='finishedsquarefeet12', data=properties)
     10 plt.ylabel('Count', fontsize=12)
     11 plt.xlabel("finishedsquarefeet12", fontsize=12)

/opt/conda/lib/python3.6/site-packages/seaborn/categorical.py in countplot(x, y, hue, data, order, hue_order, orient, color, palette, saturation, ax, **kwargs)
   3256                           estimator, ci, n_boot, units,
   3257                           orient, color, palette, saturation,
-> 3258                           errcolor)
   3259 
   3260     plotter.value_label = "count"

/opt/conda/lib/python3.6/site-packages/seaborn/categorical.py in __init__(self, x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, orient, color, palette, saturation, errcolor, errwidth, capsize)
   1541         """Initialize the plotter."""
   1542         self.establish_variables(x, y, hue, data, orient,
-> 1543                                  order, hue_order, units)
   1544         self.establish_colors(color, palette, saturation)
   1545         self.estimate_statistic(estimator, ci, n_boot)

/opt/conda/lib/python3.6/site-packages/seaborn/categorical.py in establish_variables(self, x, y, hue, data, orient, order, hue_order, units)
    140             # See if we need to get variables from `data`
    141             if data is not None:
--> 142                 x = data.get(x, x)
    143                 y = data.get(y, y)
    144                 hue = data.get(hue, hue)

AttributeError: 'list' object has no attribute 'get'
<matplotlib.figure.Figure at 0x7f5f9563f2b0>

In [47]:
print('t')


t

In [48]:
#convert datatype
def convert_datatype(dataframe):
    for c, dtype in zip(dataframe.columns, dataframe.dtypes):
        if dtype == np.float64:
            dataframe[c] = dataframe[c].astype(np.float32)
        if dtype == np.int64:
            dataframe[c] = dataframe[c].astype(np.int32)
            
convert_datatype(properties)
convert_datatype(test)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-48-54da31ec157d> in <module>()
      7             dataframe[c] = dataframe[c].astype(np.int32)
      8 
----> 9 convert_datatype(properties)
     10 convert_datatype(test)

<ipython-input-48-54da31ec157d> in convert_datatype(dataframe)
      1 #convert datatype
      2 def convert_datatype(dataframe):
----> 3     for c, dtype in zip(dataframe.columns, dataframe.dtypes):
      4         if dtype == np.float64:
      5             dataframe[c] = dataframe[c].astype(np.float32)

AttributeError: 'list' object has no attribute 'columns'

In [49]:
#living area proportions 
properties['living_area_prop'] = properties['calculatedfinishedsquarefeet'] / \
properties['lotsizesquarefeet']
#tax value ratio
properties['value_ratio'] = properties['taxvaluedollarcnt'] / properties['taxamount']
#tax value proportions
properties['value_prop'] = properties['structuretaxvaluedollarcnt'] /\
properties['landtaxvaluedollarcnt']


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-49-dd88467d1374> in <module>()
      1 #living area proportions
----> 2 properties['living_area_prop'] = properties['calculatedfinishedsquarefeet'] / properties['lotsizesquarefeet']
      3 #tax value ratio
      4 properties['value_ratio'] = properties['taxvaluedollarcnt'] / properties['taxamount']
      5 #tax value proportions

TypeError: list indices must be integers or slices, not str

In [50]:
#mergeing datasets
df_train = train.merge(properties, how='left', on='parcelid')
df_test = test.merge(properties, how='left', on='parcelid')


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-50-e22b4d823fed> in <module>()
      1 #mergeing datasets
----> 2 df_train = train.merge(properties, how='left', on='parcelid')
      3 df_test = test.merge(properties, how='left', on='parcelid')

AttributeError: 'list' object has no attribute 'merge'

In [51]:
print(df_train.shape,train.shape,properties.shape)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-51-91fc6deb3e8b> in <module>()
----> 1 print(df_train.shape,train.shape,properties.shape)

NameError: name 'df_train' is not defined

In [52]:
#change missing values into 0
#change categorical into to numerical

def convert_label(dataframe):
    lbI = LabelEncoder()
    for c in dataframe.columns:
        dataframe[c] = dataframe[c].fillna(0)
        if dataframe[c].dtype == 'object':
            lbI.fit(list(dataframe[c].values))
            dataframe[c] = lbI.transform(list(dataframe[c].values))

convert_label(df_train)
convert_label(df_test)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-52-d9afe508ba44> in <module>()
     10             dataframe[c] = lbI.transform(list(dataframe[c].values))
     11 
---> 12 convert_label(df_train)
     13 convert_label(df_test)
     14 

NameError: name 'df_train' is not defined

In [53]:
#re-arranging 
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
                         'propertycountylandusecode', ], axis=1)
x_test = df_test.drop(['parcelid', 'propertyzoningdesc',
                       'propertycountylandusecode', '201610', '201611', 
                       '201612', '201710', '201711', '201712'], axis = 1) 
x_train = x_train.values
y_train = df_train['logerror'].values


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-53-32e9c69b9b38> in <module>()
      1 #re-arranging
----> 2 x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
      3                          'propertycountylandusecode', ], axis=1)
      4 x_test = df_test.drop(['parcelid', 'propertyzoningdesc',
      5                        'propertycountylandusecode', '201610', '201611',

NameError: name 'df_train' is not defined

In [54]:
from datetime import datetime