In [1]:
# Parameters
FUDGE_FACTOR = 1.1200 # Multiply forecasts by this
XGB_WEIGHT = 0.6200
BASELINE_WEIGHT = 0.0100
OLS_WEIGHT = 0.0620
NN_WEIGHT = 0.0800
XGB1_WEIGHT = 0.8000 # Weight of first in combination of two XGB models
BASELINE_PRED = 0.0115 # Baseline based on mean of training data, per Oleg
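# A quick illustrative check of the blend implied by these weights (a sketch, not part of
# the original flow): the LightGBM weight is whatever remains after the others, so all
# model weights plus the baseline sum to 1 before the OLS blend and fudge factor below.
implied_lgb_weight = 1 - XGB_WEIGHT - BASELINE_WEIGHT - NN_WEIGHT - OLS_WEIGHT
print("Implied LightGBM weight:", round(implied_lgb_weight, 4))  # 0.228 with the values above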
In [2]:
# Import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
import random
import lightgbm as lgb
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
In [3]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.noise import GaussianDropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
In [4]:
# Load data
train = pd.read_csv('../input/train_2017.csv',parse_dates = ["transactiondate"])
properties = pd.read_csv('../input/properties_2017.csv')
test = pd.read_csv('../input/sample_submission.csv')
test = test.rename(columns = {'ParcelId': 'parcelid'})
In [5]:
print("Training Size:" + str(train.shape))
print("Property Size:" + str(properties.shape))
print("Sample Size:" + str(test.shape))
In [6]:
################
################
## LightGBM ##
################
################
# This section is (I think) originally derived from SIDHARTH's script:
# https://www.kaggle.com/sidharthkumar/trying-lightgbm
# which was forked and tuned by Yuqing Xue:
# https://www.kaggle.com/yuqingxue/lightgbm-85-97
# and updated by Andy Harless:
# https://www.kaggle.com/aharless/lightgbm-with-outliers-remaining
# and a lot of additional changes have happened since then
In [7]:
##### PROCESS DATA FOR LIGHTGBM
print( "\nProcessing data for LightGBM ..." )
for c, dtype in zip(properties.columns, properties.dtypes):
    if dtype == np.float64:
        properties[c] = properties[c].astype(np.float32)

df_train = train.merge(properties, how='left', on='parcelid')
df_train.fillna(df_train.median(), inplace=True)
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc',
'propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
In [8]:
train_columns = x_train.columns
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
del df_train; gc.collect()
x_train = x_train.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)
In [9]:
##### RUN LIGHTGBM
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1' # or 'mae'
params['sub_feature'] = 0.345 # feature_fraction (small values => use very different submodels)
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512 # num_leaf
params['min_data'] = 500 # min_data_in_leaf
params['min_hessian'] = 0.05 # min_sum_hessian_in_leaf
params['verbose'] = 0
params['feature_fraction_seed'] = 2
params['bagging_seed'] = 3
np.random.seed(0)
random.seed(0)
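# Optional sanity check (a sketch, not part of the original flow): cross-validate the
# parameters above to see how the L1 error looks at the round count used below.
# lgb.cv returns a dict of per-round metric lists; the exact key name varies by version.
cv_results = lgb.cv(params, d_train, num_boost_round=430, nfold=3)
mean_key = [k for k in cv_results if k.endswith('-mean')][0]
print("CV %s at final round: %.6f" % (mean_key, cv_results[mean_key][-1]))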
In [10]:
print("\nFitting LightGBM model ...")
clf = lgb.train(params, d_train, 430)
In [11]:
del d_train; gc.collect()
del x_train; gc.collect()
In [12]:
print("\nPrepare for LightGBM prediction ...")
print(" Read sample file ...")
sample = pd.read_csv('../input/sample_submission.csv')
print(" ...")
sample['parcelid'] = sample['ParcelId']
print(" Merge with property data ...")
df_test = sample.merge(properties, on='parcelid', how='left')
print(" ...")
del sample, properties; gc.collect()
print(" ...")
#df_test['Ratio_1'] = df_test['taxvaluedollarcnt']/df_test['taxamount']
x_test = df_test[train_columns]
print(" ...")
del df_test; gc.collect()
print(" Preparing x_test...")
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
print(" ...")
x_test = x_test.values.astype(np.float32, copy=False)
print("\nStart LightGBM prediction ...")
p_test = clf.predict(x_test)
del x_test; gc.collect()
print( "\nUnadjusted LightGBM predictions:" )
print( pd.DataFrame(p_test).head() )
In [13]:
################
################
## XGBoost ##
################
################
# This section is (I think) originally derived from Infinite Wing's script:
# https://www.kaggle.com/infinitewing/xgboost-without-outliers-lb-0-06463
# inspired by this thread:
# https://www.kaggle.com/c/zillow-prize-1/discussion/33710
# but the code has gone through a lot of changes since then
In [14]:
print( "\nRe-reading properties file ...")
properties = pd.read_csv('../input/properties_2017.csv')
In [15]:
##### PROCESS DATA FOR XGBOOST
print( "\nProcessing data for XGBoost ...")
for c in properties.columns:
    properties[c] = properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))
train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)
In [16]:
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))
In [17]:
# drop outliers
train_df = train_df[train_df.logerror > -0.4]
train_df = train_df[train_df.logerror < 0.419]
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)
print('After removing outliers:')
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))
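# Optional check (a sketch): how much data does the outlier clipping above remove?
n_before, n_after = len(train), len(train_df)
print("Rows kept after outlier removal: %d of %d (%.1f%%)" % (n_after, n_before, 100.0 * n_after / n_before))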
In [18]:
##### RUN XGBOOST
print("\nSetting up data for XGBoost ...")
# xgboost params
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,
    'alpha': 0.4,
    'base_score': y_mean,
    'silent': 1
}
In [19]:
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)
num_boost_rounds = 250
print("num_boost_rounds="+str(num_boost_rounds))
# train model
print( "\nTraining XGBoost ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)
print( "\nPredicting with XGBoost ...")
xgb_pred1 = model.predict(dtest)
print( "\nFirst XGBoost predictions:" )
print( pd.DataFrame(xgb_pred1).head() )
In [20]:
print("\nSetting up data for XGBoost ...")
# xgboost params
xgb_params = {
    'eta': 0.033,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'silent': 1
}
num_boost_rounds = 150
print("num_boost_rounds="+str(num_boost_rounds))
print( "\nTraining XGBoost again ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)
print( "\nPredicting with XGBoost again ...")
xgb_pred2 = model.predict(dtest)
print( "\nSecond XGBoost predictions:" )
print( pd.DataFrame(xgb_pred2).head() )
In [21]:
##### COMBINE XGBOOST RESULTS
xgb_pred = XGB1_WEIGHT*xgb_pred1 + (1-XGB1_WEIGHT)*xgb_pred2
#xgb_pred = xgb_pred1
print( "\nCombined XGBoost predictions:" )
print( pd.DataFrame(xgb_pred).head() )
del train_df
del x_train
del x_test
del properties
del dtest
del dtrain
del xgb_pred1
del xgb_pred2
gc.collect()
In [22]:
######################
######################
## Neural Network ##
######################
######################
# Neural network copied from this script:
# https://www.kaggle.com/aharless/keras-neural-network-lb-06492 (version 20)
# which was built on the skeleton in this notebook:
# https://www.kaggle.com/prasunmishra/ann-using-keras
In [23]:
# Read in data for neural network
print( "\n\nProcessing data for Neural Network ...")
print('\nLoading train, prop and sample data...')
train = pd.read_csv("../input/train_2017.csv", parse_dates=["transactiondate"])
prop = pd.read_csv('../input/properties_2017.csv')
sample = pd.read_csv('../input/sample_submission.csv')
In [24]:
print('Fitting Label Encoder on properties...')
for c in prop.columns:
    prop[c] = prop[c].fillna(-1)
    if prop[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(prop[c].values))
        prop[c] = lbl.transform(list(prop[c].values))
In [25]:
print('Creating training set...')
df_train = train.merge(prop, how='left', on='parcelid')
df_train["transactiondate"] = pd.to_datetime(df_train["transactiondate"])
df_train["transactiondate_year"] = df_train["transactiondate"].dt.year
df_train["transactiondate_month"] = df_train["transactiondate"].dt.month
df_train['transactiondate_quarter'] = df_train['transactiondate'].dt.quarter
df_train["transactiondate"] = df_train["transactiondate"].dt.day
In [26]:
print('Filling NA/NaN values...' )
df_train = df_train.fillna(-1.0)
In [27]:
print('Creating x_train and y_train from df_train...' )
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode','fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train["logerror"]
In [28]:
y_mean = np.mean(y_train)
print(x_train.shape, y_train.shape)
train_columns = x_train.columns
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
In [29]:
print('Creating df_test...')
sample['parcelid'] = sample['ParcelId']
print("Merging Sample with property data...")
df_test = sample.merge(prop, on='parcelid', how='left')
df_test["transactiondate"] = pd.to_datetime('2016-11-15') # placeholder value for preliminary version
df_test["transactiondate_year"] = df_test["transactiondate"].dt.year
df_test["transactiondate_month"] = df_test["transactiondate"].dt.month
df_test['transactiondate_quarter'] = df_test['transactiondate'].dt.quarter
df_test["transactiondate"] = df_test["transactiondate"].dt.day
x_test = df_test[train_columns]
In [30]:
print('Shape of x_test:', x_test.shape)
print("Preparing x_test...")
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
In [31]:
## Preprocessing
print("\nPreprocessing neural network data...")
imputer = Imputer()
imputer.fit(x_train.iloc[:, :])
x_train = imputer.transform(x_train.iloc[:, :])
imputer.fit(x_test.iloc[:, :])
x_test = imputer.transform(x_test.iloc[:, :])
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
len_x = int(x_train.shape[1])
print("len_x is:", len_x)
In [32]:
print("\nSetting up neural network model...")
nn = Sequential()
nn.add(Dense(units=400, kernel_initializer='normal', input_dim=len_x))
nn.add(PReLU())
nn.add(Dropout(.4))
nn.add(Dense(units=160, kernel_initializer='normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(units=64, kernel_initializer='normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.5))
nn.add(Dense(units=26, kernel_initializer='normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(1, kernel_initializer='normal'))
nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))
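# Optional: print the layer stack and parameter counts before fitting.
nn.summary()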
In [33]:
print("\nFitting neural network model...")
nn.fit(np.array(x_train), np.array(y_train), batch_size=32, epochs=70, verbose=2)
print("\nPredicting with neural network model...")
#print("x_test.shape:",x_test.shape)
y_pred_ann = nn.predict(x_test)
In [34]:
print( "\nPreparing results for write..." )
nn_pred = y_pred_ann.flatten()
print( "Type of nn_pred is ", type(nn_pred) )
print( "Shape of nn_pred is ", nn_pred.shape )
In [35]:
# Cleanup
del train
del prop
del sample
del x_train
del x_test
del df_train
del df_test
del y_pred_ann
gc.collect()
In [36]:
################
################
## OLS ##
################
################
# This section is derived from the1owl's notebook:
# https://www.kaggle.com/the1owl/primer-for-the-zillow-pred-approach
# which I (Andy Harless) updated and made into a script:
# https://www.kaggle.com/aharless/updated-script-version-of-the1owl-s-basic-ols
In [37]:
np.random.seed(17)
random.seed(17)
print( "\n\nProcessing data for OLS ...")
train = pd.read_csv("../input/train_2017.csv", parse_dates=["transactiondate"])
properties = pd.read_csv("../input/properties_2017.csv")
submission = pd.read_csv("../input/sample_submission.csv")
print(len(train),len(properties),len(submission))
def get_features(df):
    df["transactiondate"] = pd.to_datetime(df["transactiondate"])
    df["transactiondate_year"] = df["transactiondate"].dt.year
    df["transactiondate_month"] = df["transactiondate"].dt.month
    df['transactiondate'] = df['transactiondate'].dt.quarter
    df = df.fillna(-1.0)
    return df
In [38]:
def MAE(y, ypred):
    # logerror = log(Zestimate) - log(SalePrice)
    return np.sum([abs(y[i] - ypred[i]) for i in range(len(y))]) / len(y)
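# A tiny hand-worked check of the metric defined above:
# |0.1 - 0| = 0.1 and |-0.2 - 0| = 0.2, so the mean absolute error is 0.15.
print(MAE(np.array([0.1, -0.2]), np.array([0.0, 0.0])))  # expected: 0.15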
In [39]:
train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
test = pd.merge(submission, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = [] #memory
exc = [train.columns[c] for c in range(len(train.columns)) if train.dtypes[c] == 'O'] + ['logerror','parcelid']
col = [c for c in train.columns if c not in exc]
In [40]:
train = get_features(train[col])
test['transactiondate'] = '2016-01-01' #should use the most common training date
test = get_features(test[col])
In [41]:
print("\nFitting OLS...")
reg = LinearRegression(n_jobs=-1)
reg.fit(train, y); print('fit...')
print(MAE(y, reg.predict(train)))
train = []; y = [] #memory
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']
In [42]:
########################
########################
## Combine and Save ##
########################
########################
##### COMBINE PREDICTIONS
print( "\nCombining XGBoost, LightGBM, NN, and baseline predicitons ..." )
lgb_weight = 1 - XGB_WEIGHT - BASELINE_WEIGHT - NN_WEIGHT - OLS_WEIGHT
lgb_weight0 = lgb_weight / (1 - OLS_WEIGHT)
xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT)
baseline_weight0 = BASELINE_WEIGHT / (1 - OLS_WEIGHT)
nn_weight0 = NN_WEIGHT / (1 - OLS_WEIGHT)
pred0 = 0
pred0 += xgb_weight0*xgb_pred
pred0 += baseline_weight0*BASELINE_PRED
pred0 += lgb_weight0*p_test
pred0 += nn_weight0*nn_pred
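# Sanity check (optional): the rescaled non-OLS weights should sum to 1, so the final
# blend OLS_WEIGHT*ols_pred + (1-OLS_WEIGHT)*pred0 is a true weighted average.
print("Sum of rescaled weights:", xgb_weight0 + baseline_weight0 + lgb_weight0 + nn_weight0)  # ~1.0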
In [43]:
print( "\nPredicting with OLS and combining with XGB/LGB/NN/baseline predicitons: ..." )
for i in range(len(test_dates)):
test['transactiondate'] = test_dates[i]
pred = FUDGE_FACTOR * ( OLS_WEIGHT*reg.predict(get_features(test)) + (1-OLS_WEIGHT)*pred0 )
submission[test_columns[i]] = [float(format(x, '.4f')) for x in pred]
print('predict...', i)
print( "\nWriting results to disk ..." )
submission.to_csv('1003_try.csv', index=False , float_format='%.4f')
print( "\nFinished ...")
In [44]:
# Re-read the 2017 training data (the 'train' variable was emptied above to free memory)
train = pd.read_csv("../input/train_2017.csv", parse_dates=["transactiondate"])
train_c = train.copy()
train_c['trans_month'] = train_c['transactiondate'].dt.month
cnt_srs = train_c['trans_month'].value_counts()
plt.figure(figsize=(12,6))
sns.barplot(cnt_srs.index, cnt_srs.values, alpha=0.8)
plt.xticks(rotation='vertical')
plt.xlabel('Month of transaction', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()
In [45]:
# check the number of nulls in the property dataset
# (re-read properties, since it was emptied above to free memory)
properties = pd.read_csv('../input/properties_2017.csv')
missing_df = properties.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name','missing_count']
missing_df['missing_ratio'] = missing_df['missing_count'] / properties.shape[0]
missing_filtered = missing_df.loc[missing_df['missing_ratio'] > 0.99]
# eliminate columns that have too many nulls
eliminate_list = missing_filtered['column_name'].values
properties = properties.drop(eliminate_list, axis=1)
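# Optional: list which columns were dropped for being almost entirely missing.
print("Dropped %d columns: %s" % (len(eliminate_list), list(eliminate_list)))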
In [46]:
# for c in properties.columns.values:
# plt.figure(figsize=(12,6))
# sns.countplot(x=c, data=properties)
# plt.ylabel('Count', fontsize=12)
# plt.xlabel(c, fontsize=12)
# plt.xticks(rotation='vertical')
# plt.show()
plt.figure(figsize=(12,6))
sns.countplot(x='finishedsquarefeet12', data=properties)
plt.ylabel('Count', fontsize=12)
plt.xlabel("finishedsquarefeet12", fontsize=12)
plt.xticks(rotation='vertical')
plt.show()
In [48]:
#convert datatype
def convert_datatype(dataframe):
    for c, dtype in zip(dataframe.columns, dataframe.dtypes):
        if dtype == np.float64:
            dataframe[c] = dataframe[c].astype(np.float32)
        if dtype == np.int64:
            dataframe[c] = dataframe[c].astype(np.int32)
convert_datatype(properties)
# Re-read the sample submission as the test index: the 'test' variable was overwritten
# by the OLS section above and no longer has a 'parcelid' column to merge on.
test = pd.read_csv('../input/sample_submission.csv')
test = test.rename(columns={'ParcelId': 'parcelid'})
convert_datatype(test)
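# Optional: the float64 -> float32 cast halves the footprint of those columns; a rough
# size check (memory_usage with deep=True is approximate for object columns):
print("properties: %.1f MB after downcasting" % (properties.memory_usage(deep=True).sum() / 1024 ** 2))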
In [49]:
#living area proportions
properties['living_area_prop'] = properties['calculatedfinishedsquarefeet'] / \
properties['lotsizesquarefeet']
#tax value ratio
properties['value_ratio'] = properties['taxvaluedollarcnt'] / properties['taxamount']
#tax value proportions
properties['value_prop'] = properties['structuretaxvaluedollarcnt'] /\
properties['landtaxvaluedollarcnt']
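# Note (a precaution, not in the original): these ratios can produce inf where the
# denominator is 0; replacing infinities with NaN lets the later fill handle them.
properties = properties.replace([np.inf, -np.inf], np.nan)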
In [50]:
# merging datasets
df_train = train.merge(properties, how='left', on='parcelid')
df_test = test.merge(properties, how='left', on='parcelid')
In [51]:
print(df_train.shape,train.shape,properties.shape)
In [52]:
# change missing values to 0
# change categorical columns to numerical
def convert_label(dataframe):
    lbl = LabelEncoder()
    for c in dataframe.columns:
        dataframe[c] = dataframe[c].fillna(0)
        if dataframe[c].dtype == 'object':
            lbl.fit(list(dataframe[c].values))
            dataframe[c] = lbl.transform(list(dataframe[c].values))
convert_label(df_train)
convert_label(df_test)
In [53]:
#re-arranging
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc',
'propertycountylandusecode', ], axis=1)
x_test = df_test.drop(['parcelid', 'propertyzoningdesc',
'propertycountylandusecode', '201610', '201611',
'201612', '201710', '201711', '201712'], axis = 1)
x_train = x_train.values
y_train = df_train['logerror'].values
In [54]:
from datetime import datetime