In [ ]:
# submit History
# 0.0652418 : initial submit
# 0.0649065 : some change?
# 0.0979253 : fillna with mean value, convert_outlier
# 0.0984987 : remove convert_outlier
# 0.0654573 : remove fillna with mean value
# ? : TODO apply convert_outlier to logerror

In [64]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

%matplotlib inline

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

In [65]:
def draw_scatter_plot(df, col_name):
    np_array = df[col_name].values
    
    plt.figure(figsize=(8,6))
    plt.scatter(range(len(np_array)), np.sort(np_array))
    plt.xlabel('index', fontsize=12)
    plt.ylabel(col_name, fontsize=12)
    plt.show()
    
    
def draw_dist_plot(df, col_name):
    np_array = df[col_name].values
    
    plt.figure(figsize=(12,8))
    sns.distplot(np_array, bins=50, kde=False)
    plt.xlabel(col_name, fontsize=12)
    plt.ylabel('count', fontsize=12)
    plt.show()
    
    
def draw_count_plot(df, col_name, title='plot'):
    plt.figure(figsize=(12,8))
    sns.countplot(data=df, x=col_name)
    plt.xticks(rotation='vertical')
    plt.xlabel(col_name, fontsize=12)
    plt.ylabel('count', fontsize=12)
    plt.title(title, fontsize=15)
    plt.show()
    
    
def draw_box_plot(df, x_col, y_col):
    plt.figure(figsize=(12,8))
    sns.boxplot(data=df, x=x_col, y=y_col)
    plt.xlabel(x_col, fontsize=12)
    plt.ylabel(y_col, fontsize=12)
    plt.show()
    
    
def draw_violin_plot(df, x_col, y_col):
    plt.figure(figsize=(12,8))
    sns.violinplot(data=df, x=x_col, y=y_col)
    plt.xlabel(x_col, fontsize=12)
    plt.ylabel(y_col, fontsize=12)
    plt.show()
    

def draw_plots(df, col_name):
    draw_scatter_plot(df, col_name)
    draw_dist_plot(df, col_name)

    
def draw_np_array_scatter_plot(np_array, col_name):
    plt.figure(figsize=(8,6))
    plt.scatter(range(len(np_array)), np.sort(np_array))
    plt.xlabel('index', fontsize=12)
    plt.ylabel(col_name, fontsize=12)
    plt.show()
    
    
def draw_np_array_dist_plot(np_array, col_name):
    plt.figure(figsize=(12,8))
    sns.distplot(np_array, bins=50, kde=False)
    plt.xlabel(col_name, fontsize=12)
    plt.ylabel('count', fontsize=12)
    plt.show()
    

def convert_outlier_value(df, col_name, upper_percentile=99.0, lower_percentile=1.0):
    np_array = df[col_name].values
    
    ulimit = np.percentile(np_array, upper_percentile)
    llimit = np.percentile(np_array, lower_percentile)
    print('upper limit :', ulimit, ', lower limit :', llimit)
    
    # convert
    df[col_name].loc[df[col_name] > ulimit] = ulimit
    df[col_name].loc[df[col_name] < llimit] = llimit

In [66]:
print('Loading data ...')

train_df = pd.read_csv('input/train_2016_v2.csv', parse_dates=['transactiondate'])
prop_df = pd.read_csv('input/properties_2016.csv')
sample = pd.read_csv('input/sample_submission.csv')


Loading data ...
/home/voyageth/develop/anaconda3/envs/kaggle/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (22,32,34,49,55) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [67]:
print('Binding to float32')

for c, dtype in zip(prop_df.columns, prop_df.dtypes):
    if dtype == np.float64:
        prop_df[c] = prop_df[c].astype(np.float32)


Binding to float32

In [68]:
print('df_train.shape :', train_df.shape)
display(train_df.head())


df_train.shape : (90275, 3)
parcelid logerror transactiondate
0 11016594 0.0276 2016-01-01
1 14366692 -0.1684 2016-01-01
2 12098116 -0.0040 2016-01-01
3 12643413 0.0218 2016-01-02
4 14432541 -0.0050 2016-01-02

In [69]:
draw_plots(train_df, 'logerror')



In [70]:
# convert_outlier_value(train_df, 'logerror', lower_percentile=1, upper_percentile=99)

In [71]:
draw_plots(train_df, 'logerror')



In [72]:
missing_df = prop_df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df = missing_df.loc[missing_df['missing_count'] > 0]
missing_df = missing_df.sort_values(by='missing_count', ascending=False)
missing_df['missing_ratio'] = missing_df['missing_count'] / len(prop_df)
display(missing_df)


column_name missing_count missing_ratio
41 storytypeid 2983593 0.999456
3 basementsqft 2983589 0.999455
46 yardbuildingsqft26 2982570 0.999113
49 fireplaceflag 2980054 0.998270
2 architecturalstyletypeid 2979156 0.997970
43 typeconstructiontypeid 2978470 0.997740
13 finishedsquarefeet13 2977545 0.997430
6 buildingclasstypeid 2972588 0.995769
9 decktypeid 2968121 0.994273
16 finishedsquarefeet6 2963216 0.992630
28 poolsizesum 2957257 0.990634
30 pooltypeid2 2953142 0.989255
29 pooltypeid10 2948278 0.987626
55 taxdelinquencyflag 2928755 0.981086
56 taxdelinquencyyear 2928753 0.981085
22 hashottuborspa 2916203 0.976881
45 yardbuildingsqft17 2904862 0.973082
14 finishedsquarefeet15 2794419 0.936086
15 finishedsquarefeet50 2782500 0.932093
10 finishedfloor1squarefeet 2782500 0.932093
42 threequarterbathnbr 2673586 0.895609
18 fireplacecnt 2672580 0.895272
31 pooltypeid7 2499758 0.837379
27 poolcnt 2467683 0.826634
48 numberofstories 2303148 0.771518
1 airconditioningtypeid 2173698 0.728154
21 garagetotalsqft 2101950 0.704120
20 garagecarcnt 2101950 0.704120
38 regionidneighborhood 1828815 0.612624
23 heatingorsystemtypeid 1178816 0.394885
7 buildingqualitytypeid 1046729 0.350637
44 unitcnt 1007727 0.337572
34 propertyzoningdesc 1006588 0.337191
26 lotsizesquarefeet 276099 0.092489
12 finishedsquarefeet12 276033 0.092467
8 calculatedbathnbr 128912 0.043183
19 fullbathcnt 128912 0.043183
57 censustractandblock 75126 0.025166
53 landtaxvaluedollarcnt 67733 0.022689
36 regionidcity 62845 0.021052
47 yearbuilt 59928 0.020075
11 calculatedfinishedsquarefeet 55565 0.018613
50 structuretaxvaluedollarcnt 54982 0.018418
51 taxvaluedollarcnt 42550 0.014254
54 taxamount 31250 0.010468
39 regionidzip 13980 0.004683
32 propertycountylandusecode 12277 0.004113
40 roomcnt 11475 0.003844
4 bathroomcnt 11462 0.003840
5 bedroomcnt 11450 0.003836
52 assessmentyear 11439 0.003832
33 propertylandusetypeid 11437 0.003831
24 latitude 11437 0.003831
35 rawcensustractandblock 11437 0.003831
17 fips 11437 0.003831
25 longitude 11437 0.003831
37 regionidcounty 11437 0.003831

In [73]:
print('Creating training set ...')

df_train = train_df.merge(prop_df, how='left', on='parcelid')
df_train.head()


Creating training set ...
Out[73]:
parcelid logerror transactiondate airconditioningtypeid architecturalstyletypeid basementsqft bathroomcnt bedroomcnt buildingclasstypeid buildingqualitytypeid calculatedbathnbr decktypeid finishedfloor1squarefeet calculatedfinishedsquarefeet finishedsquarefeet12 finishedsquarefeet13 finishedsquarefeet15 finishedsquarefeet50 finishedsquarefeet6 fips fireplacecnt fullbathcnt garagecarcnt garagetotalsqft hashottuborspa heatingorsystemtypeid latitude longitude lotsizesquarefeet poolcnt poolsizesum pooltypeid10 pooltypeid2 pooltypeid7 propertycountylandusecode propertylandusetypeid propertyzoningdesc rawcensustractandblock regionidcity regionidcounty regionidneighborhood regionidzip roomcnt storytypeid threequarterbathnbr typeconstructiontypeid unitcnt yardbuildingsqft17 yardbuildingsqft26 yearbuilt numberofstories fireplaceflag structuretaxvaluedollarcnt taxvaluedollarcnt assessmentyear landtaxvaluedollarcnt taxamount taxdelinquencyflag taxdelinquencyyear censustractandblock
0 11016594 0.0276 2016-01-01 1.0 NaN NaN 2.0 3.0 NaN 4.0 2.0 NaN NaN 1684.0 1684.0 NaN NaN NaN NaN 6037.0 NaN 2.0 NaN NaN NaN 2.0 34280992.0 -118488536.0 7528.0 NaN NaN NaN NaN NaN 0100 261.0 LARS 60371068.0 12447.0 3101.0 31817.0 96370.0 0.0 NaN NaN NaN 1.0 NaN NaN 1959.0 NaN NaN 122754.0 360170.0 2015.0 237416.0 6735.879883 NaN NaN 6.037107e+13
1 14366692 -0.1684 2016-01-01 NaN NaN NaN 3.5 4.0 NaN NaN 3.5 NaN NaN 2263.0 2263.0 NaN NaN NaN NaN 6059.0 NaN 3.0 2.0 468.0 NaN NaN 33668120.0 -117677552.0 3643.0 NaN NaN NaN NaN NaN 1 261.0 NaN 60590524.0 32380.0 1286.0 NaN 96962.0 0.0 NaN 1.0 NaN NaN NaN NaN 2014.0 NaN NaN 346458.0 585529.0 2015.0 239071.0 10153.019531 NaN NaN NaN
2 12098116 -0.0040 2016-01-01 1.0 NaN NaN 3.0 2.0 NaN 4.0 3.0 NaN NaN 2217.0 2217.0 NaN NaN NaN NaN 6037.0 NaN 3.0 NaN NaN NaN 2.0 34136312.0 -118175032.0 11423.0 NaN NaN NaN NaN NaN 0100 261.0 PSR6 60374640.0 47019.0 3101.0 275411.0 96293.0 0.0 NaN NaN NaN 1.0 NaN NaN 1940.0 NaN NaN 61994.0 119906.0 2015.0 57912.0 11484.480469 NaN NaN 6.037464e+13
3 12643413 0.0218 2016-01-02 1.0 NaN NaN 2.0 2.0 NaN 4.0 2.0 NaN NaN 839.0 839.0 NaN NaN NaN NaN 6037.0 NaN 2.0 NaN NaN NaN 2.0 33755800.0 -118309000.0 70859.0 NaN NaN NaN NaN NaN 010C 266.0 LAR3 60372964.0 12447.0 3101.0 54300.0 96222.0 0.0 NaN NaN NaN 1.0 NaN NaN 1987.0 NaN NaN 171518.0 244880.0 2015.0 73362.0 3048.739990 NaN NaN 6.037296e+13
4 14432541 -0.0050 2016-01-02 NaN NaN NaN 2.5 4.0 NaN NaN 2.5 NaN NaN 2283.0 2283.0 NaN NaN NaN NaN 6059.0 NaN 2.0 2.0 598.0 NaN NaN 33485644.0 -117700232.0 6000.0 1.0 NaN NaN NaN 1.0 122 261.0 NaN 60590424.0 17686.0 1286.0 NaN 96961.0 8.0 NaN 1.0 NaN NaN NaN NaN 1981.0 2.0 NaN 169574.0 434551.0 2015.0 264977.0 5488.959961 NaN NaN 6.059042e+13

In [74]:
pd.options.display.max_rows = 65
dtype_df = df_train.dtypes.reset_index()
dtype_df.columns = ['Count', 'Column Type']
dtype_df.groupby('Column Type').aggregate('count').reset_index()


Out[74]:
Column Type Count
0 int64 1
1 float64 1
2 datetime64[ns] 1
3 float32 52
4 object 5

In [75]:
# fillna with mean value
mean_values = df_train.mean(axis=0)
# df_train = df_train.fillna(mean_values) # not good choice? LB increased to 0.09!

In [76]:
missing_df = df_train.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['missing_ratio'] = missing_df['missing_count'] / df_train.shape[0]
missing_df.loc[missing_df['missing_ratio'] > 0]['column_name']


Out[76]:
3            airconditioningtypeid
4         architecturalstyletypeid
5                     basementsqft
8              buildingclasstypeid
9            buildingqualitytypeid
10               calculatedbathnbr
11                      decktypeid
12        finishedfloor1squarefeet
13    calculatedfinishedsquarefeet
14            finishedsquarefeet12
15            finishedsquarefeet13
16            finishedsquarefeet15
17            finishedsquarefeet50
18             finishedsquarefeet6
20                    fireplacecnt
21                     fullbathcnt
22                    garagecarcnt
23                 garagetotalsqft
24                  hashottuborspa
25           heatingorsystemtypeid
28               lotsizesquarefeet
29                         poolcnt
30                     poolsizesum
31                    pooltypeid10
32                     pooltypeid2
33                     pooltypeid7
34       propertycountylandusecode
36              propertyzoningdesc
38                    regionidcity
40            regionidneighborhood
41                     regionidzip
43                     storytypeid
44             threequarterbathnbr
45          typeconstructiontypeid
46                         unitcnt
47              yardbuildingsqft17
48              yardbuildingsqft26
49                       yearbuilt
50                 numberofstories
51                   fireplaceflag
52      structuretaxvaluedollarcnt
53               taxvaluedollarcnt
55           landtaxvaluedollarcnt
56                       taxamount
57              taxdelinquencyflag
58              taxdelinquencyyear
59             censustractandblock
Name: column_name, dtype: object

In [77]:
print(df_train['taxdelinquencyflag'].value_counts())
df_train['taxdelinquencyflag'] = df_train['taxdelinquencyflag'] \
                .apply(lambda x: True if x == True or x == 'Y' else False)
print(df_train['taxdelinquencyflag'].value_counts())


Y    1783
Name: taxdelinquencyflag, dtype: int64
False    88492
True      1783
Name: taxdelinquencyflag, dtype: int64

In [78]:
# df_train = df_train.fillna('UNKNOWN')

In [79]:
convert_outlier_value(df_train, 'finishedsquarefeet12', lower_percentile=0.5, upper_percentile=99.5)


upper limit : nan , lower limit : nan
/home/voyageth/develop/anaconda3/envs/kaggle/lib/python3.6/site-packages/numpy/lib/function_base.py:4269: RuntimeWarning: Invalid value encountered in percentile
  interpolation=interpolation)

In [80]:
convert_outlier_value(df_train, 'calculatedfinishedsquarefeet', lower_percentile=0.5, upper_percentile=99.5)


upper limit : nan , lower limit : nan
/home/voyageth/develop/anaconda3/envs/kaggle/lib/python3.6/site-packages/numpy/lib/function_base.py:4269: RuntimeWarning: Invalid value encountered in percentile
  interpolation=interpolation)

In [81]:
convert_outlier_value(df_train, 'taxamount', lower_percentile=0.5, upper_percentile=99.5)


upper limit : nan , lower limit : nan
/home/voyageth/develop/anaconda3/envs/kaggle/lib/python3.6/site-packages/numpy/lib/function_base.py:4269: RuntimeWarning: Invalid value encountered in percentile
  interpolation=interpolation)

In [82]:
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 
                         'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)


(90275, 55) (90275,)
Building DMatrix...

In [83]:
print('Training ...')

xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'silent': 1
}

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(xgb_params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)


Training ...
[0]	train-mae:0.479881	valid-mae:0.472958
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.335647	valid-mae:0.328957
[20]	train-mae:0.238757	valid-mae:0.232561
[30]	train-mae:0.174448	valid-mae:0.168915
[40]	train-mae:0.132411	valid-mae:0.127551
[50]	train-mae:0.105671	valid-mae:0.101447
[60]	train-mae:0.089278	valid-mae:0.085609
[70]	train-mae:0.079654	valid-mae:0.076619
[80]	train-mae:0.07424	valid-mae:0.071776
[90]	train-mae:0.071273	valid-mae:0.069254
[100]	train-mae:0.069625	valid-mae:0.067984
[110]	train-mae:0.068702	valid-mae:0.067345
[120]	train-mae:0.06815	valid-mae:0.067025
[130]	train-mae:0.067833	valid-mae:0.066897
[140]	train-mae:0.067624	valid-mae:0.066832
[150]	train-mae:0.067475	valid-mae:0.066803
[160]	train-mae:0.067374	valid-mae:0.066801
[170]	train-mae:0.06729	valid-mae:0.066816
[180]	train-mae:0.067218	valid-mae:0.066836
[190]	train-mae:0.067159	valid-mae:0.06686
[200]	train-mae:0.067096	valid-mae:0.066895
[210]	train-mae:0.067031	valid-mae:0.066918
[220]	train-mae:0.066978	valid-mae:0.066949
[230]	train-mae:0.066913	valid-mae:0.066983
[240]	train-mae:0.066855	valid-mae:0.066991
[250]	train-mae:0.066793	valid-mae:0.067009
Stopping. Best iteration:
[153]	train-mae:0.067436	valid-mae:0.066792


In [84]:
print('Building test set ...')

sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop_df, on='parcelid', how='left')

x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

d_test = xgb.DMatrix(x_test)
print('Predicting on test ...')

p_test = clf.predict(d_test)

sub = pd.read_csv('input/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

print('Writing csv ...')
sub.to_csv('output/xgb_starter.csv', index=False, float_format='%.4f') # Thanks to @inversion
print('Write finished')


Building test set ...
Predicting on test ...
Writing csv ...
Write finished

In [ ]: