In [ ]:
# Submission history (leaderboard scores)
# 0.0652418 : initial submit
# 0.0649065 : some change?
# 0.0979253 : fillna with mean value, convert_outlier
# 0.0984987 : remove convert_outlier
# 0.0654573 : remove fillna with mean value
# ? : TODO apply convert_outlier to logerror
In [64]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
In [65]:
def draw_scatter_plot(df, col_name):
    """Scatter plot of a column's sorted values against their rank.

    Handy for eyeballing outliers at either tail of the distribution.
    """
    sorted_values = np.sort(df[col_name].values)
    plt.figure(figsize=(8, 6))
    plt.scatter(np.arange(sorted_values.size), sorted_values)
    plt.xlabel('index', fontsize=12)
    plt.ylabel(col_name, fontsize=12)
    plt.show()
def draw_dist_plot(df, col_name):
    """50-bin histogram (no KDE) of a DataFrame column."""
    plt.figure(figsize=(12, 8))
    # NOTE(review): sns.distplot is deprecated in modern seaborn;
    # switch to sns.histplot when upgrading.
    sns.distplot(df[col_name].values, bins=50, kde=False)
    plt.xlabel(col_name, fontsize=12)
    plt.ylabel('count', fontsize=12)
    plt.show()
def draw_count_plot(df, col_name, title='plot'):
    """Bar chart of category frequencies with vertical tick labels."""
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.countplot(data=df, x=col_name, ax=ax)
    ax.tick_params(axis='x', rotation=90)
    ax.set_xlabel(col_name, fontsize=12)
    ax.set_ylabel('count', fontsize=12)
    ax.set_title(title, fontsize=15)
    plt.show()
def draw_box_plot(df, x_col, y_col):
    """Box plot of ``y_col`` grouped by ``x_col``."""
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(data=df, x=x_col, y=y_col, ax=ax)
    ax.set_xlabel(x_col, fontsize=12)
    ax.set_ylabel(y_col, fontsize=12)
    plt.show()
def draw_violin_plot(df, x_col, y_col):
    """Violin plot of ``y_col`` grouped by ``x_col``."""
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.violinplot(data=df, x=x_col, y=y_col, ax=ax)
    ax.set_xlabel(x_col, fontsize=12)
    ax.set_ylabel(y_col, fontsize=12)
    plt.show()
def draw_plots(df, col_name):
    """Convenience wrapper: rank scatter plot followed by the histogram."""
    for plotter in (draw_scatter_plot, draw_dist_plot):
        plotter(df, col_name)
def draw_np_array_scatter_plot(np_array, col_name):
    """Rank-vs-value scatter plot for a raw array (array-based twin of
    draw_scatter_plot, for data not held in a DataFrame)."""
    ordered = np.sort(np_array)
    plt.figure(figsize=(8, 6))
    plt.scatter(np.arange(len(ordered)), ordered)
    plt.xlabel('index', fontsize=12)
    plt.ylabel(col_name, fontsize=12)
    plt.show()
def draw_np_array_dist_plot(np_array, col_name):
    """50-bin histogram (no KDE) for a raw array."""
    plt.figure(figsize=(12, 8))
    # NOTE(review): sns.distplot is deprecated in modern seaborn.
    sns.distplot(np_array, bins=50, kde=False)
    plt.ylabel('count', fontsize=12)
    plt.xlabel(col_name, fontsize=12)
    plt.show()
def convert_outlier_value(df, col_name, upper_percentile=99.0, lower_percentile=1.0):
    """Winsorize a column in place: clamp values beyond the given percentiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    col_name : str
        Column whose extreme values are clamped.
    upper_percentile, lower_percentile : float
        Percentile bounds (0-100); values above/below are replaced with the
        corresponding percentile value.
    """
    np_array = df[col_name].values
    ulimit = np.percentile(np_array, upper_percentile)
    llimit = np.percentile(np_array, lower_percentile)
    print('upper limit :', ulimit, ', lower limit :', llimit)
    # Fix: write via df.loc[row_mask, col] instead of the original chained
    # df[col].loc[mask] = v, which can assign into a temporary object and only
    # appeared to work because chained_assignment warnings are silenced above.
    df.loc[df[col_name] > ulimit, col_name] = ulimit
    df.loc[df[col_name] < llimit, col_name] = llimit
In [66]:
print('Loading data ...')
# Competition inputs, relative to the notebook's working directory:
#   train_2016_v2.csv     - labelled transactions (has 'parcelid', 'logerror',
#                           'transactiondate'; dates parsed up front)
#   properties_2016.csv   - per-parcel features, joined on 'parcelid' below
#   sample_submission.csv - submission template ('ParcelId' + month columns)
train_df = pd.read_csv('input/train_2016_v2.csv', parse_dates=['transactiondate'])
prop_df = pd.read_csv('input/properties_2016.csv')
sample = pd.read_csv('input/sample_submission.csv')
In [67]:
print('Binding to float32')
# Halve the memory footprint of the large properties frame by downcasting
# every float64 column to float32.
for col in prop_df.columns[prop_df.dtypes == np.float64]:
    prop_df[col] = prop_df[col].astype(np.float32)
In [68]:
# Quick sanity check on the raw training table.
# Fix: the original label said 'df_train.shape' but this cell inspects
# train_df; df_train is a different (merged) frame created further down.
print('train_df.shape :', train_df.shape)
display(train_df.head())
In [69]:
# Distribution of the target (logerror) before any outlier handling.
draw_plots(train_df, 'logerror')
In [70]:
# convert_outlier_value(train_df, 'logerror', lower_percentile=1, upper_percentile=99)
In [71]:
# Re-draw logerror; identical to the earlier plot while the outlier
# conversion in the cell above stays commented out.
draw_plots(train_df, 'logerror')
In [72]:
# Per-column missing-value audit of the properties table, worst offenders first.
missing_df = prop_df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df = (
    missing_df[missing_df['missing_count'] > 0]
    .sort_values(by='missing_count', ascending=False)
)
missing_df['missing_ratio'] = missing_df['missing_count'] / len(prop_df)
display(missing_df)
In [73]:
print('Creating training set ...')
# Left-join property features onto the labelled transactions; parcels absent
# from prop_df end up with all-NaN feature columns.
df_train = train_df.merge(prop_df, how='left', on='parcelid')
df_train.head()
Out[73]:
In [74]:
pd.options.display.max_rows = 65
# How many columns there are of each dtype in the merged frame.
dtype_df = df_train.dtypes.reset_index()
# NOTE(review): the first column really holds the column *names*; it is
# labelled 'Count' only because groupby('Column Type').count() below turns it
# into a per-dtype count in the displayed result.
dtype_df.columns = ['Count', 'Column Type']
dtype_df.groupby('Column Type').aggregate('count').reset_index()
Out[74]:
In [75]:
# fillna with mean value
# Column means, retained for an optional mean-imputation experiment.
mean_values = df_train.mean(axis=0)
# Disabled: mean imputation worsened the LB score (~0.065 -> ~0.098, see the
# submit history at the top); XGBoost handles the NaNs natively instead.
# df_train = df_train.fillna(mean_values) # not good choice? LB increased to 0.09!
In [76]:
# Missing-value audit of the merged training frame; the cell's output is the
# list of columns that still contain NaNs.
missing_df = df_train.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['missing_ratio'] = missing_df['missing_count'] / df_train.shape[0]
missing_df.loc[missing_df['missing_ratio'] > 0, 'column_name']
Out[76]:
In [77]:
# Collapse taxdelinquencyflag to a strict boolean; value_counts before/after
# show the effect. Entries equal to True or 'Y' become True, everything else
# (including NaN) becomes False - same truth table as the original lambda.
print(df_train['taxdelinquencyflag'].value_counts())
df_train['taxdelinquencyflag'] = df_train['taxdelinquencyflag'].isin([True, 'Y'])
print(df_train['taxdelinquencyflag'].value_counts())
In [78]:
# df_train = df_train.fillna('UNKNOWN')
In [79]:
# Clamp extreme values of this square-footage feature at the 0.5/99.5 percentiles.
convert_outlier_value(df_train, 'finishedsquarefeet12', lower_percentile=0.5, upper_percentile=99.5)
In [80]:
# Same 0.5/99.5-percentile clamp for the calculated square-footage feature.
convert_outlier_value(df_train, 'calculatedfinishedsquarefeet', lower_percentile=0.5, upper_percentile=99.5)
In [81]:
# Same 0.5/99.5-percentile clamp for the tax amount.
convert_outlier_value(df_train, 'taxamount', lower_percentile=0.5, upper_percentile=99.5)
In [82]:
# Feature matrix: drop the identifier, the target, the date, and two
# free-text columns XGBoost cannot consume directly.
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate',
                         'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
# Remember the feature columns so the test frame can be aligned later.
train_columns = x_train.columns
# Crude encoding: each remaining object column becomes "value == True".
for obj_col in x_train.select_dtypes(include=['object']).columns:
    x_train[obj_col] = (x_train[obj_col] == True)
# Hold out everything after row 80000 for validation; the right-hand side is
# evaluated in full before the names are rebound, so both slices come from
# the original frame.
split = 80000
x_train, y_train, x_valid, y_valid = (
    x_train.iloc[:split], y_train[:split], x_train.iloc[split:], y_train[split:])
print('Building DMatrix...')
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
In [83]:
print('Training ...')
# XGBoost hyper-parameters: shallow trees with row subsampling plus L2/L1
# regularisation; evaluation uses mean absolute error.
xgb_params = {
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'lambda': 0.8,
    'alpha': 0.4,
    'silent': 1
}
# Train for up to 10000 rounds, stopping once the validation MAE has not
# improved for 100 rounds; progress is printed every 10 rounds.
eval_sets = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(xgb_params, d_train, 10000, eval_sets,
                early_stopping_rounds=100, verbose_eval=10)
In [84]:
print('Building test set ...')
# Duplicate the submission key under the lowercase name the join expects.
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop_df, on='parcelid', how='left')
# Fix: take an explicit .copy() so the boolean conversions below write to an
# independent frame rather than a selection of df_test (the original relied
# on the globally silenced SettingWithCopyWarning).
x_test = df_test[train_columns].copy()
# Same crude object-column encoding as for the training matrix.
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
d_test = xgb.DMatrix(x_test)
print('Predicting on test ...')
p_test = clf.predict(d_test)
# Re-read the pristine template (the in-memory `sample` now carries the extra
# 'parcelid' column) and fill every month column with the same predictions.
sub = pd.read_csv('input/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test
print('Writing csv ...')
sub.to_csv('output/xgb_starter.csv', index=False, float_format='%.4f') # Thanks to @inversion
print('Write finished')
In [ ]: