In [120]:
# This is an adaptation of the kernel created by meikegw. It mainly adds some more verbose explanation,
# and where functions ('def') are used, alternatives are provided showing how to do the same without
# creating the functions. //Special thanks to Meikegw!
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
#from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))
# Any results you write to the current directory are saved as output.
In [121]:
import os
os.getcwd()
Out[121]:
In [122]:
# Locations where the training data may live, tried in order: the author's
# local copy first, then the Kaggle-style relative input directory.
train_csv_candidates = [
    '/Users/steven/Documents/Kaggle/Houses/data/train.csv',
    '../input/train.csv',
]
# Use the first candidate that exists. If none exists, fall back to the first
# entry so that pd.read_csv reports the missing file for that path, exactly as
# the single hard-coded path did before.
train_csv_path = next(
    (path for path in train_csv_candidates if os.path.exists(path)),
    train_csv_candidates[0],
)
houseprice = pd.read_csv(train_csv_path)
# Optional: show every column when displaying wide frames.
#pd.set_option('display.max_columns', None)
In [123]:
houseprice.head(3)
Out[123]:
In [124]:
# To check how many columns have missing values:
def show_missing(df=None):
    """Return the names of the columns that contain at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``houseprice``
        frame is used, so existing ``show_missing()`` calls keep working.

    Returns
    -------
    list
        Column labels, in the frame's column order, for which
        ``isnull().any()`` is true. Empty when nothing is missing.
    """
    if df is None:
        # Resolved at call time, so the result reflects the current state of
        # the global frame after each imputation step.
        df = houseprice
    return df.columns[df.isnull().any()].tolist()
#thank you @meikegw
# This can also be done without creating a function, although that is somewhat less 'pretty',
# by using the following:
# houseprice[houseprice.columns[houseprice.isnull().any()].tolist()].isnull().sum()
In [125]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.
houseprice[show_missing()].isnull().sum()
Out[125]:
In [126]:
# check correlation with LotArea
houseprice['LotFrontage'].corr(houseprice['LotArea'])
Out[126]:
In [127]:
# improvement - and good enough for now
houseprice['SqrtLotArea']=np.sqrt(houseprice['LotArea'])
houseprice['LotFrontage'].corr(houseprice['SqrtLotArea'])
Out[127]:
In [128]:
# Looking at categorical values
def cat_exploration(column, df=None):
    """Count how often each distinct value occurs in one column.

    Parameters
    ----------
    column : str
        Label of the column to summarise.
    df : pandas.DataFrame, optional
        Frame to read from. When omitted, the notebook-level ``houseprice``
        frame is used, so existing one-argument calls keep working.

    Returns
    -------
    pandas.Series
        The result of ``value_counts()`` on that column.
    """
    if df is None:
        df = houseprice
    return df[column].value_counts()
In [129]:
# Imputing the missing values
def cat_imputation(column, value, df=None):
    """Fill the missing entries of one column with a fixed value, in place.

    Parameters
    ----------
    column : str
        Label of the column to fill.
    value : scalar
        Replacement written into every row where the column is null.
    df : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``houseprice``
        frame is modified, so existing two-argument calls keep working.

    Returns
    -------
    None
        The frame is updated in place; rows that already hold a value are
        left untouched.
    """
    if df is None:
        df = houseprice
    # One .loc call with a row mask and a column label writes into the frame
    # itself rather than into a temporary copy.
    df.loc[df[column].isnull(), column] = value
In [130]:
houseprice.head(10)
Out[130]:
In [131]:
# Saeborn for visualisations, pylab inline make them within this notebook window.
import seaborn as sns
%pylab inline
In [132]:
# pairplot is good for visualising a small number of variables.
# Keep in mind when choosing pairplot: the number of plots grows quadratically. 2 vars give 2^2 = 4 plots,
# 10 vars give 10^2 = 100 plots, etc.
sns.pairplot(houseprice[['LotFrontage','SqrtLotArea']].dropna())
Out[132]:
In [133]:
# take the cells with empty values in LotFrontage
cond = houseprice['LotFrontage'].isnull()
In [134]:
# Replace those cells with the values from the correlated SqrtLotArea column.
# A single .loc assignment writes into houseprice itself. The chained form
# houseprice.LotFrontage[cond] = ... assigns through an intermediate Series, which
# triggers SettingWithCopyWarning and leaves the frame unchanged under pandas copy-on-write.
houseprice.loc[cond, 'LotFrontage'] = houseprice.loc[cond, 'SqrtLotArea']
In [135]:
houseprice.head(8)
Out[135]:
In [136]:
#check whether LotFrontage is no longer in list of missing values
houseprice[show_missing()].isnull().sum()
Out[136]:
In [137]:
cat_exploration('Alley')
# This cat_exploration call is possible because we created that function (def ...) earlier.
# If you haven't, the same result can be obtained with the following:
# houseprice['Alley'].value_counts()
Out[137]:
In [138]:
# I assume empty fields here means no alley access
cat_imputation('Alley','None')
# Again, this is possible because we created the cat_imputation function in an earlier cell.
# If we hadn't done this, we would get the same result with the following:
# houseprice.loc[houseprice['Alley'].isnull(),'Alley'] = 'None'
In [139]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.
houseprice[show_missing()].isnull().sum()
# As said before, this can be done without using the created function (def..)
# by using the following:
# houseprice[houseprice.columns[houseprice.isnull().any()].tolist()].isnull().sum()
Out[139]:
In [140]:
houseprice['MasVnrType'].isnull().sum()
Out[140]:
In [141]:
# Is MasVnrArea empty when MasVnrType is empty?
# Show both veneer columns, restricted to the rows that have no MasVnrType.
houseprice.loc[houseprice['MasVnrType'].isnull(), ['MasVnrType', 'MasVnrArea']]
Out[141]:
In [143]:
# What do the values look like for MasVnrType?
cat_exploration('MasVnrType')
# or:
# houseprice['MasVnrType'].value_counts()
Out[143]:
In [145]:
#cat_exploration('MasVnrArea')
In [99]:
# Mostly 'None' so we for now will replace NaN's with None's,
# and for MasVnrArea replace NaN with zero.
cat_imputation('MasVnrType', 'None')
cat_imputation('MasVnrArea', 0.0)
In [100]:
# Many variables are basement related. Collect them in one group, then look at the rows
# where BsmtQual is missing to check whether they are all NaN / zero together:
basement_cols = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'BsmtFinSF1', 'BsmtFinSF2',
]
houseprice.loc[houseprice['BsmtQual'].isnull(), basement_cols]
Out[100]:
In [101]:
# A small loop over every column (x) in the basement 'group' created above.
# Each column that passes the name test goes through the cat_imputation function, which
# does this:
# houseprice.loc[houseprice['x'].isnull(),'x'] = 'None'
# The 'FinSF' test tells the two kinds of column apart: the numerical ones (both have
# 'FinSF' in their header) are skipped, since per the original note they already contain
# zeros, while the categorical ones get 'None'.
for x in basement_cols:
    if 'FinSF' in x:
        continue  # numerical square-footage column: leave as is
    cat_imputation(x, 'None')
In [102]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.
houseprice[show_missing()].isnull().sum()
Out[102]:
In [103]:
cat_exploration('Electrical')
Out[103]:
In [104]:
houseprice['Electrical'].isnull().sum()
Out[104]:
In [105]:
# Just one missing, impute most frequent value (SBrkr with 1334 instances)
cat_imputation('Electrical','SBrkr')
In [106]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.
houseprice[show_missing()].isnull().sum()
Out[106]:
In [107]:
cat_exploration('FireplaceQu')
Out[107]:
In [108]:
houseprice['FireplaceQu'].isnull().sum()
Out[108]:
In [109]:
#houseprice['Fireplaces'][houseprice['FireplaceQu'].isnull()==True].describe()
# Check whether FireplaceQu is empty specifically for the houses without a fireplace:
# show both columns for the rows where FireplaceQu is missing.
houseprice.loc[houseprice['FireplaceQu'].isnull(), ['Fireplaces', 'FireplaceQu']]
Out[109]:
In [110]:
#So yes, it seems that FireplaceQu is indeed empty when Fireplaces records no fireplace.
#The assumption therefore will be that FireplaceQu is empty because there is no fireplace.
cat_imputation('FireplaceQu','None')
In [111]:
pd.crosstab(houseprice.Fireplaces, houseprice.FireplaceQu)
Out[111]:
In [112]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.
houseprice[show_missing()].isnull().sum()
Out[112]:
In [113]:
# Same idea as with the basement columns: show the garage-related columns for the rows
# where GarageType is missing.
# The column list is written inline here; per the author's note, a separate garage_cols
# variable was not recognised later on.
houseprice.loc[
    houseprice['GarageType'].isnull(),
    ['GarageType','GarageQual','GarageCond','GarageYrBlt','GarageFinish','GarageCars','GarageArea'],
]
Out[113]:
In [116]:
# Garage imputation: fill the gaps in every garage-related column.
# The earlier draft of this loop could not run: it tested houseprice[cols] although the
# loop variable is x, and garage_cols itself only existed in commented-out code. The
# list is therefore defined right here, and the dtype test uses pandas' own helper
# instead of the np.object alias (removed from recent NumPy releases).
garage_cols = ['GarageType','GarageQual','GarageCond','GarageYrBlt','GarageFinish','GarageCars','GarageArea']
for x in garage_cols:
    if pd.api.types.is_numeric_dtype(houseprice[x]):
        # numerical column: a missing value becomes 0
        cat_imputation(x, 0)
    else:
        # non-numerical column: a missing value becomes the label 'None'
        # (assumed to mean "no garage", as for the basement columns)
        cat_imputation(x, 'None')
In [117]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.
houseprice[show_missing()].isnull().sum()
Out[117]:
In [118]:
cat_exploration('PoolQC')
Out[118]:
In [59]:
# Is PoolArea missing when PoolQC is missing?
# Summarise PoolArea over only the rows that have no PoolQC value.
houseprice.loc[houseprice['PoolQC'].isnull(), 'PoolArea'].describe()
Out[59]:
In [60]:
# So, here I am going to delete this feature: because a pool itself is so often not present, PoolQC
# will not be a good feature for modelling. (This should technically be done only after visualising etc.)
# drop(..., errors='ignore') rather than `del` keeps the cell re-runnable: running it a
# second time, when the column is already gone, is a no-op instead of a KeyError.
houseprice = houseprice.drop(columns=['PoolQC'], errors='ignore')
In [61]:
# If you don't want to delete: cat_imputation('PoolQC', 'None')
In [62]:
cat_imputation('Fence', 'None')
In [63]:
cat_imputation('MiscFeature', 'None')
In [64]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.
houseprice[show_missing()].isnull().sum()
Out[64]:
In [98]:
#houseprices.to_csv('../pathhere../submission.csv', index=False)
In [99]:
os.getcwd()
Out[99]:
In [109]:
#houseprice.to_csv('/Users/steven/Documents/Kaggle/cnerwnvew.csv')
In [146]:
# Save the partially cleaned frame. index=False keeps the automatic row index out of the
# file, so reading it back with pd.read_csv does not produce an extra unnamed column.
houseprice.to_csv('almostcleanhouses.csv', index=False)
In [ ]: