In [120]:
# this an adaptation of the kernel created by meikegw. Mainly just added some more verbose explanation
# and were functions ('def') are used, alternatives where provided for who to do the same without
# creating the functions. //Special Thanks to Meikegw!

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

      #from subprocess import check_output
      #print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

In [121]:
import os
os.getcwd()


Out[121]:
'/Users/steven/Documents/Kaggle/Houses'

In [122]:
houseprice = pd.read_csv('/Users/steven/Documents/Kaggle/Houses/data/train.csv')
# or ...pd.read_csv('../input/train.csv')

#pandas.set_option('display.max_columns', None)

In [123]:
houseprice.head(3)


Out[123]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500

3 rows × 81 columns


In [124]:
# To check how many columns have missing values:
def show_missing():
    missing = houseprice.columns[houseprice.isnull().any()].tolist()
    return missing
#thank you @meikegw

# This can also be done without creating a function, but this is somewhat less 'pretty
# by using the follwing:
# houseprice[houseprice.columns[houseprice.isnull().any()].tolist()].isnull().sum()

In [125]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.
houseprice[show_missing()].isnull().sum()


Out[125]:
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [126]:
# check correlation with LotArea
houseprice['LotFrontage'].corr(houseprice['LotArea'])


Out[126]:
0.42609501877180816

In [127]:
# improvement - and good enough for now
houseprice['SqrtLotArea']=np.sqrt(houseprice['LotArea'])
houseprice['LotFrontage'].corr(houseprice['SqrtLotArea'])


Out[127]:
0.6020022167939364

In [128]:
# Looking at categorical values
def cat_exploration(column):
    return houseprice[column].value_counts()

In [129]:
# Imputing the missing values
def cat_imputation(column, value):
    houseprice.loc[houseprice[column].isnull(),column] = value

In [130]:
houseprice.head(10)


Out[130]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice SqrtLotArea
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... NaN NaN NaN 0 2 2008 WD Normal 208500 91.923882
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... NaN NaN NaN 0 5 2007 WD Normal 181500 97.979590
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... NaN NaN NaN 0 9 2008 WD Normal 223500 106.066017
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... NaN NaN NaN 0 2 2006 WD Abnorml 140000 97.724101
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... NaN NaN NaN 0 12 2008 WD Normal 250000 119.415242
5 6 50 RL 85.0 14115 Pave NaN IR1 Lvl AllPub ... NaN MnPrv Shed 700 10 2009 WD Normal 143000 118.806565
6 7 20 RL 75.0 10084 Pave NaN Reg Lvl AllPub ... NaN NaN NaN 0 8 2007 WD Normal 307000 100.419122
7 8 60 RL NaN 10382 Pave NaN IR1 Lvl AllPub ... NaN NaN Shed 350 11 2009 WD Normal 200000 101.892100
8 9 50 RM 51.0 6120 Pave NaN Reg Lvl AllPub ... NaN NaN NaN 0 4 2008 WD Abnorml 129900 78.230429
9 10 190 RL 50.0 7420 Pave NaN Reg Lvl AllPub ... NaN NaN NaN 0 1 2008 WD Normal 118000 86.139422

10 rows × 82 columns


In [131]:
# Saeborn for visualisations, pylab inline make them within this notebook window.
import seaborn as sns
%pylab inline


Populating the interactive namespace from numpy and matplotlib
/Users/steven/anaconda/lib/python3.6/site-packages/IPython/core/magics/pylab.py:161: UserWarning: pylab import has clobbered these variables: ['cond']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"

In [132]:
# pairplot is good for visualising small amount of variables
# Keep in mind when chosing pairplot; amount of plots is exponential, 2 vars is 2^2, 
# for 10 vars is is 10^2, etc..
sns.pairplot(houseprice[['LotFrontage','SqrtLotArea']].dropna())


Out[132]:
<seaborn.axisgrid.PairGrid at 0x117c95940>

In [133]:
# take the cells with empty values in LotFrontage
cond = houseprice['LotFrontage'].isnull()

In [134]:
#replace those ecells with values from the correlated SqrtLotArea
houseprice.LotFrontage[cond] = houseprice.SqrtLotArea[cond]


/Users/steven/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  

In [135]:
houseprice.head(8)


Out[135]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice SqrtLotArea
0 1 60 RL 65.0000 8450 Pave NaN Reg Lvl AllPub ... NaN NaN NaN 0 2 2008 WD Normal 208500 91.923882
1 2 20 RL 80.0000 9600 Pave NaN Reg Lvl AllPub ... NaN NaN NaN 0 5 2007 WD Normal 181500 97.979590
2 3 60 RL 68.0000 11250 Pave NaN IR1 Lvl AllPub ... NaN NaN NaN 0 9 2008 WD Normal 223500 106.066017
3 4 70 RL 60.0000 9550 Pave NaN IR1 Lvl AllPub ... NaN NaN NaN 0 2 2006 WD Abnorml 140000 97.724101
4 5 60 RL 84.0000 14260 Pave NaN IR1 Lvl AllPub ... NaN NaN NaN 0 12 2008 WD Normal 250000 119.415242
5 6 50 RL 85.0000 14115 Pave NaN IR1 Lvl AllPub ... NaN MnPrv Shed 700 10 2009 WD Normal 143000 118.806565
6 7 20 RL 75.0000 10084 Pave NaN Reg Lvl AllPub ... NaN NaN NaN 0 8 2007 WD Normal 307000 100.419122
7 8 60 RL 101.8921 10382 Pave NaN IR1 Lvl AllPub ... NaN NaN Shed 350 11 2009 WD Normal 200000 101.892100

8 rows × 82 columns


In [136]:
#check whether LotFrontage is no longer in list of missing values
houseprice[show_missing()].isnull().sum()


Out[136]:
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [137]:
cat_exploration('Alley')
# This cat_exploration is possible because we have created this function (def..)
# If you haven't same result can be gotten with te following:

# houseprice['Alley'].value_counts()


Out[137]:
Grvl    50
Pave    41
Name: Alley, dtype: int64

In [138]:
# I assume empty fields here means no alley access
cat_imputation('Alley','None')
# again, this is possible because we have created this function (input 12).
# If we hadn't done this, we would get the same result with the following:

# houseprice.loc[houseprice['Alley'].isnull(),'Alley'] = 'None'

In [139]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.

houseprice[show_missing()].isnull().sum()
# As said before, this can be done without using the created function (def..) 
# by using the following:

# houseprice[houseprice.columns[houseprice.isnull().any()].tolist()].isnull().sum()


Out[139]:
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [140]:
houseprice['MasVnrType'].isnull().sum()


Out[140]:
8

In [141]:
# Is MasVnrArea empty when MasVnrType is empty?
houseprice[['MasVnrType','MasVnrArea']][houseprice['MasVnrType'].isnull()==True]


Out[141]:
MasVnrType MasVnrArea
234 NaN NaN
529 NaN NaN
650 NaN NaN
936 NaN NaN
973 NaN NaN
977 NaN NaN
1243 NaN NaN
1278 NaN NaN

In [143]:
# What do the values look like for MasVnrType?
cat_exploration('MasVnrType')
# or:
# houseprice['MasVnrType'].value_counts()


Out[143]:
None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64

In [145]:
#cat_exploration('MasVnrArea')

In [99]:
# Mostly 'None' so we for now will replace NaN's with None's, 
# and for MasVnrArea replace NaN with zero.
cat_imputation('MasVnrType', 'None')
cat_imputation('MasVnrArea', 0.0)

In [100]:
# A lot of variables which are all basement related. Create group, see group to check whether all are
# Nan and zero together:
basement_cols=['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','BsmtFinSF1','BsmtFinSF2']
houseprice[basement_cols][houseprice['BsmtQual'].isnull()==True]


Out[100]:
BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 BsmtFinSF1 BsmtFinSF2
17 NaN NaN NaN NaN NaN 0 0
39 NaN NaN NaN NaN NaN 0 0
90 NaN NaN NaN NaN NaN 0 0
102 NaN NaN NaN NaN NaN 0 0
156 NaN NaN NaN NaN NaN 0 0
182 NaN NaN NaN NaN NaN 0 0
259 NaN NaN NaN NaN NaN 0 0
342 NaN NaN NaN NaN NaN 0 0
362 NaN NaN NaN NaN NaN 0 0
371 NaN NaN NaN NaN NaN 0 0
392 NaN NaN NaN NaN NaN 0 0
520 NaN NaN NaN NaN NaN 0 0
532 NaN NaN NaN NaN NaN 0 0
533 NaN NaN NaN NaN NaN 0 0
553 NaN NaN NaN NaN NaN 0 0
646 NaN NaN NaN NaN NaN 0 0
705 NaN NaN NaN NaN NaN 0 0
736 NaN NaN NaN NaN NaN 0 0
749 NaN NaN NaN NaN NaN 0 0
778 NaN NaN NaN NaN NaN 0 0
868 NaN NaN NaN NaN NaN 0 0
894 NaN NaN NaN NaN NaN 0 0
897 NaN NaN NaN NaN NaN 0 0
984 NaN NaN NaN NaN NaN 0 0
1000 NaN NaN NaN NaN NaN 0 0
1011 NaN NaN NaN NaN NaN 0 0
1035 NaN NaN NaN NaN NaN 0 0
1045 NaN NaN NaN NaN NaN 0 0
1048 NaN NaN NaN NaN NaN 0 0
1049 NaN NaN NaN NaN NaN 0 0
1090 NaN NaN NaN NaN NaN 0 0
1179 NaN NaN NaN NaN NaN 0 0
1216 NaN NaN NaN NaN NaN 0 0
1218 NaN NaN NaN NaN NaN 0 0
1232 NaN NaN NaN NaN NaN 0 0
1321 NaN NaN NaN NaN NaN 0 0
1412 NaN NaN NaN NaN NaN 0 0

In [101]:
# Little loop here. Its details confused me. Goes through each of column (x) within
# the created basement 'group'. For each column it goes through the cat_imputation funtcion which
# does this: 

# houseprice.loc[houseprice['x'].isnull(),'x'] = 'None'

# The 'FinSF' is to differentiate between the numerical (both contain 'FinSF' in header) 
# and the categorical which already contain zero's. 

for x in basement_cols:
    if 'FinSF'not in x:
        cat_imputation(x,'None')

In [102]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.

houseprice[show_missing()].isnull().sum()


Out[102]:
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [103]:
cat_exploration('Electrical')


Out[103]:
SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64

In [104]:
houseprice['Electrical'].isnull().sum()


Out[104]:
1

In [105]:
# Just one missing, impute most frequent value (SBrkr with 1334 instances)
cat_imputation('Electrical','SBrkr')

In [106]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.

houseprice[show_missing()].isnull().sum()


Out[106]:
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [107]:
cat_exploration('FireplaceQu')


Out[107]:
Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: FireplaceQu, dtype: int64

In [108]:
houseprice['FireplaceQu'].isnull().sum()


Out[108]:
690

In [109]:
#houseprice['Fireplaces'][houseprice['FireplaceQu'].isnull()==True].describe()

#checking whether FireplaceQu might be empty especially when fireplace itself is missing
houseprice[['Fireplaces','FireplaceQu']][houseprice['FireplaceQu'].isnull()==True]


Out[109]:
Fireplaces FireplaceQu
0 0 NaN
5 0 NaN
10 0 NaN
12 0 NaN
15 0 NaN
17 0 NaN
18 0 NaN
19 0 NaN
26 0 NaN
29 0 NaN
30 0 NaN
31 0 NaN
32 0 NaN
36 0 NaN
38 0 NaN
39 0 NaN
42 0 NaN
43 0 NaN
44 0 NaN
47 0 NaN
48 0 NaN
49 0 NaN
50 0 NaN
52 0 NaN
56 0 NaN
57 0 NaN
59 0 NaN
60 0 NaN
61 0 NaN
63 0 NaN
... ... ...
1391 0 NaN
1397 0 NaN
1398 0 NaN
1403 0 NaN
1404 0 NaN
1406 0 NaN
1407 0 NaN
1408 0 NaN
1410 0 NaN
1411 0 NaN
1412 0 NaN
1416 0 NaN
1418 0 NaN
1422 0 NaN
1425 0 NaN
1431 0 NaN
1432 0 NaN
1436 0 NaN
1438 0 NaN
1444 0 NaN
1445 0 NaN
1446 0 NaN
1448 0 NaN
1449 0 NaN
1450 0 NaN
1452 0 NaN
1453 0 NaN
1454 0 NaN
1458 0 NaN
1459 0 NaN

690 rows × 2 columns


In [110]:
#So yes, it seems that indeed FireplaceQu is empty when Fireplaces is missing.
#Assumption therefore will be that Fireplaces is empty because there is no fireplace.
cat_imputation('FireplaceQu','None')

In [111]:
pd.crosstab(houseprice.Fireplaces, houseprice.FireplaceQu)


Out[111]:
FireplaceQu Ex Fa Gd None Po TA
Fireplaces
0 0 0 0 690 0 0
1 19 28 324 0 20 259
2 4 4 54 0 0 53
3 1 1 2 0 0 1

In [112]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.

houseprice[show_missing()].isnull().sum()


Out[112]:
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [113]:
#Same idea as with basement columns.
#garage_cols=['GarageType','GarageQual','GarageCond','GarageYrBlt','GarageFinish','GarageCars','GarageArea']
#houseprice[garage_cols][houseprice['GarageType'].isnull()==True]

# Later on it doesn't recognize garage_cols.. workaround:
houseprice[['GarageType','GarageQual','GarageCond','GarageYrBlt','GarageFinish','GarageCars','GarageArea']][houseprice['GarageType'].isnull()==True]


Out[113]:
GarageType GarageQual GarageCond GarageYrBlt GarageFinish GarageCars GarageArea
39 NaN NaN NaN NaN NaN 0 0
48 NaN NaN NaN NaN NaN 0 0
78 NaN NaN NaN NaN NaN 0 0
88 NaN NaN NaN NaN NaN 0 0
89 NaN NaN NaN NaN NaN 0 0
99 NaN NaN NaN NaN NaN 0 0
108 NaN NaN NaN NaN NaN 0 0
125 NaN NaN NaN NaN NaN 0 0
127 NaN NaN NaN NaN NaN 0 0
140 NaN NaN NaN NaN NaN 0 0
148 NaN NaN NaN NaN NaN 0 0
155 NaN NaN NaN NaN NaN 0 0
163 NaN NaN NaN NaN NaN 0 0
165 NaN NaN NaN NaN NaN 0 0
198 NaN NaN NaN NaN NaN 0 0
210 NaN NaN NaN NaN NaN 0 0
241 NaN NaN NaN NaN NaN 0 0
250 NaN NaN NaN NaN NaN 0 0
287 NaN NaN NaN NaN NaN 0 0
291 NaN NaN NaN NaN NaN 0 0
307 NaN NaN NaN NaN NaN 0 0
375 NaN NaN NaN NaN NaN 0 0
386 NaN NaN NaN NaN NaN 0 0
393 NaN NaN NaN NaN NaN 0 0
431 NaN NaN NaN NaN NaN 0 0
434 NaN NaN NaN NaN NaN 0 0
441 NaN NaN NaN NaN NaN 0 0
464 NaN NaN NaN NaN NaN 0 0
495 NaN NaN NaN NaN NaN 0 0
520 NaN NaN NaN NaN NaN 0 0
... ... ... ... ... ... ... ...
954 NaN NaN NaN NaN NaN 0 0
960 NaN NaN NaN NaN NaN 0 0
968 NaN NaN NaN NaN NaN 0 0
970 NaN NaN NaN NaN NaN 0 0
976 NaN NaN NaN NaN NaN 0 0
1009 NaN NaN NaN NaN NaN 0 0
1011 NaN NaN NaN NaN NaN 0 0
1030 NaN NaN NaN NaN NaN 0 0
1038 NaN NaN NaN NaN NaN 0 0
1096 NaN NaN NaN NaN NaN 0 0
1123 NaN NaN NaN NaN NaN 0 0
1131 NaN NaN NaN NaN NaN 0 0
1137 NaN NaN NaN NaN NaN 0 0
1143 NaN NaN NaN NaN NaN 0 0
1173 NaN NaN NaN NaN NaN 0 0
1179 NaN NaN NaN NaN NaN 0 0
1218 NaN NaN NaN NaN NaN 0 0
1219 NaN NaN NaN NaN NaN 0 0
1234 NaN NaN NaN NaN NaN 0 0
1257 NaN NaN NaN NaN NaN 0 0
1283 NaN NaN NaN NaN NaN 0 0
1323 NaN NaN NaN NaN NaN 0 0
1325 NaN NaN NaN NaN NaN 0 0
1326 NaN NaN NaN NaN NaN 0 0
1337 NaN NaN NaN NaN NaN 0 0
1349 NaN NaN NaN NaN NaN 0 0
1407 NaN NaN NaN NaN NaN 0 0
1449 NaN NaN NaN NaN NaN 0 0
1450 NaN NaN NaN NaN NaN 0 0
1453 NaN NaN NaN NaN NaN 0 0

81 rows × 7 columns


In [116]:
#Garage Imputation
#for x in garage_cols:
#    if houseprice[cols].dtype==np.object:
#        cat_imputation(x,'None')
#    else:
#        cat_imputation(x, 0)
        
# doesn't recognize garage_cols.. or cols..


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-116-3ea2e1bae244> in <module>()
      9 
     10 for x in garage_cols:
---> 11     if houseprice[cols].dtype==np.object:
     12         cat_imputation(x,'None')
     13     else:

NameError: name 'cols' is not defined

In [117]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.

houseprice[show_missing()].isnull().sum()


Out[117]:
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [118]:
cat_exploration('PoolQC')


Out[118]:
Gd    3
Fa    2
Ex    2
Name: PoolQC, dtype: int64

In [59]:
# is Poolarea missing when PoolQC is missing?
houseprice['PoolArea'][houseprice['PoolQC'].isnull()==True].describe()


Out[59]:
count    1453.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Name: PoolArea, dtype: float64

In [60]:
# So, here I am going to delete this feature because Pool itself is so often not present, poolQC
# will not be a good feature for modelling. (this should technically be done only after visualising etc)
del houseprice['PoolQC']

In [61]:
# If you don't want to delete: cat_imputation('PoolQC', 'None')

In [62]:
cat_imputation('Fence', 'None')

In [63]:
cat_imputation('MiscFeature', 'None')

In [64]:
# Let's see how much work there is to be done regarding cleaning up NaN's and missing values
# this bit will come back several times to check out progress.

houseprice[show_missing()].isnull().sum()


Out[64]:
GarageType      81
GarageYrBlt     81
GarageFinish    81
GarageQual      81
GarageCond      81
dtype: int64

In [98]:
#houseprices.to_csv('../pathhere../submission.csv', index=False)

In [99]:
os.getcwd()


Out[99]:
'/kaggle/working'

In [109]:
#houseprice.to_csv('/Users/steven/Documents/Kaggle/cnerwnvew.csv')

In [146]:
houseprice.to_csv('almostcleanhouses.csv')

In [ ]: