Overall



In [1]:

    
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.









    



sample_submission.csv
test.csv
train.csv



In [2]:

    
df = pd.read_csv('../input/train.csv')
#df.drop('SalePrice', axis = 1, inplace = True)
#test = pd.read_csv('../input/test.csv')
#df = df.append(test, ignore_index = True)
df.head()









    Out[2]:






  
    
      
      Id
      MSSubClass
      MSZoning
      LotFrontage
      LotArea
      Street
      Alley
      LotShape
      LandContour
      Utilities
      ...
      PoolArea
      PoolQC
      Fence
      MiscFeature
      MiscVal
      MoSold
      YrSold
      SaleType
      SaleCondition
      SalePrice
    
  
  
    
      0
      1
      60
      RL
      65.0
      8450
      Pave
      NaN
      Reg
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      2
      2008
      WD
      Normal
      208500
    
    
      1
      2
      20
      RL
      80.0
      9600
      Pave
      NaN
      Reg
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      5
      2007
      WD
      Normal
      181500
    
    
      2
      3
      60
      RL
      68.0
      11250
      Pave
      NaN
      IR1
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      9
      2008
      WD
      Normal
      223500
    
    
      3
      4
      70
      RL
      60.0
      9550
      Pave
      NaN
      IR1
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      2
      2006
      WD
      Abnorml
      140000
    
    
      4
      5
      60
      RL
      84.0
      14260
      Pave
      NaN
      IR1
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      12
      2008
      WD
      Normal
      250000
    
  

5 rows × 81 columns



In [3]:

    
df.describe()









    



/opt/conda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)






    Out[3]:






  
    
      
      Id
      MSSubClass
      LotFrontage
      LotArea
      OverallQual
      OverallCond
      YearBuilt
      YearRemodAdd
      MasVnrArea
      BsmtFinSF1
      ...
      WoodDeckSF
      OpenPorchSF
      EnclosedPorch
      3SsnPorch
      ScreenPorch
      PoolArea
      MiscVal
      MoSold
      YrSold
      SalePrice
    
  
  
    
      count
      1460.000000
      1460.000000
      1201.000000
      1460.000000
      1460.000000
      1460.000000
      1460.000000
      1460.000000
      1452.000000
      1460.000000
      ...
      1460.000000
      1460.000000
      1460.000000
      1460.000000
      1460.000000
      1460.000000
      1460.000000
      1460.000000
      1460.000000
      1460.000000
    
    
      mean
      730.500000
      56.897260
      70.049958
      10516.828082
      6.099315
      5.575342
      1971.267808
      1984.865753
      103.685262
      443.639726
      ...
      94.244521
      46.660274
      21.954110
      3.409589
      15.060959
      2.758904
      43.489041
      6.321918
      2007.815753
      180921.195890
    
    
      std
      421.610009
      42.300571
      24.284752
      9981.264932
      1.382997
      1.112799
      30.202904
      20.645407
      181.066207
      456.098091
      ...
      125.338794
      66.256028
      61.119149
      29.317331
      55.757415
      40.177307
      496.123024
      2.703626
      1.328095
      79442.502883
    
    
      min
      1.000000
      20.000000
      21.000000
      1300.000000
      1.000000
      1.000000
      1872.000000
      1950.000000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      1.000000
      2006.000000
      34900.000000
    
    
      25%
      365.750000
      20.000000
      NaN
      7553.500000
      5.000000
      5.000000
      1954.000000
      1967.000000
      NaN
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      5.000000
      2007.000000
      129975.000000
    
    
      50%
      730.500000
      50.000000
      NaN
      9478.500000
      6.000000
      5.000000
      1973.000000
      1994.000000
      NaN
      383.500000
      ...
      0.000000
      25.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      6.000000
      2008.000000
      163000.000000
    
    
      75%
      1095.250000
      70.000000
      NaN
      11601.500000
      7.000000
      6.000000
      2000.000000
      2004.000000
      NaN
      712.250000
      ...
      168.000000
      68.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      8.000000
      2009.000000
      214000.000000
    
    
      max
      1460.000000
      190.000000
      313.000000
      215245.000000
      10.000000
      9.000000
      2010.000000
      2010.000000
      1600.000000
      5644.000000
      ...
      857.000000
      547.000000
      552.000000
      508.000000
      480.000000
      738.000000
      15500.000000
      12.000000
      2010.000000
      755000.000000
    
  

8 rows × 38 columns



In [4]:

    
df.columns









    Out[4]:





Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')



In [5]:

    
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline



In [6]:

    
print("Some Statistics of the Housing Price:\n")
print(df['SalePrice'].describe())
print("\nThe median of the Housing Price is: ", df['SalePrice'].median(axis = 0))









    



Some Statistics of the Housing Price:

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

The median of the Housing Price is:  163000.0



In [7]:

    
sns.distplot(df['SalePrice'], kde = False, color = 'b', hist_kws={'alpha': 0.9})









    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f74cdf82358>

Numerical Features



In [8]:

    
corr = df.select_dtypes(include = ['float64', 'int64']).iloc[:, 1:].corr()
plt.figure(figsize=(12, 12))
sns.heatmap(corr, vmax=1, square=True)









    Out[8]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f74ce002a58>



In [9]:

    
cor_dict = corr['SalePrice'].to_dict()
del cor_dict['SalePrice']
print("List the numerical features decendingly by their correlation with Sale Price:\n")
for ele in sorted(cor_dict.items(), key = lambda x: -abs(x[1])):
    print("{0}: \t{1}".format(*ele))









    



List the numerical features decendingly by their correlation with Sale Price:

OverallQual: 	0.7909816005838047
GrLivArea: 	0.7086244776126511
GarageCars: 	0.640409197258349
GarageArea: 	0.6234314389183598
TotalBsmtSF: 	0.6135805515591944
1stFlrSF: 	0.6058521846919166
FullBath: 	0.5606637627484452
TotRmsAbvGrd: 	0.5337231555820238
YearBuilt: 	0.5228973328794967
YearRemodAdd: 	0.5071009671113867
GarageYrBlt: 	0.48636167748786213
MasVnrArea: 	0.4774930470957107
Fireplaces: 	0.4669288367515242
BsmtFinSF1: 	0.38641980624215627
LotFrontage: 	0.35179909657067854
WoodDeckSF: 	0.32441344456813076
2ndFlrSF: 	0.31933380283206614
OpenPorchSF: 	0.31585622711605577
HalfBath: 	0.2841076755947784
LotArea: 	0.2638433538714063
BsmtFullBath: 	0.22712223313149718
BsmtUnfSF: 	0.214479105546969
BedroomAbvGr: 	0.1682131543007415
KitchenAbvGr: 	-0.1359073708421417
EnclosedPorch: 	-0.12857795792595636
ScreenPorch: 	0.11144657114291048
PoolArea: 	0.09240354949187278
MSSubClass: 	-0.08428413512659523
OverallCond: 	-0.0778558940486776
MoSold: 	0.04643224522381936
3SsnPorch: 	0.04458366533574792
YrSold: 	-0.028922585168730426
LowQualFinSF: 	-0.02560613000068015
MiscVal: 	-0.02118957964030379
BsmtHalfBath: 	-0.016844154297359294
BsmtFinSF2: 	-0.011378121450215216

The housing price correlates strongly with OverallQual, GrLivArea(GarageCars), GargeArea, TotalBsmtSF, 1stFlrSF, FullBath, TotRmsAbvGrd, YearBuilt, YearRemodAdd, GargeYrBlt, MasVnrArea and Fireplaces. But some of those features are highly correlated among each others.



In [10]:

    
sns.regplot(x = 'OverallQual', y = 'SalePrice', data = df, color = 'Orange')









    Out[10]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f74cb061ba8>



In [11]:

    
plt.figure(1)
f, axarr = plt.subplots(3, 2, figsize=(10, 9))
price = df.SalePrice.values
axarr[0, 0].scatter(df.GrLivArea.values, price)
axarr[0, 0].set_title('GrLiveArea')
axarr[0, 1].scatter(df.GarageArea.values, price)
axarr[0, 1].set_title('GarageArea')
axarr[1, 0].scatter(df.TotalBsmtSF.values, price)
axarr[1, 0].set_title('TotalBsmtSF')
axarr[1, 1].scatter(df['1stFlrSF'].values, price)
axarr[1, 1].set_title('1stFlrSF')
axarr[2, 0].scatter(df.TotRmsAbvGrd.values, price)
axarr[2, 0].set_title('TotRmsAbvGrd')
axarr[2, 1].scatter(df.MasVnrArea.values, price)
axarr[2, 1].set_title('MasVnrArea')
f.text(-0.01, 0.5, 'Sale Price', va='center', rotation='vertical', fontsize = 12)
plt.tight_layout()
plt.show()









    





<matplotlib.figure.Figure at 0x7f74cb06e7f0>



In [12]:

    
fig = plt.figure(2, figsize=(9, 7))
plt.subplot(211)
plt.scatter(df.YearBuilt.values, price)
plt.title('YearBuilt')

plt.subplot(212)
plt.scatter(df.YearRemodAdd.values, price)
plt.title('YearRemodAdd')

fig.text(-0.01, 0.5, 'Sale Price', va = 'center', rotation = 'vertical', fontsize = 12)

plt.tight_layout()

Categorical Features



In [13]:

    
print(df.select_dtypes(include=['object']).columns.values)









    



['MSZoning' 'Street' 'Alley' 'LotShape' 'LandContour' 'Utilities'
 'LotConfig' 'LandSlope' 'Neighborhood' 'Condition1' 'Condition2'
 'BldgType' 'HouseStyle' 'RoofStyle' 'RoofMatl' 'Exterior1st' 'Exterior2nd'
 'MasVnrType' 'ExterQual' 'ExterCond' 'Foundation' 'BsmtQual' 'BsmtCond'
 'BsmtExposure' 'BsmtFinType1' 'BsmtFinType2' 'Heating' 'HeatingQC'
 'CentralAir' 'Electrical' 'KitchenQual' 'Functional' 'FireplaceQu'
 'GarageType' 'GarageFinish' 'GarageQual' 'GarageCond' 'PavedDrive'
 'PoolQC' 'Fence' 'MiscFeature' 'SaleType' 'SaleCondition']

Neighborhood



In [14]:

    
plt.figure(figsize = (12, 6))
sns.boxplot(x = 'Neighborhood', y = 'SalePrice',  data = df)
xt = plt.xticks(rotation=45)



In [15]:

    
plt.figure(figsize = (12, 6))
sns.countplot(x = 'Neighborhood', data = df)
xt = plt.xticks(rotation=45)

Could group those Neighborhoods with similar housing price into a same bucket for dimension-reduction.

Housing Price vs Sales

Sale Type & Condition
Sales Seasonality



In [16]:

    
fig, ax = plt.subplots(2, 1, figsize = (10, 6))
sns.boxplot(x = 'SaleType', y = 'SalePrice', data = df, ax = ax[0])
sns.boxplot(x = 'SaleCondition', y = 'SalePrice', data = df, ax = ax[1])
plt.tight_layout()



In [17]:

    
g = sns.FacetGrid(df, col = 'YrSold', col_wrap = 3)
g.map(sns.boxplot, 'MoSold', 'SalePrice', palette='Set2', order = range(1, 13))\
.set(ylim = (0, 500000))
plt.tight_layout()

Sale's timing does not seem to hugely affect the house.

Housing Style



In [18]:

    
fig, ax = plt.subplots(2, 1, figsize = (10, 8))
sns.boxplot(x = 'BldgType', y = 'SalePrice', data = df, ax = ax[0])
sns.boxplot(x = 'HouseStyle', y = 'SalePrice', data = df, ax = ax[1])









    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f74a92c1198>

Housing Condition



In [19]:

    
fig, ax = plt.subplots(2, 1, figsize = (10, 8))
sns.boxplot(x = 'Condition1', y = 'SalePrice', data = df, ax = ax[0])
sns.boxplot(x = 'Exterior1st', y = 'SalePrice', data = df, ax = ax[1])
x = plt.xticks(rotation = 45)
plt.show()

Basement Conditions



In [20]:

    
fig, ax = plt.subplots(2, 2, figsize = (10, 8))
sns.boxplot('BsmtCond', 'SalePrice', data = df, ax = ax[0, 0])
sns.boxplot('BsmtQual', 'SalePrice', data = df, ax = ax[0, 1])
sns.boxplot('BsmtExposure', 'SalePrice', data = df, ax = ax[1, 0])
sns.boxplot('BsmtFinType1', 'SalePrice', data = df, ax = ax[1, 1])









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f74a8ff4ef0>

Home Functionality



In [21]:

    
sns.violinplot('Functional', 'SalePrice', data = df)









    Out[21]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f74a908a550>

FirePlaceQu



In [22]:

    
sns.factorplot('FireplaceQu', 'SalePrice', data = df, color = 'm', \
               estimator = np.median, order = ['Ex', 'Gd', 'TA', 'Fa', 'Po'], size = 4.5,  aspect=1.35)









    Out[22]:





<seaborn.axisgrid.FacetGrid at 0x7f74a8dd5fd0>



In [23]:

    
pd.crosstab(df.Fireplaces, df.FireplaceQu)









    Out[23]:






  
    
      FireplaceQu
      Ex
      Fa
      Gd
      Po
      TA
    
    
      Fireplaces
      
      
      
      
      
    
  
  
    
      1
      19
      28
      324
      20
      259
    
    
      2
      4
      4
      54
      0
      53
    
    
      3
      1
      1
      2
      0
      1



In [24]:

    
g = sns.FacetGrid(df, col = 'FireplaceQu', col_wrap = 3, col_order=['Ex', 'Gd', 'TA', 'Fa', 'Po'])
g.map(sns.boxplot, 'Fireplaces', 'SalePrice', order = [1, 2, 3], palette = 'Set2')









    Out[24]:





<seaborn.axisgrid.FacetGrid at 0x7f74a8f8d2e8>

Heating

Ames is a cold place in winter, so heating (as well as fireplace qualities) are quite important.



In [25]:

    
pd.crosstab(df.HeatingQC, df.CentralAir)



In [26]:

    
pd.crosstab(df.HeatingQC, df.FireplaceQu)









    Out[26]:






  
    
      FireplaceQu
      Ex
      Fa
      Gd
      Po
      TA
    
    
      HeatingQC
      
      
      
      
      
    
  
  
    
      Ex
      22
      14
      254
      4
      160
    
    
      Fa
      0
      1
      13
      1
      5
    
    
      Gd
      2
      3
      45
      5
      57
    
    
      TA
      0
      15
      68
      10
      91



In [27]:

    
sns.factorplot('HeatingQC', 'SalePrice', hue = 'CentralAir', estimator = np.mean, data = df, 
             size = 4.5, aspect = 1.4)









    Out[27]:





<seaborn.axisgrid.FacetGrid at 0x7f74a8958cc0>

Clearly, having AC or not has a big impact on housing price.



In [28]:

    
fig, ax = plt.subplots(1, 2, figsize = (10, 4))
sns.boxplot('Electrical', 'SalePrice', data = df, ax = ax[0]).set(ylim = (0, 400000))
sns.countplot('Electrical', data = df)
plt.tight_layout()

Kitchen Quality



In [29]:

    
sns.factorplot('KitchenQual', 'SalePrice', estimator = np.mean, 
               size = 4.5, aspect = 1.4, data = df, order = ['Ex', 'Gd', 'TA', 'Fa'])









    Out[29]:





<seaborn.axisgrid.FacetGrid at 0x7f74a88bc710>

MSZonig



In [30]:

    
sns.boxplot(x = 'MSZoning', y = 'SalePrice', data = df)









    Out[30]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f74a8752cc0>

Street & Alley Access



In [31]:

    
fig, ax = plt.subplots(1, 2, figsize = (10, 4))
sns.boxplot(x = 'Street', y = 'SalePrice', data = df, ax = ax[0])
sns.boxplot(x = 'Alley', y = 'SalePrice', data = df, ax = ax[1])
plt.tight_layout()



In [32]:

    
print("The NA's in Alley is: ", df['Alley'].isnull().sum())
print("\nThere are so many NA's in Alley. When Alley is NA, Street = ", 
      df[df.Alley.notnull()].Street.unique())
print("\n", pd.crosstab(df.Street, df.Alley))









    



The NA's in Alley is:  1369

There are so many NA's in Alley. When Alley is NA, Street =  ['Pave']

 Alley   Grvl  Pave
Street            
Pave      50    41



In [33]:

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolQC	Fence	MiscFeature	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	2	2008	WD	Normal	208500
1	2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	5	2007	WD	Normal	181500
2	3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	9	2008	WD	Normal	223500
3	4	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	2	2006	WD	Abnorml	140000
4	5	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	12	2008	WD	Normal	250000

	Id	MSSubClass	LotFrontage	LotArea	OverallQual	OverallCond	YearBuilt	YearRemodAdd	MasVnrArea	BsmtFinSF1	...	WoodDeckSF	OpenPorchSF	EnclosedPorch	3SsnPorch	ScreenPorch	PoolArea	MiscVal	MoSold	YrSold	SalePrice
count	1460.000000	1460.000000	1201.000000	1460.000000	1460.000000	1460.000000	1460.000000	1460.000000	1452.000000	1460.000000	...	1460.000000	1460.000000	1460.000000	1460.000000	1460.000000	1460.000000	1460.000000	1460.000000	1460.000000	1460.000000
mean	730.500000	56.897260	70.049958	10516.828082	6.099315	5.575342	1971.267808	1984.865753	103.685262	443.639726	...	94.244521	46.660274	21.954110	3.409589	15.060959	2.758904	43.489041	6.321918	2007.815753	180921.195890
std	421.610009	42.300571	24.284752	9981.264932	1.382997	1.112799	30.202904	20.645407	181.066207	456.098091	...	125.338794	66.256028	61.119149	29.317331	55.757415	40.177307	496.123024	2.703626	1.328095	79442.502883
min	1.000000	20.000000	21.000000	1300.000000	1.000000	1.000000	1872.000000	1950.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	2006.000000	34900.000000
25%	365.750000	20.000000	NaN	7553.500000	5.000000	5.000000	1954.000000	1967.000000	NaN	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	5.000000	2007.000000	129975.000000
50%	730.500000	50.000000	NaN	9478.500000	6.000000	5.000000	1973.000000	1994.000000	NaN	383.500000	...	0.000000	25.000000	0.000000	0.000000	0.000000	0.000000	0.000000	6.000000	2008.000000	163000.000000
75%	1095.250000	70.000000	NaN	11601.500000	7.000000	6.000000	2000.000000	2004.000000	NaN	712.250000	...	168.000000	68.000000	0.000000	0.000000	0.000000	0.000000	0.000000	8.000000	2009.000000	214000.000000
max	1460.000000	190.000000	313.000000	215245.000000	10.000000	9.000000	2010.000000	2010.000000	1600.000000	5644.000000	...	857.000000	547.000000	552.000000	508.000000	480.000000	738.000000	15500.000000	12.000000	2010.000000	755000.000000