In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import os
os.chdir("/Users/zhouyu/Documents/Zhou_Yu/DS/kaggle_challenge/House price/")
In [2]:
dataset = pd.read_csv('train.csv')
dataset.head(5)
Out[2]:
In [3]:
testset = pd.read_csv("test.csv")
testset.head(5)
Out[3]:
In [4]:
dataset.info()
In [5]:
dataset.columns[dataset.isnull().sum()>0]
Out[5]:
In [6]:
testset.info()
In [7]:
testset.columns[testset.isnull().sum()>0]
Out[7]:
In [8]:
dataset_exp = dataset.copy()  # create an experimental copy of the training set
testset_exp = testset.copy()  # create an experimental copy of the test set
# split dataset_exp into features and target
feas = dataset_exp.shape[1]
X_train = dataset_exp.iloc[:, :feas-1]  # all columns except the last one (SalePrice)
y_target = dataset_exp['SalePrice']
print "Train data shape is:", X_train.shape
print "target data shape is:", y_target.shape
X = X_train.append(testset_exp, ignore_index=True)  # stack train and test features for joint preprocessing
print "all features shape is:", X.shape
In [9]:
X.info()
In [10]:
X.index.is_unique
Out[10]:
In [11]:
# replace nulls in all object (categorical) columns with a 'Mis' category
cat_cols = []    # all categorical features
cat_n_cols = []  # categorical features that contain null values
n_cols = []      # all numerical features
for i in X.columns:
    if X[i].dtype == 'object':
        if X[i].isnull().sum() > 0:
            cat_n_cols.append(i)
            X[i].fillna("Mis", inplace=True)  # do not forget inplace=True
        cat_cols.append(i)
    else:
        n_cols.append(i)
print "Here are all categorical features: ", '||'.join(cat_cols)
print "Here are all categorical features that contain missing values: ", '||'.join(cat_n_cols)
print "Here are all numerical features: ", '||'.join(n_cols)
In [12]:
n_n_cols = X.columns[X.isnull().sum()>0]
print "Numerical features with missing values:","||".join(n_n_cols)
In [13]:
import seaborn as sns
In [15]:
# visualize the effect of mapping missing values to the 'Mis' category
g = sns.PairGrid(pd.concat([X.iloc[:1460, :], y_target], axis=1), x_vars=cat_n_cols[18:], y_vars="SalePrice")
g = g.map(sns.stripplot)
# possibly delete columns where a large proportion of the values is missing
# (given that those missing values show no obvious correlation with the sale price)
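In [ ]:
# quick sketch: since the categorical nulls above were recoded as "Mis", the
# fraction of "Mis" per column approximates the original missing-value proportion
# and helps decide which of these columns are sparse enough to drop
mis_frac = (X[cat_n_cols] == "Mis").mean().sort_values(ascending=False)
print mis_frac.head(10)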
In [16]:
# PoolQC has a large number of missing values and shows
# no obvious correlation with the sale price,
# so I simply drop it
X.drop(['PoolQC'], axis=1, inplace=True)
In [17]:
X[n_n_cols].info()
In [19]:
g = sns.PairGrid(pd.concat([X.iloc[:1460, :], y_target], axis=1), x_vars=n_n_cols, y_vars="SalePrice", dropna=False)
g = g.map(sns.jointplot)
In [20]:
# fill the remaining (numerical) NA values with each column's median
X.fillna(X.median()[n_n_cols], inplace=True)
Out[20]:
In [21]:
X.head(5)
Out[21]:
In [23]:
import matplotlib.pylab as plt
train_set = pd.concat([X.iloc[:1460, :], y_target], axis=1)
corr = train_set.corr()
sns.set(style="white")
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True  # show only the lower triangle
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, square=True, linewidths=.5,
            cbar_kws={"shrink": .5}, ax=ax)
Out[23]:
In [26]:
num_features = 0
cat_cols.remove('PoolQC')
for i in cat_cols:
    temp = len(X[i].unique())
    num_features += temp
    #print i, "with %d unique values" % temp
    print X[i].value_counts()
    if temp > 10:
        print "feature", i, "has %d unique values" % temp
        #print X[i].value_counts()
print "There are a total of %d features after converting %d categorical features" % (num_features, len(cat_cols))
# one-hot encoding every categorical feature gives ~259 binary features plus ~37 numerical ones,
# which is almost ~20% of the number of training rows
# To reduce the dimensionality:
# 1. match the data with the test set
# 2. eliminate some really sparse features
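In [ ]:
# alternative sketch (not used below): pandas' get_dummies one-hot encodes the
# combined train/test frame in a single call, which avoids mismatched category
# levels between train and test
X_dummies = pd.get_dummies(X[cat_cols])
print X_dummies.shape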
In [27]:
# encode each categorical feature as integer labels
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
d = defaultdict(LabelEncoder)
X_cat = X[cat_cols].apply(lambda x: d[x.name].fit_transform(x))
# expand the integer labels into a binary (one-hot) matrix
from sklearn.preprocessing import OneHotEncoder
Ohe = OneHotEncoder(sparse=False)
Ohe.fit(X_cat)
X_cat = Ohe.transform(X_cat)
X_cat = pd.DataFrame(X_cat)
print "Total number of cols after conversion: %d" % X_cat.shape[1]
print X_cat.shape
X_n = X[n_cols]
X_allcat = pd.concat([X_cat, X_n], axis=1)
print "the new feature matrix has the shape", X_allcat.shape
In [108]:
# collect one-hot columns that are non-zero for at most one sample
col_sparse = []
for i in X_allcat.columns:
    if len((X_allcat[i].nonzero())[0]) <= 1:
        col_sparse.append(i)
print len(col_sparse)
In [109]:
X_2 = X_allcat.drop(col_sparse, axis = 1)
print "the non-sparse features matrix has the shape", X_2.shape
In [95]:
# split the combined matrix back into train/test, then carve out a validation set
from sklearn.model_selection import train_test_split
X_train_a = X_allcat.iloc[:1460, :]
X_test_a = X_allcat.iloc[1460:, :]
y_target = dataset_exp['SalePrice']
X_train, X_val, y_train, y_val = train_test_split(X_train_a, y_target, test_size=0.1, random_state=1)
print "Training set shape is:", X_train.shape
print "Val set shape is:", X_val.shape
print "y_train shape is:", y_train.shape
print "y_val shape is:", y_val.shape
In [110]:
# same split, but using the reduced (non-sparse) feature matrix
from sklearn.model_selection import train_test_split
X_train_a = X_2.iloc[:1460, :]
X_test_a = X_2.iloc[1460:, :]
y_target = dataset_exp['SalePrice']
X_train, X_val, y_train, y_val = train_test_split(X_train_a, y_target, test_size=0.1, random_state=1)
print "Training set shape is:", X_train.shape
print "Val set shape is:", X_val.shape
print "y_train shape is:", y_train.shape
print "y_val shape is:", y_val.shape
In [98]:
X_test_a['LotArea'].head(5)  # a quick sanity check that the train/test split indices line up
Out[98]:
In [190]:
# examine the distribution of the log-transformed target
import matplotlib.mlab as mlab
from scipy.stats import norm, lognorm
lg_y_target = np.log(y_target)
(mu, sigma) = norm.fit(lg_y_target)
n, bins, patches = plt.hist(lg_y_target, 60, normed=1, facecolor='green', alpha=0.75)
y = mlab.normpdf(bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=2)
plt.xlabel('log sale price')
plt.ylabel('probability')
plt.grid(True)
plt.show()
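In [ ]:
# additional check (sketch): a normal probability plot of the log-transformed
# target; points lying close to the line support the log transform
from scipy import stats
stats.probplot(lg_y_target, dist="norm", plot=plt)
plt.show()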
In [143]:
# Machine learning method selection:
# 1. Random Forest trained on the raw target, evaluated with RMSE of the log prices
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
lg_y_train = np.log(y_train)
lg_y_val = np.log(y_val)
rf_dict = {}
for n_est in np.arange(80, 120, 10):
    rf = RandomForestRegressor(n_estimators=n_est)
    forest = rf.fit(X_train, y_train)
    y_train_pred = np.log(rf.predict(X_train))
    y_val_pred = np.log(rf.predict(X_val))
    #train_score = rf.score(X_train, y_train)
    #res_score = rf.score(X_val, y_val)
    train_score = mean_squared_error(lg_y_train, y_train_pred)
    res_score = mean_squared_error(lg_y_val, y_val_pred)
    print "n_estimators: ", n_est
    print "training set score is:", sqrt(train_score), "Val set score is:", sqrt(res_score)
    rf_dict[n_est] = (train_score, res_score)
# 110 estimators seems to be a reasonable choice
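In [ ]:
# quick sketch: visualize the train/validation RMSE stored in rf_dict above
ns = sorted(rf_dict.keys())
plt.plot(ns, [sqrt(rf_dict[n][0]) for n in ns], 'o-', label='train RMSE (log price)')
plt.plot(ns, [sqrt(rf_dict[n][1]) for n in ns], 'o-', label='val RMSE (log price)')
plt.xlabel('n_estimators')
plt.legend()
plt.show()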
In [132]:
# 2. Random Forest trained on the log-transformed target (worse than the raw target)
lg_y_train = np.log(y_train)
lg_y_val = np.log(y_val)
rf_log_dic = {}
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt
for n_est in np.arange(50, 200, 10):
    rf = RandomForestRegressor(n_estimators=n_est)
    forest = rf.fit(X_train, lg_y_train)
    y_train_pred = rf.predict(X_train)
    y_val_pred = rf.predict(X_val)
    #train_score = r2_score(y_train, y_train_pred)
    train_score = mean_squared_error(lg_y_train, y_train_pred)
    #res_score = r2_score(y_val, y_val_pred)
    res_score = mean_squared_error(lg_y_val, y_val_pred)
    print "n_estimators: ", n_est
    print "training set score is:", sqrt(train_score), "Val set score is:", sqrt(res_score)
    rf_log_dic[n_est] = (train_score, res_score)
In [128]:
import xgboost as xgb
# XGBoost regressor with manually tuned hyperparameters
regr = xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0, learning_rate=0.005, max_depth=6,
                        min_child_weight=2, n_estimators=7200, reg_alpha=0.9,
                        reg_lambda=0.6, subsample=0.3, seed=40, silent=1)
In [129]:
from sklearn.metrics import mean_squared_error
from math import sqrt
# fit XGBoost on the log-transformed target and report the validation RMSE
lg_y_train = np.log(y_train)
lg_y_val = np.log(y_val)
regr.fit(X_train, lg_y_train)
y_val_pred = regr.predict(X_val)
score = mean_squared_error(lg_y_val, y_val_pred)
#print r2_score(y_val, y_val_pred)
print sqrt(score)
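In [ ]:
# sketch (not how the model above was tuned): let XGBoost choose the number of
# boosting rounds via early stopping on the validation split; depending on the
# xgboost version, early_stopping_rounds may need to go to the constructor instead
regr_es = xgb.XGBRegressor(colsample_bytree=0.2, learning_rate=0.005, max_depth=6,
                           min_child_weight=2, n_estimators=10000, subsample=0.3,
                           seed=40, silent=1)
regr_es.fit(X_train, lg_y_train, eval_set=[(X_val, lg_y_val)],
            eval_metric='rmse', early_stopping_rounds=200, verbose=False)
print sqrt(mean_squared_error(lg_y_val, regr_es.predict(X_val)))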
In [171]:
# Lasso regression on the log-transformed target
from sklearn.linear_model import Lasso
#alpha = 0.00099
alpha_1 = 0.00029
las = Lasso(alpha=alpha_1, max_iter=50000)
las.fit(X_train, lg_y_train)
y_val_pred = las.predict(X_val)
score = mean_squared_error(lg_y_val, y_val_pred)
print sqrt(score)
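In [ ]:
# quick sketch: count how many features the Lasso actually keeps (non-zero
# coefficients), since L1 regularization also performs feature selection
print "non-zero coefficients: %d of %d" % (np.sum(las.coef_ != 0), len(las.coef_))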
In [176]:
import matplotlib.pyplot as plt
# compare Lasso and XGBoost predictions on the training set (log-price scale)
plt.plot(las.predict(X_train), regr.predict(X_train), 'o')
plt.xlim(11, 13.5)
plt.ylim(11, 13.5)
Out[176]:
In [185]:
# weighted blend of Lasso and XGBoost predictions, evaluated on the validation set
pred_w = 0.4 * las.predict(X_val) + 0.6 * regr.predict(X_val)
cb_score = mean_squared_error(lg_y_val, pred_w)
print sqrt(cb_score)
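In [ ]:
# small sketch: scan the blend weight on the validation set instead of fixing 0.4/0.6
las_val = las.predict(X_val)
xgb_val = regr.predict(X_val)
for w in np.arange(0.0, 1.01, 0.1):
    blend = w * las_val + (1 - w) * xgb_val
    print "w = %.1f, val RMSE = %.5f" % (w, sqrt(mean_squared_error(lg_y_val, blend)))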
In [115]:
# baseline random forest on the raw target; score() reports R^2 on the validation set
rf_f = RandomForestRegressor(n_estimators=100)
forest_f = rf_f.fit(X_train, y_train)
res_score = rf_f.score(X_val, y_val)
print res_score
In [227]:
pred = rf_f.predict(X_test_a)
print pred.shape
In [130]:
# XGBoost: predict on the test set and back-transform from log prices
pred = np.exp(regr.predict(X_test_a))
In [188]:
submission = pd.read_csv('sample_submission.csv')
submission.iloc[:,1] = pred
submission.to_csv('submission.csv',index = None)
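In [ ]:
# sanity-check sketch (assumes the standard Kaggle format with an Id column):
# the written file should have 1459 rows whose Ids match the test set
sub_check = pd.read_csv('submission.csv')
print sub_check.shape
print (sub_check['Id'].values == testset['Id'].values).all()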
In [172]:
pred = np.exp(las.predict(X_test_a))  # Lasso: back-transform from log prices
In [187]:
# final blend: weighted average of the Lasso and XGBoost log-price predictions, back-transformed
pred_s = 0.4 * las.predict(X_test_a) + 0.6 * regr.predict(X_test_a)
pred = np.exp(pred_s)
In [ ]: