notebook.community

Edit and run



In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
%matplotlib inline

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))



In [2]:

    
df_train = pd.read_csv('train.csv')



In [3]:

    
df_train.head()









    Out[3]:







  
    
      
      Id
      MSSubClass
      MSZoning
      LotFrontage
      LotArea
      Street
      Alley
      LotShape
      LandContour
      Utilities
      ...
      PoolArea
      PoolQC
      Fence
      MiscFeature
      MiscVal
      MoSold
      YrSold
      SaleType
      SaleCondition
      SalePrice
    
  
  
    
      0
      1
      60
      RL
      65.0
      8450
      Pave
      NaN
      Reg
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      2
      2008
      WD
      Normal
      208500
    
    
      1
      2
      20
      RL
      80.0
      9600
      Pave
      NaN
      Reg
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      5
      2007
      WD
      Normal
      181500
    
    
      2
      3
      60
      RL
      68.0
      11250
      Pave
      NaN
      IR1
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      9
      2008
      WD
      Normal
      223500
    
    
      3
      4
      70
      RL
      60.0
      9550
      Pave
      NaN
      IR1
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      2
      2006
      WD
      Abnorml
      140000
    
    
      4
      5
      60
      RL
      84.0
      14260
      Pave
      NaN
      IR1
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      12
      2008
      WD
      Normal
      250000
    
  

5 rows × 81 columns



In [4]:

    
df_train['SalePrice'].describe()









    Out[4]:





count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64



In [5]:

    
sns.distplot(df_train['SalePrice'])









    Out[5]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f38efe35710>



In [6]:

    
#correlation matrix
corrmat = df_train.corr()
f,ax = plt.subplots(figsize=(12,9))
sns.heatmap(corrmat)









    Out[6]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f38ea404320>



In [7]:

    
#saleprice correlation matrix
k = 20
cols = corrmat.nlargest(k,'SalePrice')['SalePrice']
print (cols)









    



SalePrice       1.000000
OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
GarageYrBlt     0.486362
MasVnrArea      0.477493
Fireplaces      0.466929
BsmtFinSF1      0.386420
LotFrontage     0.351799
WoodDeckSF      0.324413
2ndFlrSF        0.319334
OpenPorchSF     0.315856
HalfBath        0.284108
Name: SalePrice, dtype: float64

Choose the index

To simply train the model, I pick up 3 aspect of the index.

Area related : GrLivArea, GarageArea, TotalBsmtSF

Number/Rank related: OverallQual(don't know how it compute), FullBath, Fireplaces

Time related : YearBuilt, YearRemodAdd



In [8]:

    
#missing data
total = df_train.isnull().sum().sort_values(ascending = False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending = False)
missing_data = pd.concat([total,percent],axis=1,keys=['Total','Percent'])
missing_data.head(25)









    Out[8]:







  
    
      
      Total
      Percent
    
  
  
    
      PoolQC
      1453
      0.995205
    
    
      MiscFeature
      1406
      0.963014
    
    
      Alley
      1369
      0.937671
    
    
      Fence
      1179
      0.807534
    
    
      FireplaceQu
      690
      0.472603
    
    
      LotFrontage
      259
      0.177397
    
    
      GarageCond
      81
      0.055479
    
    
      GarageType
      81
      0.055479
    
    
      GarageYrBlt
      81
      0.055479
    
    
      GarageFinish
      81
      0.055479
    
    
      GarageQual
      81
      0.055479
    
    
      BsmtExposure
      38
      0.026027
    
    
      BsmtFinType2
      38
      0.026027
    
    
      BsmtFinType1
      37
      0.025342
    
    
      BsmtCond
      37
      0.025342
    
    
      BsmtQual
      37
      0.025342
    
    
      MasVnrArea
      8
      0.005479
    
    
      MasVnrType
      8
      0.005479
    
    
      Electrical
      1
      0.000685
    
    
      Utilities
      0
      0.000000
    
    
      YearRemodAdd
      0
      0.000000
    
    
      MSSubClass
      0
      0.000000
    
    
      Foundation
      0
      0.000000
    
    
      ExterCond
      0
      0.000000
    
    
      ExterQual
      0
      0.000000



In [9]:

    
# In the search for normality
sns.distplot(df_train['SalePrice'],fit=norm)
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'],plot=plt)



In [10]:

    
df_train['SalePrice'] = np.log(df_train['SalePrice'])
sns.distplot(df_train['SalePrice'],fit=norm)
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'],plot=plt)



In [11]:

    
sns.distplot(df_train['GrLivArea'],fit=norm)
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'],plot=plt)



In [12]:

    
# do log transformation
df_train['GrLivArea'] = np.log(df_train['GrLivArea'])
sns.distplot(df_train['GrLivArea'],fit=norm)
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'],plot=plt)



In [13]:

    
sns.distplot(df_train['GarageArea'],fit=norm)
fig = plt.figure()
res = stats.probplot(df_train['GarageArea'],plot=plt)



In [14]:

    
sns.distplot(df_train['TotalBsmtSF'],fit=norm)
fig = plt.figure()
res = stats.probplot(df_train['TotalBsmtSF'],plot=plt)



In [15]:

    
sns.distplot(df_train['OverallQual'])









    Out[15]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f38e82ca3c8>



In [16]:

    
sns.distplot(df_train['FullBath'])









    Out[16]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f38ea0402b0>



In [17]:

    
sns.distplot(df_train['Fireplaces'])









    Out[17]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f38e83a8278>



In [18]:

    
sns.distplot(df_train['YearBuilt'],fit=norm)
fig = plt.figure()
res = stats.probplot(df_train['YearBuilt'],plot=plt)



In [19]:

    
sns.distplot(df_train['YearRemodAdd'],fit=norm)
fig = plt.figure()
res = stats.probplot(df_train['YearRemodAdd'],plot=plt)

Pick the data we need

SalePrice

Area related : GrLivArea, GarageArea, TotalBsmtSF

Number/Rank related: OverallQual(don't know how it compute), FullBath, Fireplaces

Time related : YearBuilt, YearRemodAdd



In [20]:

    
# create new training set by previous feature selection
X_data = pd.concat([df_train['GrLivArea'],df_train['GarageArea'],df_train['TotalBsmtSF'],df_train['OverallQual'],df_train['FullBath'],df_train['Fireplaces'],df_train['YearBuilt'],df_train['YearRemodAdd']],axis=1)
y_data = df_train['SalePrice']



In [21]:

    
#split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data,y_data,test_size=0.2,random_state=0)



In [22]:

    
# train the models
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit (X_train,y_train)
print("Root Mean squared error: %.8f"
      % rmse((reg.predict(X_test)),y_test))









    



Root Mean squared error: 0.19458348



In [23]:

    
from sklearn import linear_model
reg = linear_model.RidgeCV(alphas=[0.1,0.25,0.3,0.33,0.375,0.5,1.0,5.0,10.0])
reg.fit (X_train,y_train)
print (reg.alpha_) 
print("Root Mean squared error: %.8f"
      % rmse((reg.predict(X_test)),y_test))









    



0.33
Root Mean squared error: 0.19455535



In [24]:

    
from sklearn import linear_model
reg = linear_model.LassoCV(alphas=[0.0001,0.001,0.0125,0.025,0.05])
reg.fit (X_train,y_train)
print (reg.alpha_) 
print("Root Mean squared error: %.8f"
      % rmse((reg.predict(X_test)),y_test))









    



0.001
Root Mean squared error: 0.19441522

How about using Xgboost?



In [34]:

    
import xgboost as xgb

regr = xgb.XGBRegressor(
                 colsample_bytree=0.8,
                 gamma=0.0,
                 learning_rate=0.001,
                 max_depth=4,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                  
                 reg_alpha=0.9,
                 reg_lambda=0.6,
                 subsample=0.8,
                 seed=42,
                 silent=False)



In [35]:

    
regr.fit(X_train,y_train)
y_pred = regr.predict(X_test)
print("XGBoost score on training set: ", rmse(y_test, y_pred))









    



XGBoost score on training set:  0.14765221021



In [27]:

    
#create prediction csv
df_test = pd.read_csv('test.csv')
#transfer data
df_test['GrLivArea'] = np.log(df_train['GrLivArea'])
X_data = pd.concat([df_test['GrLivArea'],df_test['GarageArea'],df_test['TotalBsmtSF'],df_test['OverallQual'],df_test['FullBath'],df_test['Fireplaces'],df_test['YearBuilt'],df_test['YearRemodAdd']],axis=1)

y_pred_data = regr.predict(X_data)
y_pred_data = np.exp(y_pred_data)

pd.DataFrame({'Id': df_test['Id'], 'SalePrice':y_pred_data}).to_csv('result.csv', index =False)



In [ ]:

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolQC	Fence	MiscFeature	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	2	2008	WD	Normal	208500
1	2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	5	2007	WD	Normal	181500
2	3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	9	2008	WD	Normal	223500
3	4	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	2	2006	WD	Abnorml	140000
4	5	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	12	2008	WD	Normal	250000

	Total	Percent
PoolQC	1453	0.995205
MiscFeature	1406	0.963014
Alley	1369	0.937671
Fence	1179	0.807534
FireplaceQu	690	0.472603
LotFrontage	259	0.177397
GarageCond	81	0.055479
GarageType	81	0.055479
GarageYrBlt	81	0.055479
GarageFinish	81	0.055479
GarageQual	81	0.055479
BsmtExposure	38	0.026027
BsmtFinType2	38	0.026027
BsmtFinType1	37	0.025342
BsmtCond	37	0.025342
BsmtQual	37	0.025342
MasVnrArea	8	0.005479
MasVnrType	8	0.005479
Electrical	1	0.000685
Utilities	0	0.000000
YearRemodAdd	0	0.000000
MSSubClass	0	0.000000
Foundation	0	0.000000
ExterCond	0	0.000000
ExterQual	0	0.000000