In [1]:
# pandas
import pandas as pd
from pandas import Series, DataFrame
# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
from IPython.display import display
# suppress warnings
import warnings
warnings.filterwarnings('ignore')
In [2]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
print(train_data.shape)
display(train_data.head(1))
# display(train_data.info())
print(test_data.shape)
display(test_data.head(1))
# display(test_data.info())
In [3]:
previous_num_columns = train_data.select_dtypes(exclude=['object']).columns.values.tolist()
previous_num_columns.remove('Id')
previous_num_columns.remove('SalePrice')
print(previous_num_columns)
In [4]:
test_column = 'MasVnrArea'
sns.kdeplot(train_data[test_column])
sns.kdeplot(test_data[test_column])
print('train:', train_data[test_column][train_data[test_column] > 1500].shape)
print('test:', test_data[test_column][test_data[test_column] > 1500].shape)
For several features, the test set contains no values in the upper range while the training set has a few such outliers; remove those rows from the training set.
In [5]:
print(train_data.shape)
train_data.drop(train_data[train_data["LotFrontage"] > 200].index, inplace=True)
train_data.drop(train_data[train_data["LotArea"] > 70000].index, inplace=True)
train_data.drop(train_data[train_data["MasVnrArea"] > 1500].index, inplace=True)
print(train_data.shape)
train_length = train_data.shape[0]
Merge the training and test sets.
In [6]:
conbined_data = pd.concat([train_data.loc[:, : 'SalePrice'], test_data])
conbined_data = conbined_data[test_data.columns]
display(conbined_data.head(1))
print(conbined_data.shape)
In [7]:
# miss data columns
has_null_columns = conbined_data.columns[conbined_data.isnull().any()].tolist()
# number of missing values per column in the train/test/combined data
train_null = train_data[has_null_columns].isnull().sum()
test_null = test_data[has_null_columns].isnull().sum()
conbined_null = conbined_data[has_null_columns].isnull().sum()
print('Number of missing values per column in the train/test/combined data:')
missed_data = pd.DataFrame(data=[train_null, test_null, conbined_null],
                           index=['train', 'test', 'combined'],
                           columns=has_null_columns)
missed_data
Out[7]:
In [8]:
def fill_missing_conbined_data(column, value):
    conbined_data.loc[conbined_data[column].isnull(), column] = value
LotFrontage/LotArea
For missing LotFrontage (linear feet of street connected to the property), the mean is a common fill value, but LotFrontage plausibly depends on the Neighborhood.
In [9]:
conbined_data['LotFrontage'].groupby(conbined_data["Neighborhood"]).median().plot()
conbined_data['LotFrontage'].groupby(conbined_data["Neighborhood"]).mean().plot()
Out[9]:
In [10]:
lf_neighbor_map = conbined_data['LotFrontage'].groupby(conbined_data["Neighborhood"]).median()
rows = conbined_data['LotFrontage'].isnull()
conbined_data.loc[rows, 'LotFrontage'] = conbined_data.loc[rows, 'Neighborhood'].map(lf_neighbor_map)
In [11]:
conbined_data[conbined_data['LotFrontage'].isnull()]
Out[11]:
Alley
In [12]:
conbined_data.shape, conbined_data[conbined_data['Alley'].isnull()].shape
Out[12]:
2721 of the 2919 records are missing Alley; with that much missing (93.2%), fill the gaps with 'NA' (NA -> no alley access).
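As a quick sanity check (a minimal sketch against the merged conbined_data above), the missing ratio can be computed directly:
# Fraction of missing Alley values; Series.isnull().mean() is the missing ratio
print('Alley missing ratio: {:.1%}'.format(conbined_data['Alley'].isnull().mean()))  # ~93.2%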
In [13]:
fill_missing_conbined_data('Alley', 'NA')
MasVnrType / MasVnrArea
Masonry veneer type and area; fill the missing type with the most frequent value ('None') and the missing area with 0.
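To confirm that 'None' is in fact the most frequent type, a small check (sketch):
# Frequency of each masonry veneer type; mode()[0] is the most common value
print(conbined_data['MasVnrType'].value_counts())
print(conbined_data['MasVnrType'].mode()[0])  # 'None'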
In [14]:
conbined_data['MasVnrType'].fillna('None', inplace=True)
conbined_data['MasVnrArea'].fillna(0, inplace=True)
BsmtQual / BsmtCond / BsmtExposure / BsmtFinType1 / BsmtFinType2
Each of these columns is missing 37 or 38 values.
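A quick per-column count of the missing values (a sketch; the column list matches the next cell):
# Each of these string columns should be missing in 37 or 38 records
bsmt_str_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
print(conbined_data[bsmt_str_cols].isnull().sum())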
In [15]:
basement_cols=['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','BsmtFinSF1','BsmtFinSF2']
In [16]:
# Fill the missing string-typed basement columns with 'NA', meaning "No Basement"
for column in basement_cols:
    if 'FinSF' not in column:
        # NA -> No Basement
        fill_missing_conbined_data(column, 'NA')
In [17]:
# For the numeric BsmtFinSF1 and BsmtFinSF2, fill with 0
fill_missing_conbined_data('BsmtFinSF1', 0)
fill_missing_conbined_data('BsmtFinSF2', 0)
Electrical
One value is missing; fill it with the most frequent category.
In [18]:
sns.countplot(conbined_data['Electrical'])
Out[18]:
In [19]:
fill_missing_conbined_data('Electrical', 'SBrkr')
FireplaceQu
1420 values are missing.
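The comment in the next cell claims the missing rows are exactly those without a fireplace; a quick check (sketch):
# Missing FireplaceQu rows should all have Fireplaces == 0
missing_fq = conbined_data['FireplaceQu'].isnull()
print(missing_fq.sum())                                          # 1420
print((conbined_data.loc[missing_fq, 'Fireplaces'] == 0).all())  # expected True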
In [20]:
# Rows with Fireplaces == 0 get FireplaceQu 'NA' (No Fireplace); this accounts for all 1420 missing records
fill_missing_conbined_data('FireplaceQu', 'NA')
PoolQC
PoolQC is missing in 2909 records; this should line up with PoolArea being 0.
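A quick check of that relationship (sketch):
# Are the missing PoolQC rows those with PoolArea == 0?
missing_pq = conbined_data['PoolQC'].isnull()
print(missing_pq.sum())                                        # ~2909 per the note above
print((conbined_data.loc[missing_pq, 'PoolArea'] == 0).sum())  # nearly all of them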
In [21]:
fill_missing_conbined_data('PoolQC', 'NA')
MiscFeature
In [22]:
fill_missing_conbined_data('MiscFeature', 'NA')
Fence
In [23]:
fill_missing_conbined_data('Fence', 'NA')
Garages
In [24]:
garage_cols=['GarageType','GarageQual','GarageCond','GarageYrBlt','GarageFinish','GarageCars','GarageArea']
conbined_data[garage_cols][conbined_data['GarageType'].isnull()].head(1)
Out[24]:
In [25]:
# Fill the missing string-typed garage columns with 'NA', meaning "No Garage"
for column in garage_cols:
    if column != 'GarageCars' and column != 'GarageArea':
        # NA -> No Garage
        fill_missing_conbined_data(column, 'NA')
    else:
        fill_missing_conbined_data(column, 0)
MSZoning
In [26]:
sns.countplot(conbined_data['MSZoning'])
Out[26]:
In [27]:
fill_missing_conbined_data('MSZoning', 'RL')
Utilities
Utilities can definitely be ignored: all records are "AllPub" except for one "NoSeWa" in the train set and two NAs in the test set.
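The exact counts can be listed with value_counts (sketch):
# Tally every Utilities category, keeping NaN visible
print(conbined_data['Utilities'].value_counts(dropna=False))  # expect almost all 'AllPub', one 'NoSeWa', two NaN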
In [28]:
sns.countplot(conbined_data['Utilities'])
Out[28]:
In [29]:
fill_missing_conbined_data('Utilities', 'AllPub')
Exterior1st / Exterior2nd
In [30]:
sns.countplot(conbined_data['Exterior1st'])
Out[30]:
In [31]:
fill_missing_conbined_data('Exterior1st', 'VinylSd')
In [32]:
sns.countplot(conbined_data['Exterior2nd'])
Out[32]:
In [33]:
fill_missing_conbined_data('Exterior2nd', 'VinylSd')
BsmtUnfSF / TotalBsmtSF
In [34]:
# The missing TotalBsmtSF and BsmtUnfSF can simply be filled with 0
fill_missing_conbined_data('BsmtUnfSF', 0)
fill_missing_conbined_data('TotalBsmtSF', 0)
BsmtFullBath / BsmtHalfBath
In [35]:
fill_missing_conbined_data('BsmtFullBath', 0)
fill_missing_conbined_data('BsmtHalfBath', 0)
KitchenQual
In [36]:
sns.countplot(conbined_data['KitchenQual'])
Out[36]:
In [37]:
fill_missing_conbined_data('KitchenQual', 'TA')
SaleType / Functional
In [38]:
sns.countplot(conbined_data['Functional'])
Out[38]:
In [39]:
fill_missing_conbined_data('SaleType', 'WD')
fill_missing_conbined_data('Functional', 'Typ')
Missing-value imputation is now complete.
In [40]:
# Columns that still contain missing values
has_null_columns = conbined_data.columns[conbined_data.isnull().any()].tolist()
# Number of missing values in each such column
conbined_data[has_null_columns].isnull().sum()
Out[40]:
Year features (YearBuilt, GarageYrBlt)
In [41]:
built_year_data = conbined_data[['YearBuilt', 'GarageYrBlt']][conbined_data['GarageYrBlt'] != 'NA']
In [42]:
built_year_data['GarageYrBlt'] = built_year_data['GarageYrBlt'].astype(int)
built_year_data['GarageYrBlt'].corr(built_year_data['YearBuilt'])
Out[42]:
YearBuilt and GarageYrBlt turn out to be highly correlated.
In [43]:
built_year_data.head(1)
Out[43]:
In [44]:
from sklearn.linear_model import LinearRegression
regr = LinearRegression()
garage_year = built_year_data.loc[:,'GarageYrBlt'].values
built_year = built_year_data.loc[:,'YearBuilt'].values
length = garage_year.shape[0]
garage_year = garage_year.reshape(length, 1)
built_year = built_year.reshape(length, 1)
# Train the model using the training sets
regr.fit(built_year, garage_year)
plt.scatter(built_year, garage_year, color='blue')
plt.plot(built_year, regr.predict(built_year), color='red',
linewidth=3)
Out[44]:
In [45]:
# Fill the 'NA' GarageYrBlt values with the regression prediction from YearBuilt
conbined_data['GarageYrBlt'] = conbined_data.apply(
    lambda row: int(regr.predict([[row['YearBuilt']]])[0][0])
    if row['GarageYrBlt'] == 'NA' else int(row['GarageYrBlt']),
    axis=1)
In [46]:
conbined_data['GarageYrBlt'].head()
Out[46]:
YearBuilt is the construction year and YearRemodAdd the remodel year; together they tell whether the house was ever remodeled.
In [47]:
# How many years passed between construction and remodeling
conbined_data['RemodYears'] = conbined_data['YearRemodAdd'] - conbined_data['YearBuilt']
# Did a remodel happen after construction?
conbined_data["HasRemodeled"] = (conbined_data["YearRemodAdd"] != conbined_data["YearBuilt"]) * 1
# Did a remodeling happen in the year the house was sold?
conbined_data["HasRecentRemodel"] = (conbined_data["YearRemodAdd"] == conbined_data["YrSold"]) * 1
In [48]:
conbined_data['GarageBltYears'] = conbined_data['GarageYrBlt'] - conbined_data['YearBuilt']
In [49]:
conbined_data['GarageBltYears'].head()
Out[49]:
In [50]:
# How many years since construction (relative to 2017)?
conbined_data['Now_YearBuilt'] = 2017 - conbined_data['YearBuilt']
conbined_data['Now_YearRemodAdd'] = 2017 - conbined_data['YearRemodAdd']
conbined_data['Now_GarageYrBlt'] = 2017 - conbined_data['GarageYrBlt']
In [51]:
conbined_data[['Now_YearBuilt','Now_YearRemodAdd','Now_GarageYrBlt']].head(3)
Out[51]:
In [52]:
train_data['SalePrice'].groupby(train_data['MoSold']).mean().plot()
Out[52]:
In [53]:
sns.countplot(conbined_data['MoSold'])
Out[53]:
The number of houses sold per month and the mean monthly sale price move roughly inversely.
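That inverse relationship can be quantified by correlating monthly sale counts with monthly mean prices (a sketch; MoSold in train_data is still numeric here):
# A clearly negative correlation supports the "roughly inverse" observation
month_count = train_data['MoSold'].value_counts().sort_index()
month_mean_price = train_data['SalePrice'].groupby(train_data['MoSold']).mean()
print(month_count.corr(month_mean_price))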
In [54]:
sale_price_month = train_data['SalePrice'].groupby(train_data['MoSold']).mean().to_dict()
# Mean sale price of houses sold in that month
conbined_data["MonthSaledMeanPrice"] = conbined_data["MoSold"].replace(sale_price_month)
In [55]:
# Count how many houses were sold in each month
sale_month = {"1": 0, "2": 0, "3": 0, "4": 0, "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "10": 0, "11": 0, "12": 0}
for m in conbined_data['MoSold'].values:
    sale_month[str(m)] = sale_month[str(m)] + 1
In [56]:
# MoSold is numeric; convert it to string so it is treated as a categorical feature
conbined_data['MoSold'] = conbined_data['MoSold'].map(lambda m : str(m))
In [57]:
# Number of houses sold in that month
conbined_data["MonthSaledCount"] = conbined_data["MoSold"].replace(sale_month)
The numeric MSSubClass codes merely identify the type of dwelling, so encode them as categories.
In [58]:
train_data['SalePrice'].groupby(train_data['MSSubClass']).mean().plot()
Out[58]:
In [59]:
sns.countplot(conbined_data['MSSubClass'])
Out[59]:
Both price and sale count are clearly related to MSSubClass; per the data description, the key distinction is whether the dwelling is NEWER.
In [60]:
conbined_data["NewerDwelling"] = conbined_data["MSSubClass"].replace(
{20: 1, 30: 0, 40: 0, 45: 0, 50: 0, 60: 1, 70: 0, 75: 0, 80: 0, 85: 0,
90: 0, 120: 1, 150: 0, 160: 0, 180: 0, 190: 0})
In [61]:
# Add the mean sale price for each MSSubClass
sale_price_mssc = train_data['SalePrice'].groupby(train_data['MSSubClass']).mean().to_dict()
# Mean sale price of that dwelling class
conbined_data["MSSubClassMeanPrice"] = conbined_data["MSSubClass"].replace(sale_price_mssc)
In [62]:
mssubclass_dict = {
20: 'SC20',
30: 'SC30',
40: 'SC40',
45: 'SC45',
50: 'SC50',
60: 'SC60',
70: 'SC70',
75: 'SC75',
80: 'SC80',
85: 'SC85',
90: 'SC90',
120: 'SC120',
150: 'SC150',
160: 'SC160',
180: 'SC180',
190: 'SC190',
}
conbined_data['MSSubClass'] = conbined_data['MSSubClass'].replace(mssubclass_dict)
Encode the attributes that have an inherent quality ordering.
In [63]:
good_level_map = {'Street': {'Grvl': 0, 'Pave': 1},
'Alley': {'NA':0, 'Grvl': 1, 'Pave': 2},
'Utilities': {'AllPub':3, 'NoSeWa': 1, 'NoSewr': 2, 'ELO': 0},
'ExterQual': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1,'Po': 0},
'ExterCond': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1,'Po': 0},
'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1,'NA': 0},
'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2,'Po': 1,'NA': 0},
'BsmtCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2,'Po': 1,'NA': 0},
'BsmtFinType1': {'NA':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6},
'BsmtFinType2': {'NA':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6},
'HeatingQC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1,'Po': 0},
'CentralAir': {'N':0, 'Y':1},
'KitchenQual': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
'Functional': {'Typ':0,'Min1':1,'Min2':1,'Mod':2,'Maj1':3,'Maj2':4,'Sev':5,'Sal': 6},
'FireplaceQu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
'GarageQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
'GarageCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
'PoolQC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'NA': 0},
'Fence': {'GdPrv': 2, 'GdWo': 2, 'MnPrv': 1, 'MnWw': 1, 'NA': 0}
}
print(list(good_level_map.keys()))
good_level_data = conbined_data[list(good_level_map.keys())].replace(good_level_map)
good_level_data.columns = good_level_data.columns.map(lambda m : m + '_')
conbined_data[good_level_data.columns] = good_level_data[good_level_data.columns]
print(conbined_data.shape)
Neighborhood names refer to physical locations in Ames, so they can be mapped to latitude/longitude coordinates.
In [64]:
# Latitude
conbined_data["latitude"] = conbined_data.Neighborhood.replace(
{'Blmngtn' : 42.062806,
'Blueste' : 42.009408,
'BrDale' : 42.052500,
'BrkSide': 42.033590,
'ClearCr': 42.025425,
'CollgCr': 42.021051,
'Crawfor': 42.025949,
'Edwards': 42.022800,
'Gilbert': 42.027885,
'GrnHill': 42.000854,
'IDOTRR' : 42.019208,
'Landmrk': 42.044777,
'MeadowV': 41.991866,
'Mitchel': 42.031307,
'NAmes' : 42.042966,
'NoRidge': 42.050307,
'NPkVill': 42.050207,
'NridgHt': 42.060356,
'NWAmes' : 42.051321,
'OldTown': 42.028863,
'SWISU' : 42.017578,
'Sawyer' : 42.033611,
'SawyerW': 42.035540,
'Somerst': 42.052191,
'StoneBr': 42.060752,
'Timber' : 41.998132,
'Veenker': 42.040106})
# Longitude
conbined_data["longitude"] = conbined_data.Neighborhood.replace(
{'Blmngtn' : -93.639963,
'Blueste' : -93.645543,
'BrDale' : -93.628821,
'BrkSide': -93.627552,
'ClearCr': -93.675741,
'CollgCr': -93.685643,
'Crawfor': -93.620215,
'Edwards': -93.663040,
'Gilbert': -93.615692,
'GrnHill': -93.643377,
'IDOTRR' : -93.623401,
'Landmrk': -93.646239,
'MeadowV': -93.602441,
'Mitchel': -93.626967,
'NAmes' : -93.613556,
'NoRidge': -93.656045,
'NPkVill': -93.625827,
'NridgHt': -93.657107,
'NWAmes' : -93.633798,
'OldTown': -93.615497,
'SWISU' : -93.651283,
'Sawyer' : -93.669348,
'SawyerW': -93.685131,
'Somerst': -93.643479,
'StoneBr': -93.628955,
'Timber' : -93.648335,
'Veenker': -93.657032})
# Ames city center: longitude -93.63191310000002, latitude 42.0307812; compute each neighborhood's distance to it
conbined_data["NeighborDistance"] = np.sqrt(np.power((conbined_data["longitude"] - (-93.63191310000002)),2) + \
np.power((conbined_data["latitude"] - 42.0307812),2))
display(conbined_data["NeighborDistance"].head())
conbined_data.drop(['longitude', 'latitude'], axis=1, inplace=True)
In [65]:
# Bin by neighborhood (a little arbitrarily). The bin values below were derived from the per-neighborhood median sale prices:
neighbor_price_map = train_data["SalePrice"].groupby(train_data["Neighborhood"]).median().sort_values().to_dict()
In [66]:
neighbor_bin_map = {
"MeadowV" : 0, # 88000
"IDOTRR" : 1, # 103000
"BrDale" : 1, # 106000
"OldTown" : 1, # 119000
"Edwards" : 1, # 119500
"BrkSide" : 1, # 124300
"Sawyer" : 1, # 135000
"Blueste" : 1, # 137500
"SWISU" : 2, # 139500
"NAmes" : 2, # 140000
"NPkVill" : 2, # 146000
"Mitchel" : 2, # 153500
"SawyerW" : 2, # 179900
"Gilbert" : 2, # 181000
"NWAmes" : 2, # 182900
"Blmngtn" : 2, # 191000
"CollgCr" : 2, # 197200
"ClearCr" : 3, # 200250
"Crawfor" : 3, # 200624
"Veenker" : 3, # 218000
"Somerst" : 3, # 225500
"Timber" : 3, # 228475
"StoneBr" : 4, # 278000
"NoRidge" : 4, # 290000
"NridgHt" : 4, # 315000
}
conbined_data["NeighborPrice"] = conbined_data["Neighborhood"].map(neighbor_price_map)
conbined_data["NeighborBin"] = conbined_data["Neighborhood"].map(neighbor_bin_map)
In [67]:
conbined_data.head()
Out[67]:
Ref: juliencs, "A study on Regression applied to the Ames dataset" (Kaggle kernel).
In [68]:
str_columns = conbined_data.select_dtypes(include=['object']).columns.values
num_columns = conbined_data.select_dtypes(exclude=['object']).columns.values
In [69]:
str_columns
Out[69]:
1. Create some boolean features
In [70]:
# Inspect the category frequencies of a sample string column
column = "SaleCondition"
count_dict = {}
for key in set(conbined_data[column]):
    count_dict[key] = 0
for m in conbined_data[column].values:
    count_dict[str(m)] = count_dict[str(m)] + 1
count_dict = sorted(count_dict.items(), key=lambda d: d[1], reverse=True)
print(np.array(count_dict)[:, 0])
sns.countplot(conbined_data[column])
Out[70]:
In [71]:
# IR2 and IR3 don't appear that often, so just make a distinction
# between regular and irregular.
conbined_data["IsRegularLotShape"] = (conbined_data["LotShape"] == "Reg") * 1
# Bnk, Low, HLS don't appear that often, so just flag Lvl vs. the rest
conbined_data["IsLandContourLvl"] = (conbined_data["LandContour"] == "Lvl") * 1
conbined_data["IsLotConfigInside"] = (conbined_data["LotConfig"] == "Inside") * 1
conbined_data["IsLandSlopeGentle"] = (conbined_data["LandSlope"] == "Gtl") * 1
conbined_data["IsCondition1Norm"] = (conbined_data["Condition1"] == "Norm") * 1
conbined_data["IsCondition2Norm"] = (conbined_data["Condition2"] == "Norm") * 1
conbined_data["IsBldgType1Fam"] = (conbined_data["BldgType"] == "1Fam") * 1
conbined_data["IsRoofStyleGable"] = (conbined_data["RoofStyle"] == "Gable") * 1
conbined_data["IsRoofMatlCompShg"] = (conbined_data["RoofMatl"] == "CompShg") * 1
conbined_data["IsGasAHeating"] = (conbined_data["Heating"] == "GasA") * 1
conbined_data["IsGarageFinished"] = (conbined_data["GarageFinish"] == "Fin") * 1
conbined_data["IsPavedDrive"] = (conbined_data["PavedDrive"] == "Y") * 1
conbined_data["IsSaleTypeWD"] = (conbined_data["SaleType"] == "WD") * 1
conbined_data["IsSaleConditionNormal"] = (conbined_data["SaleCondition"] == "Normal") * 1
In [72]:
# The only interesting "misc. feature" is the presence of a shed.
conbined_data["HasShed"] = (conbined_data["MiscFeature"] == "Shed") * 1.
# Was this house sold in the year it was built?
conbined_data["IsVeryNewHouse"] = (conbined_data["YearBuilt"] == conbined_data["YrSold"]) * 1
conbined_data["Has2ndFloor"] = (conbined_data["2ndFlrSF"] == 0) * 1
conbined_data["HasMasVnr"] = (conbined_data["MasVnrArea"] == 0) * 1
conbined_data["HasWoodDeck"] = (conbined_data["WoodDeckSF"] == 0) * 1
conbined_data["HasOpenPorch"] = (conbined_data["OpenPorchSF"] == 0) * 1
conbined_data["HasEnclosedPorch"] = (conbined_data["EnclosedPorch"] == 0) * 1
conbined_data["Has3SsnPorch"] = (conbined_data["3SsnPorch"] == 0) * 1
conbined_data["HasScreenPorch"] = (conbined_data["ScreenPorch"] == 0) * 1
In [73]:
# 2. Simplifications of existing features
conbined_data["SimplOverallQual"] = conbined_data.OverallQual.replace(
{1 : 1, 2 : 1, 3 : 1, # bad
4 : 2, 5 : 2, 6 : 2, # average
7 : 3, 8 : 3, 9 : 3, 10 : 3 # good
})
conbined_data["SimplOverallCond"] = conbined_data.OverallCond.replace(
{1 : 1, 2 : 1, 3 : 1, # bad
4 : 2, 5 : 2, 6 : 2, # average
7 : 3, 8 : 3, 9 : 3, 10 : 3 # good
})
In [74]:
# 3. Combinations of existing features
# Overall quality of the house
conbined_data["OverallGrade"] = conbined_data["OverallQual"] * conbined_data["OverallCond"]
# Overall quality of the garage (use the encoded '_' columns, not the raw strings)
conbined_data["GarageGrade"] = conbined_data["GarageQual_"] * conbined_data["GarageCond_"]
# Overall quality of the exterior
conbined_data["ExterGrade"] = conbined_data["ExterQual_"] * conbined_data["ExterCond_"]
# Overall kitchen score
conbined_data["KitchenScore"] = conbined_data["KitchenAbvGr"] * conbined_data["KitchenQual_"]
# Overall fireplace score
conbined_data["FireplaceScore"] = conbined_data["Fireplaces"] * conbined_data["FireplaceQu_"]
# Overall garage score
conbined_data["GarageScore"] = conbined_data["GarageArea"] * conbined_data["GarageQual_"]
# Overall pool score
conbined_data["PoolScore"] = conbined_data["PoolArea"] * conbined_data["PoolQC_"]
# Total number of bathrooms
conbined_data["TotalBath"] = conbined_data["BsmtFullBath"] + (0.5 * conbined_data["BsmtHalfBath"]) + \
conbined_data["FullBath"] + (0.5 * conbined_data["HalfBath"])
# Total porch area in square feet
conbined_data["TotalPorchSF"] = conbined_data["OpenPorchSF"] + conbined_data["EnclosedPorch"] +\
conbined_data["3SsnPorch"] + conbined_data["ScreenPorch"]
# Total SF for house (living, basement, porch, pool)
conbined_data["AllSF"] = conbined_data["GrLivArea"] + conbined_data["TotalBsmtSF"] + \
conbined_data["TotalPorchSF"] + conbined_data["WoodDeckSF"] + \
conbined_data["PoolArea"]
# House completed before sale or not
conbined_data["BoughtOffPlan"] = conbined_data.SaleCondition.replace(
{"Abnorml" : 0, "Alloca" : 0, "AdjLand" : 0, "Family" : 0, "Normal" : 0, "Partial" : 1})
In [75]:
# 4. Polynomials of the top n most important features
train_data_new = conbined_data.iloc[:train_length, :]
# SalePrice for these rows is taken from train_data below
train_data_new.head()
Out[75]:
In [76]:
from xgboost import XGBRegressor
train_Y = train_data['SalePrice']
train_X = train_data_new.select_dtypes(exclude=['object']).drop(['Id'], axis=1)
xgb_regressor = XGBRegressor(seed=10)
xgb_regressor.fit(train_X, train_Y)
feature_importances = pd.Series(xgb_regressor.feature_importances_, train_X.columns.values)
feature_importances = feature_importances.sort_values(ascending=False)
# feature_importances= feature_importances.head(40)
feature_importances.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
Out[76]:
In [77]:
top_n = 15
poly_columns = feature_importances.index.values[:top_n]
for column in poly_columns:
    conbined_data[column + '-s2'] = conbined_data[column] ** 2
    conbined_data[column + '-s3'] = conbined_data[column] ** 3
    conbined_data[column + '-sq'] = np.sqrt(conbined_data[column])
In [78]:
conbined_data.head()
Out[78]:
Feature Scale / Skew
In [79]:
str_columns = conbined_data.select_dtypes(include=['object']).columns.values
num_columns = conbined_data.select_dtypes(exclude=['object']).columns.values[1:]
In [80]:
num_columns
Out[80]:
In [81]:
scater_skew_num_columns = num_columns.tolist()
print(len(scater_skew_num_columns))
for column in num_columns:
    # boolean features are neither scaled nor de-skewed
    if set(conbined_data[column]) == {0, 1}:
        scater_skew_num_columns.remove(column)
print(len(scater_skew_num_columns))
In [82]:
# Scale each feature by its 95th percentile, or by its max when the 95th percentile is 0
t = conbined_data[scater_skew_num_columns].quantile(.95)
use_max_scater = t[t == 0].index
use_95_scater = t[t != 0].index
conbined_data[use_max_scater] = conbined_data[use_max_scater] / conbined_data[use_max_scater].max()
conbined_data[use_95_scater] = conbined_data[use_95_scater] / conbined_data[use_95_scater].quantile(.95)
In [83]:
# Transform the skewed numeric features by taking log(feature + 1).
# This will make the features more normal.
from scipy.stats import skew
skewed = conbined_data[scater_skew_num_columns].apply(lambda x: skew(x.astype(float)))
skewed = skewed[skewed > 0.75]
skewed = skewed.index
skewed = skewed.drop(['NeighborPrice','NeighborPrice-s2','NeighborPrice-s3'])
print('skewed features:', skewed.shape[0], 'of', conbined_data.shape[1], 'total features')
conbined_data[skewed] = np.log1p(conbined_data[skewed])
In [84]:
price_feature = ["MonthSaledMeanPrice","MSSubClassMeanPrice","NeighborPrice","NeighborPrice-s2","NeighborPrice-s3","NeighborPrice-sq"]
conbined_data[price_feature] = np.log1p(conbined_data[price_feature])
In [85]:
conbined_data.shape
Out[85]:
Dummy Encoding
In [86]:
str_columns
Out[86]:
In [87]:
dummies_data = pd.get_dummies(conbined_data[str_columns])
conbined_data[dummies_data.columns] = dummies_data[dummies_data.columns]
conbined_data.drop(str_columns, axis=1, inplace=True)
In [88]:
conbined_data.shape
Out[88]:
Delete some features to prevent overfitting.
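The hard-coded list in the next cell was curated by hand; here is a minimal sketch of how such train-only dummy columns could be found programmatically (assuming the dummies_data and train_length defined above):
# Dummy columns that occur in the training rows but never in the test rows
train_part = conbined_data.iloc[:train_length]
test_part = conbined_data.iloc[train_length:]
only_in_train = [c for c in dummies_data.columns
                 if train_part[c].sum() > 0 and test_part[c].sum() == 0]
print(only_in_train)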
In [89]:
# These onehot columns are missing in the test data, so drop them from the
# training data or we might overfit on them.
drop_cols = [
"Exterior1st_ImStucc", "Exterior1st_Stone",
"Exterior2nd_Other","HouseStyle_2.5Fin",
"RoofMatl_Membran", "RoofMatl_Metal", "RoofMatl_Roll",
"Condition2_RRAe", "Condition2_RRAn", "Condition2_RRNn",
"Heating_Floor", "Heating_OthW",
"Electrical_Mix",
"MiscFeature_TenC",
"GarageQual_Ex", "PoolQC_Fa"
]
conbined_data.drop(drop_cols, axis=1, inplace=True)
In [90]:
conbined_data.shape
Out[90]:
In [91]:
train_X = conbined_data.iloc[:train_length, 1:]
train_Y = train_data['SalePrice']
train_Id = conbined_data.iloc[:train_length, 0]
test_X = conbined_data.iloc[train_length:, 1:]
test_Id = conbined_data.iloc[train_length:, 0]
print("train_X : " + str(train_X.shape))
print("test_X : " + str(test_X.shape))
print("train_Y : " + str(train_Y.shape))
In [92]:
# Sale price distribution
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(10,5))
axis1.hist(train_Y)
# log(1 + price) makes the skewed target more normally distributed
train_Y = np.log1p(train_Y)
axis2.hist(train_Y)
Out[92]:
In [93]:
save_train = train_X.copy()
save_test = test_X.copy()
save_train['Id'] = train_Id
save_train['SalePrice'] = train_Y
save_test['Id'] = test_Id
display(save_train.head())
display(save_test.head())
save_train.to_csv("data/new_train.csv", index=False)
save_test.to_csv("data/new_test.csv", index=False)
print('Done.')
In [ ]: