In [1]:
import os
import numpy as np
import pandas as pd
filepath = '/Users/mac/Desktop/Kaggle_datasets/House_Price/'
filename1 = 'train.csv'
filename2 = 'test.csv'
dfTrain = pd.read_csv(os.path.join(filepath+filename1))
dfTest = pd.read_csv(os.path.join(filepath+filename2))
In [2]:
dfTrain.head()
Out[2]:
Id
MSSubClass
MSZoning
LotFrontage
LotArea
Street
Alley
LotShape
LandContour
Utilities
...
PoolArea
PoolQC
Fence
MiscFeature
MiscVal
MoSold
YrSold
SaleType
SaleCondition
SalePrice
0
1
60
RL
65.0
8450
Pave
NaN
Reg
Lvl
AllPub
...
0
NaN
NaN
NaN
0
2
2008
WD
Normal
208500
1
2
20
RL
80.0
9600
Pave
NaN
Reg
Lvl
AllPub
...
0
NaN
NaN
NaN
0
5
2007
WD
Normal
181500
2
3
60
RL
68.0
11250
Pave
NaN
IR1
Lvl
AllPub
...
0
NaN
NaN
NaN
0
9
2008
WD
Normal
223500
3
4
70
RL
60.0
9550
Pave
NaN
IR1
Lvl
AllPub
...
0
NaN
NaN
NaN
0
2
2006
WD
Abnorml
140000
4
5
60
RL
84.0
14260
Pave
NaN
IR1
Lvl
AllPub
...
0
NaN
NaN
NaN
0
12
2008
WD
Normal
250000
5 rows × 81 columns
In [3]:
dfTrain.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id 1460 non-null int64
MSSubClass 1460 non-null int64
MSZoning 1460 non-null object
LotFrontage 1201 non-null float64
LotArea 1460 non-null int64
Street 1460 non-null object
Alley 91 non-null object
LotShape 1460 non-null object
LandContour 1460 non-null object
Utilities 1460 non-null object
LotConfig 1460 non-null object
LandSlope 1460 non-null object
Neighborhood 1460 non-null object
Condition1 1460 non-null object
Condition2 1460 non-null object
BldgType 1460 non-null object
HouseStyle 1460 non-null object
OverallQual 1460 non-null int64
OverallCond 1460 non-null int64
YearBuilt 1460 non-null int64
YearRemodAdd 1460 non-null int64
RoofStyle 1460 non-null object
RoofMatl 1460 non-null object
Exterior1st 1460 non-null object
Exterior2nd 1460 non-null object
MasVnrType 1452 non-null object
MasVnrArea 1452 non-null float64
ExterQual 1460 non-null object
ExterCond 1460 non-null object
Foundation 1460 non-null object
BsmtQual 1423 non-null object
BsmtCond 1423 non-null object
BsmtExposure 1422 non-null object
BsmtFinType1 1423 non-null object
BsmtFinSF1 1460 non-null int64
BsmtFinType2 1422 non-null object
BsmtFinSF2 1460 non-null int64
BsmtUnfSF 1460 non-null int64
TotalBsmtSF 1460 non-null int64
Heating 1460 non-null object
HeatingQC 1460 non-null object
CentralAir 1460 non-null object
Electrical 1459 non-null object
1stFlrSF 1460 non-null int64
2ndFlrSF 1460 non-null int64
LowQualFinSF 1460 non-null int64
GrLivArea 1460 non-null int64
BsmtFullBath 1460 non-null int64
BsmtHalfBath 1460 non-null int64
FullBath 1460 non-null int64
HalfBath 1460 non-null int64
BedroomAbvGr 1460 non-null int64
KitchenAbvGr 1460 non-null int64
KitchenQual 1460 non-null object
TotRmsAbvGrd 1460 non-null int64
Functional 1460 non-null object
Fireplaces 1460 non-null int64
FireplaceQu 770 non-null object
GarageType 1379 non-null object
GarageYrBlt 1379 non-null float64
GarageFinish 1379 non-null object
GarageCars 1460 non-null int64
GarageArea 1460 non-null int64
GarageQual 1379 non-null object
GarageCond 1379 non-null object
PavedDrive 1460 non-null object
WoodDeckSF 1460 non-null int64
OpenPorchSF 1460 non-null int64
EnclosedPorch 1460 non-null int64
3SsnPorch 1460 non-null int64
ScreenPorch 1460 non-null int64
PoolArea 1460 non-null int64
PoolQC 7 non-null object
Fence 281 non-null object
MiscFeature 54 non-null object
MiscVal 1460 non-null int64
MoSold 1460 non-null int64
YrSold 1460 non-null int64
SaleType 1460 non-null object
SaleCondition 1460 non-null object
SalePrice 1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
In [4]:
dfTest.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
Id 1459 non-null int64
MSSubClass 1459 non-null int64
MSZoning 1455 non-null object
LotFrontage 1232 non-null float64
LotArea 1459 non-null int64
Street 1459 non-null object
Alley 107 non-null object
LotShape 1459 non-null object
LandContour 1459 non-null object
Utilities 1457 non-null object
LotConfig 1459 non-null object
LandSlope 1459 non-null object
Neighborhood 1459 non-null object
Condition1 1459 non-null object
Condition2 1459 non-null object
BldgType 1459 non-null object
HouseStyle 1459 non-null object
OverallQual 1459 non-null int64
OverallCond 1459 non-null int64
YearBuilt 1459 non-null int64
YearRemodAdd 1459 non-null int64
RoofStyle 1459 non-null object
RoofMatl 1459 non-null object
Exterior1st 1458 non-null object
Exterior2nd 1458 non-null object
MasVnrType 1443 non-null object
MasVnrArea 1444 non-null float64
ExterQual 1459 non-null object
ExterCond 1459 non-null object
Foundation 1459 non-null object
BsmtQual 1415 non-null object
BsmtCond 1414 non-null object
BsmtExposure 1415 non-null object
BsmtFinType1 1417 non-null object
BsmtFinSF1 1458 non-null float64
BsmtFinType2 1417 non-null object
BsmtFinSF2 1458 non-null float64
BsmtUnfSF 1458 non-null float64
TotalBsmtSF 1458 non-null float64
Heating 1459 non-null object
HeatingQC 1459 non-null object
CentralAir 1459 non-null object
Electrical 1459 non-null object
1stFlrSF 1459 non-null int64
2ndFlrSF 1459 non-null int64
LowQualFinSF 1459 non-null int64
GrLivArea 1459 non-null int64
BsmtFullBath 1457 non-null float64
BsmtHalfBath 1457 non-null float64
FullBath 1459 non-null int64
HalfBath 1459 non-null int64
BedroomAbvGr 1459 non-null int64
KitchenAbvGr 1459 non-null int64
KitchenQual 1458 non-null object
TotRmsAbvGrd 1459 non-null int64
Functional 1457 non-null object
Fireplaces 1459 non-null int64
FireplaceQu 729 non-null object
GarageType 1383 non-null object
GarageYrBlt 1381 non-null float64
GarageFinish 1381 non-null object
GarageCars 1458 non-null float64
GarageArea 1458 non-null float64
GarageQual 1381 non-null object
GarageCond 1381 non-null object
PavedDrive 1459 non-null object
WoodDeckSF 1459 non-null int64
OpenPorchSF 1459 non-null int64
EnclosedPorch 1459 non-null int64
3SsnPorch 1459 non-null int64
ScreenPorch 1459 non-null int64
PoolArea 1459 non-null int64
PoolQC 3 non-null object
Fence 290 non-null object
MiscFeature 51 non-null object
MiscVal 1459 non-null int64
MoSold 1459 non-null int64
YrSold 1459 non-null int64
SaleType 1458 non-null object
SaleCondition 1459 non-null object
dtypes: float64(11), int64(26), object(43)
memory usage: 912.0+ KB
In [5]:
dfTrain.isnull().sum()
Out[5]:
Id 0
MSSubClass 0
MSZoning 0
LotFrontage 259
LotArea 0
Street 0
Alley 1369
LotShape 0
LandContour 0
Utilities 0
LotConfig 0
LandSlope 0
Neighborhood 0
Condition1 0
Condition2 0
BldgType 0
HouseStyle 0
OverallQual 0
OverallCond 0
YearBuilt 0
YearRemodAdd 0
RoofStyle 0
RoofMatl 0
Exterior1st 0
Exterior2nd 0
MasVnrType 8
MasVnrArea 8
ExterQual 0
ExterCond 0
Foundation 0
...
BedroomAbvGr 0
KitchenAbvGr 0
KitchenQual 0
TotRmsAbvGrd 0
Functional 0
Fireplaces 0
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageCars 0
GarageArea 0
GarageQual 81
GarageCond 81
PavedDrive 0
WoodDeckSF 0
OpenPorchSF 0
EnclosedPorch 0
3SsnPorch 0
ScreenPorch 0
PoolArea 0
PoolQC 1453
Fence 1179
MiscFeature 1406
MiscVal 0
MoSold 0
YrSold 0
SaleType 0
SaleCondition 0
SalePrice 0
dtype: int64
In [6]:
dfTest.isnull().sum()
Out[6]:
Id 0
MSSubClass 0
MSZoning 4
LotFrontage 227
LotArea 0
Street 0
Alley 1352
LotShape 0
LandContour 0
Utilities 2
LotConfig 0
LandSlope 0
Neighborhood 0
Condition1 0
Condition2 0
BldgType 0
HouseStyle 0
OverallQual 0
OverallCond 0
YearBuilt 0
YearRemodAdd 0
RoofStyle 0
RoofMatl 0
Exterior1st 1
Exterior2nd 1
MasVnrType 16
MasVnrArea 15
ExterQual 0
ExterCond 0
Foundation 0
...
HalfBath 0
BedroomAbvGr 0
KitchenAbvGr 0
KitchenQual 1
TotRmsAbvGrd 0
Functional 2
Fireplaces 0
FireplaceQu 730
GarageType 76
GarageYrBlt 78
GarageFinish 78
GarageCars 1
GarageArea 1
GarageQual 78
GarageCond 78
PavedDrive 0
WoodDeckSF 0
OpenPorchSF 0
EnclosedPorch 0
3SsnPorch 0
ScreenPorch 0
PoolArea 0
PoolQC 1456
Fence 1169
MiscFeature 1408
MiscVal 0
MoSold 0
YrSold 0
SaleType 1
SaleCondition 0
dtype: int64
In [7]:
dfTrain.columns
Out[7]:
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition', 'SalePrice'],
dtype='object')
In [8]:
dfTest.columns
Out[8]:
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition'],
dtype='object')
In [9]:
#由於資料缺失多太難補了,因此決定把NA破千的全部捨棄
#其他的NA若是數字則取平均,若是str則歸成other類別使用onehot encoding技巧無視XD
#拿掉Alley 1369,PoolQC 1453,Fence 1179,MiscFeature 1406
dfTrain_feature = dfTrain[['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition']]
train_label = np.array(dfTrain['SalePrice'])
dfTest_feature = dfTest[['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition']]
In [10]:
dfTrain_feature.fillna(dfTrain_feature.mean(), inplace=True) #藉由填上平均值,一次解決所有數值類別缺失的問題!!
//anaconda/lib/python3.5/site-packages/pandas/core/generic.py:3295: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self._update_inplace(new_data)
Out[10]:
MSSubClass
MSZoning
LotFrontage
LotArea
Street
LotShape
LandContour
Utilities
LotConfig
LandSlope
...
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
MiscVal
MoSold
YrSold
SaleType
SaleCondition
0
60
RL
65.000000
8450
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
61
0
0
0
0
0
2
2008
WD
Normal
1
20
RL
80.000000
9600
Pave
Reg
Lvl
AllPub
FR2
Gtl
...
0
0
0
0
0
0
5
2007
WD
Normal
2
60
RL
68.000000
11250
Pave
IR1
Lvl
AllPub
Inside
Gtl
...
42
0
0
0
0
0
9
2008
WD
Normal
3
70
RL
60.000000
9550
Pave
IR1
Lvl
AllPub
Corner
Gtl
...
35
272
0
0
0
0
2
2006
WD
Abnorml
4
60
RL
84.000000
14260
Pave
IR1
Lvl
AllPub
FR2
Gtl
...
84
0
0
0
0
0
12
2008
WD
Normal
5
50
RL
85.000000
14115
Pave
IR1
Lvl
AllPub
Inside
Gtl
...
30
0
320
0
0
700
10
2009
WD
Normal
6
20
RL
75.000000
10084
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
57
0
0
0
0
0
8
2007
WD
Normal
7
60
RL
70.049958
10382
Pave
IR1
Lvl
AllPub
Corner
Gtl
...
204
228
0
0
0
350
11
2009
WD
Normal
8
50
RM
51.000000
6120
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
0
205
0
0
0
0
4
2008
WD
Abnorml
9
190
RL
50.000000
7420
Pave
Reg
Lvl
AllPub
Corner
Gtl
...
4
0
0
0
0
0
1
2008
WD
Normal
10
20
RL
70.000000
11200
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
0
0
0
0
0
0
2
2008
WD
Normal
11
60
RL
85.000000
11924
Pave
IR1
Lvl
AllPub
Inside
Gtl
...
21
0
0
0
0
0
7
2006
New
Partial
12
20
RL
70.049958
12968
Pave
IR2
Lvl
AllPub
Inside
Gtl
...
0
0
0
176
0
0
9
2008
WD
Normal
13
20
RL
91.000000
10652
Pave
IR1
Lvl
AllPub
Inside
Gtl
...
33
0
0
0
0
0
8
2007
New
Partial
14
20
RL
70.049958
10920
Pave
IR1
Lvl
AllPub
Corner
Gtl
...
213
176
0
0
0
0
5
2008
WD
Normal
15
45
RM
51.000000
6120
Pave
Reg
Lvl
AllPub
Corner
Gtl
...
112
0
0
0
0
0
7
2007
WD
Normal
16
20
RL
70.049958
11241
Pave
IR1
Lvl
AllPub
CulDSac
Gtl
...
0
0
0
0
0
700
3
2010
WD
Normal
17
90
RL
72.000000
10791
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
0
0
0
0
0
500
10
2006
WD
Normal
18
20
RL
66.000000
13695
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
102
0
0
0
0
0
6
2008
WD
Normal
19
20
RL
70.000000
7560
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
0
0
0
0
0
0
5
2009
COD
Abnorml
20
60
RL
101.000000
14215
Pave
IR1
Lvl
AllPub
Corner
Gtl
...
154
0
0
0
0
0
11
2006
New
Partial
21
45
RM
57.000000
7449
Pave
Reg
Bnk
AllPub
Inside
Gtl
...
0
205
0
0
0
0
6
2007
WD
Normal
22
20
RL
75.000000
9742
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
159
0
0
0
0
0
9
2008
WD
Normal
23
120
RM
44.000000
4224
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
110
0
0
0
0
0
6
2007
WD
Normal
24
20
RL
70.049958
8246
Pave
IR1
Lvl
AllPub
Inside
Gtl
...
90
0
0
0
0
0
5
2010
WD
Normal
25
20
RL
110.000000
14230
Pave
Reg
Lvl
AllPub
Corner
Gtl
...
56
0
0
0
0
0
7
2009
WD
Normal
26
20
RL
60.000000
7200
Pave
Reg
Lvl
AllPub
Corner
Gtl
...
32
0
0
0
0
0
5
2010
WD
Normal
27
20
RL
98.000000
11478
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
50
0
0
0
0
0
5
2010
WD
Normal
28
20
RL
47.000000
16321
Pave
IR1
Lvl
AllPub
CulDSac
Gtl
...
258
0
0
0
0
0
12
2006
WD
Normal
29
30
RM
60.000000
6324
Pave
IR1
Lvl
AllPub
Inside
Gtl
...
0
87
0
0
0
0
5
2008
WD
Normal
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
1430
60
RL
60.000000
21930
Pave
IR3
Lvl
AllPub
Inside
Gtl
...
40
0
0
0
0
0
7
2006
WD
Normal
1431
120
RL
70.049958
4928
Pave
IR1
Lvl
AllPub
Inside
Gtl
...
60
0
0
0
0
0
10
2009
WD
Normal
1432
30
RL
60.000000
10800
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
0
0
0
0
0
0
8
2007
WD
Normal
1433
60
RL
93.000000
10261
Pave
IR1
Lvl
AllPub
Inside
Gtl
...
0
0
0
0
0
0
5
2008
WD
Normal
1434
20
RL
80.000000
17400
Pave
Reg
Low
AllPub
Inside
Mod
...
41
0
0
0
0
0
5
2006
WD
Normal
1435
20
RL
80.000000
8400
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
36
0
0
0
0
0
7
2008
COD
Abnorml
1436
20
RL
60.000000
9000
Pave
Reg
Lvl
AllPub
FR2
Gtl
...
0
0
0
0
0
0
5
2007
WD
Normal
1437
20
RL
96.000000
12444
Pave
Reg
Lvl
AllPub
FR2
Gtl
...
66
0
304
0
0
0
11
2008
New
Partial
1438
20
RM
90.000000
7407
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
158
158
0
0
0
0
4
2010
WD
Normal
1439
60
RL
80.000000
11584
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
88
216
0
0
0
0
11
2007
WD
Normal
1440
70
RL
79.000000
11526
Pave
IR1
Bnk
AllPub
Inside
Mod
...
0
0
0
0
0
0
9
2008
WD
Normal
1441
120
RM
70.049958
4426
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
0
0
0
0
0
0
5
2008
WD
Normal
1442
60
FV
85.000000
11003
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
52
0
0
0
0
0
4
2009
WD
Normal
1443
30
RL
70.049958
8854
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
98
0
0
40
0
0
5
2009
WD
Normal
1444
20
RL
63.000000
8500
Pave
Reg
Lvl
AllPub
FR2
Gtl
...
60
0
0
0
0
0
11
2007
WD
Normal
1445
85
RL
70.000000
8400
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
0
252
0
0
0
0
5
2007
WD
Normal
1446
20
RL
70.049958
26142
Pave
IR1
Lvl
AllPub
CulDSac
Gtl
...
39
0
0
0
0
0
4
2010
WD
Normal
1447
60
RL
80.000000
10000
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
65
0
0
0
0
0
12
2007
WD
Normal
1448
50
RL
70.000000
11767
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
24
0
0
0
0
0
5
2007
WD
Normal
1449
180
RM
21.000000
1533
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
0
0
0
0
0
0
8
2006
WD
Abnorml
1450
90
RL
60.000000
9000
Pave
Reg
Lvl
AllPub
FR2
Gtl
...
45
0
0
0
0
0
9
2009
WD
Normal
1451
20
RL
78.000000
9262
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
36
0
0
0
0
0
5
2009
New
Partial
1452
180
RM
35.000000
3675
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
28
0
0
0
0
0
5
2006
WD
Normal
1453
20
RL
90.000000
17217
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
56
0
0
0
0
0
7
2006
WD
Abnorml
1454
20
FV
62.000000
7500
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
113
0
0
0
0
0
10
2009
WD
Normal
1455
60
RL
62.000000
7917
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
40
0
0
0
0
0
8
2007
WD
Normal
1456
20
RL
85.000000
13175
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
0
0
0
0
0
0
2
2010
WD
Normal
1457
70
RL
66.000000
9042
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
60
0
0
0
0
2500
5
2010
WD
Normal
1458
20
RL
68.000000
9717
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
0
112
0
0
0
0
4
2010
WD
Normal
1459
20
RL
75.000000
9937
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
68
0
0
0
0
0
6
2008
WD
Normal
1460 rows × 75 columns
In [11]:
dfTrain_feature.mode()
Out[11]:
MSSubClass
MSZoning
LotFrontage
LotArea
Street
LotShape
LandContour
Utilities
LotConfig
LandSlope
...
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
MiscVal
MoSold
YrSold
SaleType
SaleCondition
0
20
RL
70.049958
7200
Pave
Reg
Lvl
AllPub
Inside
Gtl
...
0
0
0
0
0
0
6
2009
WD
Normal
1 rows × 75 columns
In [12]:
#dfTrain_feature.fillna(dfTrain_feature.mode(), inplace=True) #無效使用
In [13]:
dfTrain_feature.fillna('Others', inplace = True)
//anaconda/lib/python3.5/site-packages/pandas/core/frame.py:2842: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
downcast=downcast, **kwargs)
In [14]:
dfTrain_feature.isnull().sum() #把所有缺類別都當成一類...
Out[14]:
MSSubClass 0
MSZoning 0
LotFrontage 0
LotArea 0
Street 0
LotShape 0
LandContour 0
Utilities 0
LotConfig 0
LandSlope 0
Neighborhood 0
Condition1 0
Condition2 0
BldgType 0
HouseStyle 0
OverallQual 0
OverallCond 0
YearBuilt 0
YearRemodAdd 0
RoofStyle 0
RoofMatl 0
Exterior1st 0
Exterior2nd 0
MasVnrType 0
MasVnrArea 0
ExterQual 0
ExterCond 0
Foundation 0
BsmtQual 0
BsmtCond 0
..
BsmtFullBath 0
BsmtHalfBath 0
FullBath 0
HalfBath 0
BedroomAbvGr 0
KitchenAbvGr 0
KitchenQual 0
TotRmsAbvGrd 0
Functional 0
Fireplaces 0
FireplaceQu 0
GarageType 0
GarageYrBlt 0
GarageFinish 0
GarageCars 0
GarageArea 0
GarageQual 0
GarageCond 0
PavedDrive 0
WoodDeckSF 0
OpenPorchSF 0
EnclosedPorch 0
3SsnPorch 0
ScreenPorch 0
PoolArea 0
MiscVal 0
MoSold 0
YrSold 0
SaleType 0
SaleCondition 0
dtype: int64
In [15]:
def fillingdata(df):
df.fillna(df.mean(), inplace = True) #數值缺失用平均處理
df.fillna('Others', inplace = True) #在處理類別缺失,使用Others
return df
#dfTrain = fillingdata(dfTrain)
dfTest_feature = fillingdata(dfTest_feature)
//anaconda/lib/python3.5/site-packages/pandas/core/generic.py:3295: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self._update_inplace(new_data)
//anaconda/lib/python3.5/site-packages/pandas/core/frame.py:2842: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
downcast=downcast, **kwargs)
In [16]:
dfTest_feature.isnull().sum()
Out[16]:
MSSubClass 0
MSZoning 0
LotFrontage 0
LotArea 0
Street 0
LotShape 0
LandContour 0
Utilities 0
LotConfig 0
LandSlope 0
Neighborhood 0
Condition1 0
Condition2 0
BldgType 0
HouseStyle 0
OverallQual 0
OverallCond 0
YearBuilt 0
YearRemodAdd 0
RoofStyle 0
RoofMatl 0
Exterior1st 0
Exterior2nd 0
MasVnrType 0
MasVnrArea 0
ExterQual 0
ExterCond 0
Foundation 0
BsmtQual 0
BsmtCond 0
..
BsmtFullBath 0
BsmtHalfBath 0
FullBath 0
HalfBath 0
BedroomAbvGr 0
KitchenAbvGr 0
KitchenQual 0
TotRmsAbvGrd 0
Functional 0
Fireplaces 0
FireplaceQu 0
GarageType 0
GarageYrBlt 0
GarageFinish 0
GarageCars 0
GarageArea 0
GarageQual 0
GarageCond 0
PavedDrive 0
WoodDeckSF 0
OpenPorchSF 0
EnclosedPorch 0
3SsnPorch 0
ScreenPorch 0
PoolArea 0
MiscVal 0
MoSold 0
YrSold 0
SaleType 0
SaleCondition 0
dtype: int64
In [17]:
#填補完後,該來onehot_encoding了
dfFull_feature = dfTrain_feature.append(dfTest_feature)
dfFull_feature = pd.get_dummies(dfFull_feature) #赫然發現get_dummies有直接把NA開成一類的功能...
dfTrain_feature = dfFull_feature.iloc[0:1460,:]
dfTest_feature = dfFull_feature.iloc[1460:,:]
In [18]:
dfTrain_feature.shape
Out[18]:
(1460, 294)
In [19]:
dfTrain_feature.head()
Out[19]:
MSSubClass
LotFrontage
LotArea
OverallQual
OverallCond
YearBuilt
YearRemodAdd
MasVnrArea
BsmtFinSF1
BsmtFinSF2
...
SaleType_New
SaleType_Oth
SaleType_Others
SaleType_WD
SaleCondition_Abnorml
SaleCondition_AdjLand
SaleCondition_Alloca
SaleCondition_Family
SaleCondition_Normal
SaleCondition_Partial
0
60
65.0
8450
7
5
2003
2003
196.0
706.0
0.0
...
0
0
0
1
0
0
0
0
1
0
1
20
80.0
9600
6
8
1976
1976
0.0
978.0
0.0
...
0
0
0
1
0
0
0
0
1
0
2
60
68.0
11250
7
5
2001
2002
162.0
486.0
0.0
...
0
0
0
1
0
0
0
0
1
0
3
70
60.0
9550
7
5
1915
1970
0.0
216.0
0.0
...
0
0
0
1
1
0
0
0
0
0
4
60
84.0
14260
8
5
2000
2000
350.0
655.0
0.0
...
0
0
0
1
0
0
0
0
1
0
5 rows × 294 columns
In [20]:
dfTest_feature.shape
Out[20]:
(1459, 294)
In [21]:
train_feature = np.array(dfTrain_feature)
#train_label 已經存在
test_feature = np.array(dfTest_feature)
In [22]:
print(train_feature[0])
print(train_label[0])
print(test_feature[0])
[ 6.00000000e+01 6.50000000e+01 8.45000000e+03 7.00000000e+00
5.00000000e+00 2.00300000e+03 2.00300000e+03 1.96000000e+02
7.06000000e+02 0.00000000e+00 1.50000000e+02 8.56000000e+02
8.56000000e+02 8.54000000e+02 0.00000000e+00 1.71000000e+03
1.00000000e+00 0.00000000e+00 2.00000000e+00 1.00000000e+00
3.00000000e+00 1.00000000e+00 8.00000000e+00 0.00000000e+00
2.00300000e+03 2.00000000e+00 5.48000000e+02 0.00000000e+00
6.10000000e+01 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 2.00000000e+00 2.00800000e+03
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00]
208500
[ 2.00000000e+01 8.00000000e+01 1.16220000e+04 5.00000000e+00
6.00000000e+00 1.96100000e+03 1.96100000e+03 0.00000000e+00
4.68000000e+02 1.44000000e+02 2.70000000e+02 8.82000000e+02
8.96000000e+02 0.00000000e+00 0.00000000e+00 8.96000000e+02
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
2.00000000e+00 1.00000000e+00 5.00000000e+00 0.00000000e+00
1.96100000e+03 1.00000000e+00 7.30000000e+02 1.40000000e+02
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.20000000e+02
0.00000000e+00 0.00000000e+00 6.00000000e+00 2.01000000e+03
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00]
In [23]:
dfTrain_feature.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Columns: 294 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(11), int64(25), uint8(258)
memory usage: 789.9 KB
In [24]:
'''
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
train_feature = minmax_scale.fit_transform(train_feature)
test_feature = minmax_scale.fit_transform(test_feature)
'''
#train_label = np.log(train_label) #先把label取log
Out[24]:
'\nfrom sklearn import preprocessing\nminmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))\n\ntrain_feature = minmax_scale.fit_transform(train_feature)\ntest_feature = minmax_scale.fit_transform(test_feature)\n'
In [25]:
print(train_feature[0])
print(train_label[0])
print(test_feature[0])
[ 6.00000000e+01 6.50000000e+01 8.45000000e+03 7.00000000e+00
5.00000000e+00 2.00300000e+03 2.00300000e+03 1.96000000e+02
7.06000000e+02 0.00000000e+00 1.50000000e+02 8.56000000e+02
8.56000000e+02 8.54000000e+02 0.00000000e+00 1.71000000e+03
1.00000000e+00 0.00000000e+00 2.00000000e+00 1.00000000e+00
3.00000000e+00 1.00000000e+00 8.00000000e+00 0.00000000e+00
2.00300000e+03 2.00000000e+00 5.48000000e+02 0.00000000e+00
6.10000000e+01 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 2.00000000e+00 2.00800000e+03
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00]
208500
[ 2.00000000e+01 8.00000000e+01 1.16220000e+04 5.00000000e+00
6.00000000e+00 1.96100000e+03 1.96100000e+03 0.00000000e+00
4.68000000e+02 1.44000000e+02 2.70000000e+02 8.82000000e+02
8.96000000e+02 0.00000000e+00 0.00000000e+00 8.96000000e+02
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
2.00000000e+00 1.00000000e+00 5.00000000e+00 0.00000000e+00
1.96100000e+03 1.00000000e+00 7.30000000e+02 1.40000000e+02
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.20000000e+02
0.00000000e+00 0.00000000e+00 6.00000000e+00 2.01000000e+03
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 0.00000000e+00]
In [26]:
train_feature.shape
Out[26]:
(1460, 294)
In [58]:
import matplotlib.pyplot as plt
def show_train_history(train_history,train,validation):
plt.plot(train_history.history[train])
plt.plot(train_history.history[validation])
plt.title('Train History')
plt.ylabel(train)
plt.xlabel('Epoch')
plt.legend(['train', 'validation'], loc='best')
plt.show()
######################### 建立模型
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
model = Sequential() #一層一層到底,按順序
#輸入層(隱藏層1)
model.add(Dense(units=2000, input_dim=294,
kernel_initializer='uniform',
activation='relu'))
model.add(Dropout(0.5))
#隱藏層2,不用寫input_dim,因為就是前一層的units
model.add(Dense(units=1000,
kernel_initializer='uniform',
activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=1000,
kernel_initializer='uniform',
activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=1000,
kernel_initializer='uniform',
activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=500,
kernel_initializer='uniform',
activation='relu'))
model.add(Dropout(0.5))
#輸出層
model.add(Dense(units=1,
kernel_initializer='uniform',
activation=None))
print(model.summary()) #可以清楚看到model還有參數數量
######################### 訓練模型
#選擇loss度量,optimizer學習路徑速度、
model.compile(loss='mean_squared_logarithmic_error',
optimizer='adam', metrics=['accuracy'])
#開始train,並且記錄情況(設有val項以免overfitting)
train_history = model.fit(x=train_feature, y=train_label, #上面多分割一步在keras是內建的
validation_split=0.2, epochs=50, batch_size=50, verbose=2) #verbose=2表示顯示訓練過程
######################### 訓練過程視覺化
show_train_history(train_history,'acc','val_acc')
show_train_history(train_history,'loss','val_loss')
#儲存訓練結果
model.save_weights("Savemodels/HousingPrice(Kaggles)_MLP.h5")
print('model saved to disk')
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_49 (Dense) (None, 2000) 590000
_________________________________________________________________
dropout_38 (Dropout) (None, 2000) 0
_________________________________________________________________
dense_50 (Dense) (None, 1000) 2001000
_________________________________________________________________
dropout_39 (Dropout) (None, 1000) 0
_________________________________________________________________
dense_51 (Dense) (None, 1000) 1001000
_________________________________________________________________
dropout_40 (Dropout) (None, 1000) 0
_________________________________________________________________
dense_52 (Dense) (None, 1000) 1001000
_________________________________________________________________
dropout_41 (Dropout) (None, 1000) 0
_________________________________________________________________
dense_53 (Dense) (None, 500) 500500
_________________________________________________________________
dropout_42 (Dropout) (None, 500) 0
_________________________________________________________________
dense_54 (Dense) (None, 1) 501
=================================================================
Total params: 5,094,001
Trainable params: 5,094,001
Non-trainable params: 0
_________________________________________________________________
None
Train on 1168 samples, validate on 292 samples
Epoch 1/50
7s - loss: 6.4789 - acc: 0.0000e+00 - val_loss: 0.9571 - val_acc: 0.0000e+00
Epoch 2/50
6s - loss: 0.3803 - acc: 0.0000e+00 - val_loss: 0.2187 - val_acc: 0.0000e+00
Epoch 3/50
6s - loss: 0.1794 - acc: 0.0000e+00 - val_loss: 0.1317 - val_acc: 0.0000e+00
Epoch 4/50
6s - loss: 0.1373 - acc: 0.0000e+00 - val_loss: 0.1055 - val_acc: 0.0000e+00
Epoch 5/50
6s - loss: 0.1095 - acc: 0.0000e+00 - val_loss: 0.0752 - val_acc: 0.0000e+00
Epoch 6/50
6s - loss: 0.0894 - acc: 0.0000e+00 - val_loss: 0.0719 - val_acc: 0.0000e+00
Epoch 7/50
7s - loss: 0.0734 - acc: 0.0000e+00 - val_loss: 0.0555 - val_acc: 0.0000e+00
Epoch 8/50
6s - loss: 0.0736 - acc: 0.0000e+00 - val_loss: 0.0564 - val_acc: 0.0000e+00
Epoch 9/50
6s - loss: 0.0706 - acc: 0.0000e+00 - val_loss: 0.0496 - val_acc: 0.0000e+00
Epoch 10/50
6s - loss: 0.0627 - acc: 0.0000e+00 - val_loss: 0.0507 - val_acc: 0.0000e+00
Epoch 11/50
6s - loss: 0.0641 - acc: 0.0000e+00 - val_loss: 0.0521 - val_acc: 0.0000e+00
Epoch 12/50
5s - loss: 0.0630 - acc: 0.0000e+00 - val_loss: 0.0618 - val_acc: 0.0000e+00
Epoch 13/50
5s - loss: 0.0623 - acc: 0.0000e+00 - val_loss: 0.0503 - val_acc: 0.0034
Epoch 14/50
5s - loss: 0.0597 - acc: 0.0000e+00 - val_loss: 0.0483 - val_acc: 0.0000e+00
Epoch 15/50
5s - loss: 0.0611 - acc: 0.0000e+00 - val_loss: 0.0491 - val_acc: 0.0000e+00
Epoch 16/50
5s - loss: 0.0627 - acc: 0.0000e+00 - val_loss: 0.0511 - val_acc: 0.0000e+00
Epoch 17/50
6s - loss: 0.0600 - acc: 0.0000e+00 - val_loss: 0.0490 - val_acc: 0.0000e+00
Epoch 18/50
6s - loss: 0.0599 - acc: 0.0000e+00 - val_loss: 0.0601 - val_acc: 0.0000e+00
Epoch 19/50
5s - loss: 0.0564 - acc: 0.0000e+00 - val_loss: 0.0453 - val_acc: 0.0000e+00
Epoch 20/50
6s - loss: 0.0603 - acc: 0.0000e+00 - val_loss: 0.0544 - val_acc: 0.0000e+00
Epoch 21/50
6s - loss: 0.0611 - acc: 0.0000e+00 - val_loss: 0.0490 - val_acc: 0.0000e+00
Epoch 22/50
5s - loss: 0.0568 - acc: 0.0000e+00 - val_loss: 0.0496 - val_acc: 0.0000e+00
Epoch 23/50
6s - loss: 0.0569 - acc: 0.0000e+00 - val_loss: 0.0450 - val_acc: 0.0000e+00
Epoch 24/50
5s - loss: 0.0557 - acc: 0.0000e+00 - val_loss: 0.0570 - val_acc: 0.0000e+00
Epoch 25/50
5s - loss: 0.0570 - acc: 0.0000e+00 - val_loss: 0.0461 - val_acc: 0.0000e+00
Epoch 26/50
6s - loss: 0.0581 - acc: 0.0000e+00 - val_loss: 0.0545 - val_acc: 0.0000e+00
Epoch 27/50
6s - loss: 0.0567 - acc: 0.0000e+00 - val_loss: 0.0474 - val_acc: 0.0000e+00
Epoch 28/50
6s - loss: 0.0573 - acc: 0.0000e+00 - val_loss: 0.0621 - val_acc: 0.0000e+00
Epoch 29/50
5s - loss: 0.0608 - acc: 0.0000e+00 - val_loss: 0.0493 - val_acc: 0.0000e+00
Epoch 30/50
5s - loss: 0.0569 - acc: 0.0000e+00 - val_loss: 0.0443 - val_acc: 0.0000e+00
Epoch 31/50
5s - loss: 0.0550 - acc: 0.0000e+00 - val_loss: 0.0490 - val_acc: 0.0000e+00
Epoch 32/50
6s - loss: 0.0525 - acc: 0.0000e+00 - val_loss: 0.0468 - val_acc: 0.0000e+00
Epoch 33/50
6s - loss: 0.0549 - acc: 0.0000e+00 - val_loss: 0.0462 - val_acc: 0.0000e+00
Epoch 34/50
6s - loss: 0.0550 - acc: 0.0000e+00 - val_loss: 0.0471 - val_acc: 0.0000e+00
Epoch 35/50
6s - loss: 0.0551 - acc: 0.0000e+00 - val_loss: 0.0510 - val_acc: 0.0000e+00
Epoch 36/50
5s - loss: 0.0532 - acc: 0.0000e+00 - val_loss: 0.0681 - val_acc: 0.0000e+00
Epoch 37/50
5s - loss: 0.0542 - acc: 0.0000e+00 - val_loss: 0.0442 - val_acc: 0.0000e+00
Epoch 38/50
5s - loss: 0.0537 - acc: 0.0000e+00 - val_loss: 0.0470 - val_acc: 0.0000e+00
Epoch 39/50
5s - loss: 0.0506 - acc: 0.0000e+00 - val_loss: 0.0473 - val_acc: 0.0000e+00
Epoch 40/50
5s - loss: 0.0528 - acc: 0.0000e+00 - val_loss: 0.0441 - val_acc: 0.0000e+00
Epoch 41/50
5s - loss: 0.0555 - acc: 0.0000e+00 - val_loss: 0.0437 - val_acc: 0.0000e+00
Epoch 42/50
5s - loss: 0.0532 - acc: 0.0000e+00 - val_loss: 0.0464 - val_acc: 0.0000e+00
Epoch 43/50
5s - loss: 0.0526 - acc: 0.0000e+00 - val_loss: 0.0560 - val_acc: 0.0000e+00
Epoch 44/50
6s - loss: 0.0533 - acc: 0.0000e+00 - val_loss: 0.0459 - val_acc: 0.0000e+00
Epoch 45/50
6s - loss: 0.0523 - acc: 0.0000e+00 - val_loss: 0.0476 - val_acc: 0.0000e+00
Epoch 46/50
5s - loss: 0.0524 - acc: 0.0000e+00 - val_loss: 0.0497 - val_acc: 0.0000e+00
Epoch 47/50
6s - loss: 0.0517 - acc: 0.0000e+00 - val_loss: 0.0482 - val_acc: 0.0000e+00
Epoch 48/50
5s - loss: 0.0515 - acc: 0.0000e+00 - val_loss: 0.0437 - val_acc: 0.0000e+00
Epoch 49/50
5s - loss: 0.0516 - acc: 0.0000e+00 - val_loss: 0.0447 - val_acc: 0.0000e+00
Epoch 50/50
6s - loss: 0.0515 - acc: 0.0000e+00 - val_loss: 0.0454 - val_acc: 0.0000e+00
model saved to disk
In [59]:
prediction = model.predict(train_feature)
prediction
Out[59]:
array([[ 186748.6875 ],
[ 166117.96875 ],
[ 193776.6875 ],
...,
[ 190180.578125 ],
[ 122913.1640625],
[ 162731.03125 ]], dtype=float32)
In [60]:
######################### 紀錄模型預測情形(答案卷)
prediction = model.predict(test_feature)
df = pd.DataFrame(prediction)
df.to_csv('HousingPriceaAnswer.csv')
In [ ]:
Content source: Pytoddler/Kaggle-competition
Similar notebooks:
notebook.community | gallery | about