In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv('./input/train.csv', index_col=0)
test_df = pd.read_csv('./input/test.csv', index_col=0)

In [4]:
train_df.head()


Out[4]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub FR2 ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub Inside ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub Corner ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub FR2 ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 80 columns


In [6]:
# The label itself is not normally distributed. To help the model learn more
# accurately, we first "smooth" it (bring it closer to normal) with a log transform.
import matplotlib.pyplot as plt
prices = pd.DataFrame({"price":train_df["SalePrice"], "log(price + 1)":np.log1p(train_df["SalePrice"])})
prices.hist()
plt.show()
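
A quick check (a sketch, not part of the original run) makes the histograms concrete: the skewness drops sharply after the transform, and np.expm1 inverts np.log1p exactly, which matters when predictions are converted back to prices later.

from scipy.stats import skew

print(skew(train_df["SalePrice"]))            # heavily right-skewed
print(skew(np.log1p(train_df["SalePrice"])))  # much closer to symmetric

x = train_df["SalePrice"].values
assert np.allclose(np.expm1(np.log1p(x)), x)  # lossless round trip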



In [7]:
y_train = np.log1p(train_df.pop('SalePrice'))

In [8]:
# Stack train and test so the feature engineering below is applied to both consistently.
all_df = pd.concat((train_df, test_df), axis=0)

In [19]:
# MSSubClass is stored as an integer code, but it is really a categorical
# variable, so inspect it and cast it to str before one-hot encoding.
all_df['MSSubClass'].dtypes
all_df['MSSubClass'].value_counts()
all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)
pd.get_dummies(all_df['MSSubClass'], prefix='MSSubClass').head()


Out[19]:
MSSubClass_120 MSSubClass_150 MSSubClass_160 MSSubClass_180 MSSubClass_190 MSSubClass_20 MSSubClass_30 MSSubClass_40 MSSubClass_45 MSSubClass_50 MSSubClass_60 MSSubClass_70 MSSubClass_75 MSSubClass_80 MSSubClass_85 MSSubClass_90
Id
1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
2 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0

In [20]:
all_dummy_df = pd.get_dummies(all_df)
all_dummy_df.head()


Out[20]:
LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF ... SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
Id
1 65.0 8450 7 5 2003 2003 196.0 706.0 0.0 150.0 ... 0 0 0 1 0 0 0 0 1 0
2 80.0 9600 6 8 1976 1976 0.0 978.0 0.0 284.0 ... 0 0 0 1 0 0 0 0 1 0
3 68.0 11250 7 5 2001 2002 162.0 486.0 0.0 434.0 ... 0 0 0 1 0 0 0 0 1 0
4 60.0 9550 7 5 1915 1970 0.0 216.0 0.0 540.0 ... 0 0 0 1 1 0 0 0 0 0
5 84.0 14260 8 5 2000 2000 350.0 655.0 0.0 490.0 ... 0 0 0 1 0 0 0 0 1 0

5 rows × 303 columns


In [21]:
all_dummy_df.isnull().sum().sort_values(ascending=False).head(10)


Out[21]:
LotFrontage     486
GarageYrBlt     159
MasVnrArea       23
BsmtHalfBath      2
BsmtFullBath      2
BsmtFinSF2        1
GarageCars        1
TotalBsmtSF       1
BsmtUnfSF         1
GarageArea        1
dtype: int64

In [22]:
# Fill every remaining NaN with that column's mean, then confirm none are left.
mean_cols = all_dummy_df.mean()
mean_cols.head(10)
all_dummy_df = all_dummy_df.fillna(mean_cols)
all_dummy_df.isnull().sum().sum()


Out[22]:
0
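
One caveat about the fill above: the means were computed over train and test rows together. A stricter variant (a sketch, not what this kernel did) computes the means from the training rows only, so no statistic flows from the test set:

# Sketch: fill with training-set means only.
train_means = all_dummy_df.loc[train_df.index].mean()
all_dummy_df = all_dummy_df.fillna(train_means)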

In [23]:
# Split the encoded frame back into train and test using the original indices.
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]
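
A small sanity check (a sketch) that the split lines up with the label vector:

assert dummy_train_df.shape[0] == y_train.shape[0]
assert dummy_test_df.shape[0] == test_df.shape[0]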

In [24]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

X_train = dummy_train_df.values
X_test = dummy_test_df.values

Ridge


In [25]:
# Scan regularization strengths on a log scale, scoring each by RMSE on the
# log-price target with 10-fold cross-validation.
alphas = np.logspace(-3, 2, 50)
test_scores = []
for alpha in alphas:
    clf = Ridge(alpha=alpha)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

In [27]:
plt.plot(alphas, test_scores)
plt.title("Alpha vs CV Error");
plt.show()


alpha ≈ 15 gives the lowest CV error.
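
Rather than reading the minimum off the plot, the best alpha can be pulled out programmatically (a small sketch):

best_alpha = alphas[np.argmin(test_scores)]
print(best_alpha)  # about 15 on this run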

RandomForestRegressor


In [28]:
from sklearn.ensemble import RandomForestRegressor
# Scan the fraction of features each split may consider, with 5-fold CV.
max_features = [.1, .3, .5, .7, .9, .99]
test_scores = []
for max_feat in max_features:
    clf = RandomForestRegressor(n_estimators=200, max_features=max_feat)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

In [29]:
plt.plot(max_features, test_scores)
plt.title("Max Features vs CV Error");
plt.show()


xgboost


In [36]:
from xgboost import XGBRegressor
params = [1,2,3,4,5,6]
test_scores = []
for param in params:
    clf = XGBRegressor(max_depth=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))


---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-36-ee9cf24e0012> in <module>()
----> 1 from xgboost import XGBRegressor
      2 params = [1,2,3,4,5,6]
      3 test_scores = []
      4 for param in params:
      5     clf = XGBRegressor(max_depth=param)

ModuleNotFoundError: No module named 'xgboost'
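
xgboost is simply not installed in this environment; pip install xgboost would make the cell above runnable as written. As a stand-in sketch only (not the library the cell intended), sklearn's built-in gradient boosting supports the same max_depth scan:

from sklearn.ensemble import GradientBoostingRegressor

params = [1, 2, 3, 4, 5, 6]
test_scores = []
for param in params:
    # GradientBoostingRegressor substitutes for the missing XGBRegressor
    clf = GradientBoostingRegressor(max_depth=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))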

In [ ]:
plt.plot(params, test_scores)
plt.title("max_depth vs CV Error");
plt.show()

bagging


In [38]:
from sklearn.ensemble import BaggingRegressor
# Scan the number of bagged estimators; the default base estimator is a decision tree.
params = [10, 15, 20, 25, 30, 35, 40, 45, 50]
test_scores = []
for param in params:
    clf = BaggingRegressor(n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

In [39]:
plt.plot(params, test_scores)
plt.title("max_depth vs CV Error");
plt.show()


Ridge alone still does slightly better, so next we try bagging with Ridge as the base estimator.


In [40]:
ridge = Ridge(alpha=15)
params = [1, 10, 15, 20, 25, 30, 40]
test_scores = []
for param in params:
    # Bag Ridge models instead of the default trees, scanning the ensemble size.
    clf = BaggingRegressor(n_estimators=param, base_estimator=ridge)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

In [41]:
plt.plot(params, test_scores)
plt.title("max_depth vs CV Error");
plt.show()


Ensemble


In [30]:
ridge = Ridge(alpha=15)
rf = RandomForestRegressor(n_estimators=500, max_features=.3)

# Fit both tuned models on the full training set.
ridge.fit(X_train, y_train)
rf.fit(X_train, y_train)


Out[30]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.3, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [31]:
# Predict on the test set, invert the log1p transform back to price space,
# and average the two models' predictions.
y_ridge = np.expm1(ridge.predict(X_test))
y_rf = np.expm1(rf.predict(X_test))
y_final = (y_ridge + y_rf) / 2
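
The 50/50 blend is a fixed choice. A sketch of how the weight could instead be chosen from out-of-fold predictions on the training set (the weight grid here is illustrative, not something the original kernel searched):

from sklearn.model_selection import cross_val_predict

oof_ridge = cross_val_predict(ridge, X_train, y_train, cv=5)
oof_rf = cross_val_predict(rf, X_train, y_train, cv=5)
for w in [0.3, 0.4, 0.5, 0.6, 0.7]:
    # RMSE of the blended out-of-fold predictions, still in log space
    rmse = np.sqrt(np.mean((w * oof_ridge + (1 - w) * oof_rf - y_train) ** 2))
    print(w, rmse)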

In [32]:
submission_df = pd.DataFrame(data={'Id': test_df.index, 'SalePrice': y_final})

In [33]:
submission_df.head()


Out[33]:
Id SalePrice
0 1461 119966.089418
1 1462 151232.242058
2 1463 175097.675096
3 1464 190359.896556
4 1465 194616.767458

In [37]:
submission_df.to_csv('submission20180316.csv', index=False, header=True, columns=['Id', 'SalePrice'])

In [ ]: