In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv('./input/train.csv', index_col=0)
test_df = pd.read_csv('./input/test.csv', index_col=0)

In [4]:
train_df.head()


Out[4]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub FR2 ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub Inside ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub Corner ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub FR2 ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 80 columns


In [6]:
# The label itself is not normally distributed. To help the model learn more
# accurately, we first "smooth" it (bring it closer to normal) with a log transform.
import matplotlib.pyplot as plt
prices = pd.DataFrame({"price":train_df["SalePrice"], "log(price + 1)":np.log1p(train_df["SalePrice"])})
prices.hist()
plt.show()
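
A quick check (a sketch, not part of the original run) makes the histograms concrete: the skewness drops sharply after the transform, and np.expm1 inverts np.log1p exactly, which matters when predictions are converted back to prices later.

from scipy.stats import skew

print(skew(train_df["SalePrice"]))            # heavily right-skewed
print(skew(np.log1p(train_df["SalePrice"])))  # much closer to symmetric

x = train_df["SalePrice"].values
assert np.allclose(np.expm1(np.log1p(x)), x)  # lossless round trip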



In [7]:
y_train = np.log1p(train_df.pop('SalePrice'))

In [8]:
# Stack train and test so the feature engineering below is applied to both consistently.
all_df = pd.concat((train_df, test_df), axis=0)

In [19]:
# MSSubClass is stored as an integer code, but it is really a categorical
# variable, so inspect it and cast it to str before one-hot encoding.
all_df['MSSubClass'].dtypes
all_df['MSSubClass'].value_counts()
all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)
pd.get_dummies(all_df['MSSubClass'], prefix='MSSubClass').head()


Out[19]:
MSSubClass_120 MSSubClass_150 MSSubClass_160 MSSubClass_180 MSSubClass_190 MSSubClass_20 MSSubClass_30 MSSubClass_40 MSSubClass_45 MSSubClass_50 MSSubClass_60 MSSubClass_70 MSSubClass_75 MSSubClass_80 MSSubClass_85 MSSubClass_90
Id
1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
2 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0

In [20]:
all_dummy_df = pd.get_dummies(all_df)
all_dummy_df.head()


Out[20]:
LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF ... SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
Id
1 65.0 8450 7 5 2003 2003 196.0 706.0 0.0 150.0 ... 0 0 0 1 0 0 0 0 1 0
2 80.0 9600 6 8 1976 1976 0.0 978.0 0.0 284.0 ... 0 0 0 1 0 0 0 0 1 0
3 68.0 11250 7 5 2001 2002 162.0 486.0 0.0 434.0 ... 0 0 0 1 0 0 0 0 1 0
4 60.0 9550 7 5 1915 1970 0.0 216.0 0.0 540.0 ... 0 0 0 1 1 0 0 0 0 0
5 84.0 14260 8 5 2000 2000 350.0 655.0 0.0 490.0 ... 0 0 0 1 0 0 0 0 1 0

5 rows × 303 columns


In [21]:
all_dummy_df.isnull().sum().sort_values(ascending=False).head(10)


Out[21]:
LotFrontage     486
GarageYrBlt     159
MasVnrArea       23
BsmtHalfBath      2
BsmtFullBath      2
BsmtFinSF2        1
GarageCars        1
TotalBsmtSF       1
BsmtUnfSF         1
GarageArea        1
dtype: int64

In [22]:
# Fill every remaining NaN with that column's mean, then confirm none are left.
mean_cols = all_dummy_df.mean()
mean_cols.head(10)
all_dummy_df = all_dummy_df.fillna(mean_cols)
all_dummy_df.isnull().sum().sum()


Out[22]:
0
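
One caveat about the fill above: the means were computed over train and test rows together. A stricter variant (a sketch, not what this kernel did) computes the means from the training rows only, so no statistic flows from the test set:

# Sketch: fill with training-set means only.
train_means = all_dummy_df.loc[train_df.index].mean()
all_dummy_df = all_dummy_df.fillna(train_means)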

In [23]:
# Split the encoded frame back into train and test using the original indices.
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]
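
A small sanity check (a sketch) that the split lines up with the label vector:

assert dummy_train_df.shape[0] == y_train.shape[0]
assert dummy_test_df.shape[0] == test_df.shape[0]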

In [24]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

X_train = dummy_train_df.values
X_test = dummy_test_df.values

Ridge


In [25]:
# Scan regularization strengths on a log scale, scoring each by RMSE on the
# log-price target with 10-fold cross-validation.
alphas = np.logspace(-3, 2, 50)
test_scores = []
for alpha in alphas:
    clf = Ridge(alpha=alpha)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

In [27]:
plt.plot(alphas, test_scores)
plt.title("Alpha vs CV Error");
plt.show()


alpha ≈ 15 gives the lowest CV error.
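
Rather than reading the minimum off the plot, the best alpha can be pulled out programmatically (a small sketch):

best_alpha = alphas[np.argmin(test_scores)]
print(best_alpha)  # about 15 on this run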

RandomForestRegressor


In [28]:
from sklearn.ensemble import RandomForestRegressor
# Scan the fraction of features each split may consider, with 5-fold CV.
max_features = [.1, .3, .5, .7, .9, .99]
test_scores = []
for max_feat in max_features:
    clf = RandomForestRegressor(n_estimators=200, max_features=max_feat)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

In [29]:
plt.plot(max_features, test_scores)
plt.title("Max Features vs CV Error");
plt.show()


xgboost


In [36]:
from xgboost import XGBRegressor
params = [1,2,3,4,5,6]
test_scores = []
for param in params:
    clf = XGBRegressor(max_depth=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))


---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-36-ee9cf24e0012> in <module>()
----> 1 from xgboost import XGBRegressor
      2 params = [1,2,3,4,5,6]
      3 test_scores = []
      4 for param in params:
      5     clf = XGBRegressor(max_depth=param)

ModuleNotFoundError: No module named 'xgboost'
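
xgboost is simply not installed in this environment; pip install xgboost would make the cell above runnable as written. As a stand-in sketch only (not the library the cell intended), sklearn's built-in gradient boosting supports the same max_depth scan:

from sklearn.ensemble import GradientBoostingRegressor

params = [1, 2, 3, 4, 5, 6]
test_scores = []
for param in params:
    # GradientBoostingRegressor substitutes for the missing XGBRegressor
    clf = GradientBoostingRegressor(max_depth=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))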

In [ ]:
plt.plot(params, test_scores)
plt.title("max_depth vs CV Error");
plt.show()

bagging


In [38]:
from sklearn.ensemble import BaggingRegressor
# Scan the number of bagged estimators; the default base estimator is a decision tree.
params = [10, 15, 20, 25, 30, 35, 40, 45, 50]
test_scores = []
for param in params:
    clf = BaggingRegressor(n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

In [39]:
plt.plot(params, test_scores)
plt.title("max_depth vs CV Error");
plt.show()


Ridge alone still does slightly better, so next we try bagging with Ridge as the base estimator.


In [40]:
ridge = Ridge(alpha=15)
params = [1, 10, 15, 20, 25, 30, 40]
test_scores = []
for param in params:
    # Bag Ridge models instead of the default trees, scanning the ensemble size.
    clf = BaggingRegressor(n_estimators=param, base_estimator=ridge)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

In [41]:
plt.plot(params, test_scores)
plt.title("max_depth vs CV Error");
plt.show()


Ensemble


In [30]:
ridge = Ridge(alpha=15)
rf = RandomForestRegressor(n_estimators=500, max_features=.3)

# Fit both tuned models on the full training set.
ridge.fit(X_train, y_train)
rf.fit(X_train, y_train)


Out[30]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.3, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [31]:
# Predict on the test set, invert the log1p transform back to price space,
# and average the two models' predictions.
y_ridge = np.expm1(ridge.predict(X_test))
y_rf = np.expm1(rf.predict(X_test))
y_final = (y_ridge + y_rf) / 2
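
The 50/50 blend is a fixed choice. A sketch of how the weight could instead be chosen from out-of-fold predictions on the training set (the weight grid here is illustrative, not something the original kernel searched):

from sklearn.model_selection import cross_val_predict

oof_ridge = cross_val_predict(ridge, X_train, y_train, cv=5)
oof_rf = cross_val_predict(rf, X_train, y_train, cv=5)
for w in [0.3, 0.4, 0.5, 0.6, 0.7]:
    # RMSE of the blended out-of-fold predictions, still in log space
    rmse = np.sqrt(np.mean((w * oof_ridge + (1 - w) * oof_rf - y_train) ** 2))
    print(w, rmse)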

In [32]:
submission_df = pd.DataFrame(data={'Id': test_df.index, 'SalePrice': y_final})

In [33]:
submission_df.head()


Out[33]:
Id SalePrice
0 1461 119966.089418
1 1462 151232.242058
2 1463 175097.675096
3 1464 190359.896556
4 1465 194616.767458

In [37]:
submission_df.to_csv('submission20180316.csv', index=False, header=True, columns=['Id', 'SalePrice'])

In [ ]: