In [78]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline
#%matplotlib auto
plt.style.use('ggplot')  #ggplot style gives nicer plots with no extra work

train_df = pd.read_csv('/home/kevin/Documents/data/housing/train.csv')
test_df = pd.read_csv('/home/kevin/Documents/data/housing/test.csv')

test_index = test_df['Id']  #save the Ids for the submission file; the column itself is dropped below

In [53]:
#print(plt.style.available)

In [54]:
print(train_df.head(2))
print(test_df.head(2))


   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2008        WD         Normal     208500  
1      5   2007        WD         Normal     181500  

[2 rows x 81 columns]
     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave   NaN      Reg   
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   

  LandContour Utilities      ...       ScreenPorch PoolArea PoolQC  Fence  \
0         Lvl    AllPub      ...               120        0    NaN  MnPrv   
1         Lvl    AllPub      ...                 0        0    NaN    NaN   

  MiscFeature MiscVal MoSold  YrSold  SaleType  SaleCondition  
0         NaN       0      6    2010        WD         Normal  
1        Gar2   12500      6    2010        WD         Normal  

[2 rows x 80 columns]

In [55]:
train_df = train_df.drop('Id', axis=1)
test_df = test_df.drop('Id', axis=1)
col_labels = list(train_df.columns)  #list() turns the Index into plain strings

print(col_labels)


['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice']

In [56]:
print("train_df.shape = {}, test_df.shape = {}".format(train_df.shape, test_df.shape))


train_df.shape = (1460, 80), test_df.shape = (1459, 79)

In [57]:
#remove columns with few non-null values
train_df = train_df.drop(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
test_df = test_df.drop(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
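
A quick way to see which columns are mostly null (a sketch; run on the raw frames, before the drop above):

In [ ]:
#fraction of missing values per column, highest first
train_df.isnull().mean().sort_values(ascending=False).head(10)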

In [58]:
print("train_df.shape = {}, test_df.shape = {}".format(train_df.shape, test_df.shape))
#We removed 5 columns


train_df.shape = (1460, 75), test_df.shape = (1459, 74)

In [59]:
#data imputation: fill NaNs with the column's mode (categoricals) or median (numerics)
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    def fit(self, X, y=None):
        #most frequent value for object columns, median for numeric columns
        self.fill = pd.Series(
            [X[c].value_counts().index[0]
             if X[c].dtype == np.dtype('O') else X[c].median() for c in X],
            index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
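
A minimal demo of the imputer on a toy frame (hypothetical data), to show the mode/median behavior:

In [ ]:
#toy demo of DataFrameImputer
toy = pd.DataFrame({'color': ['red', 'red', None], 'size': [1.0, np.nan, 3.0]})
print(DataFrameImputer().fit_transform(toy))
#'color' NaN -> 'red' (mode); 'size' NaN -> 2.0 (median)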

In [60]:
#select the shared feature columns and stack train + test,
#so imputation and encoding see every category that appears in either set
feature_labels = test_df.columns  #test_df has every column except SalePrice
X_train = train_df[feature_labels]
X_test = test_df[feature_labels]

#print(X_train.head(2))
#print(X_test.head(2))

big_X = pd.concat([X_train, X_test], ignore_index=True)  #pd.concat replaces the removed DataFrame.append

big_X_imputed = DataFrameImputer().fit_transform(big_X)
print("X_train.shape = {}, X_test.shape = {}, big_X.shape = {}".format(
        X_train.shape, X_test.shape, big_X.shape))
big_X_imputed.head(2)


X_train.shape = (1460, 74), X_test.shape = (1459, 74), big_X.shape = (2919, 74)
Out[60]:
MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour Utilities LotConfig LandSlope ... OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition
0 60 RL 65.0 8450 Pave Reg Lvl AllPub Inside Gtl ... 61 0 0 0 0 0 2 2008 WD Normal
1 20 RL 80.0 9600 Pave Reg Lvl AllPub FR2 Gtl ... 0 0 0 0 0 0 5 2007 WD Normal

2 rows × 74 columns
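
A quick check that the imputation left no missing values (a sketch):

In [ ]:
#should print 0
print(big_X_imputed.isnull().sum().sum())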


In [61]:
#find the non-numeric features, since XGBoost doesn't accept string categories directly
non_numeric_features = []
for col, dtype in big_X_imputed.dtypes.items():
    #print(col, dtype)
    if dtype == object:
        non_numeric_features.append(col)
print("len(non_numeric_features): ", len(non_numeric_features))
print(non_numeric_features)


len(non_numeric_features):  38
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']

In [62]:
#convert categorical features to integer codes, one encoder fit per column
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for feature in non_numeric_features:
    big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])
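
LabelEncoder assigns each category an arbitrary integer, which tree models like XGBoost can usually split around; one-hot encoding is the safer general alternative. A sketch (run in place of the loop above):

In [ ]:
#alternative (not used here): one-hot encode the categoricals instead of label-encoding
#big_X_onehot = pd.get_dummies(big_X_imputed, columns=non_numeric_features)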

In [63]:
#big_X_imputed.info()
big_X_imputed.head(2)


Out[63]:
MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour Utilities LotConfig LandSlope ... OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition
0 60 3 65.0 8450 1 3 3 0 4 0 ... 61 0 0 0 0 0 2 2008 8 4
1 20 3 80.0 9600 1 3 3 0 2 0 ... 0 0 0 0 0 0 5 2007 8 4

2 rows × 74 columns


In [65]:
#prepare inputs for the model; .values replaces the removed .as_matrix()
train_X = big_X_imputed.iloc[:X_train.shape[0]].values
test_X = big_X_imputed.iloc[X_train.shape[0]:].values
train_y = train_df['SalePrice']

In [66]:
#log-transform the target so cheap and expensive houses carry equal relative weight;
#squared error on log prices penalizes percentage error, so expensive houses can't dominate
plt.subplot(1,2,1)
plt.title('raw')
train_y.hist()

plt.subplot(1,2,2)
plt.title('log transformed')
train_y.apply(np.log).hist()


Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa3b17a4668>

In [67]:
train_y = train_y.apply(np.log)
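
Since the model now trains on log prices, predictions must be mapped back with np.exp (done below). A quick round-trip sanity check (a sketch):

In [ ]:
#sanity check: exp inverts the log transform exactly (SalePrice is strictly positive)
np.allclose(np.exp(train_y), train_df['SalePrice'])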

In [68]:
print("train_X.shape = {},\ntest_X.shape = {},\ntrain_y.shape() = {}".format(
    train_X.shape, test_X.shape, train_y.shape))


train_X.shape = (1460, 74),
test_X.shape = (1459, 74),
train_y.shape() = (1460,)

In [69]:
#XGBOOST
import pickle
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
err = []

In [70]:
kf = KFold(5)

In [71]:
#before the log transform these errors looked obscenely large; on log prices they're small
#note: fresh loop-variable names, so the saved test_index (the Ids) isn't clobbered
for tr_idx, val_idx in kf.split(train_X):
    #print("Train: ", tr_idx, ", Test: ", val_idx)
    #fit on the training folds, score on the held-out fold
    xgb_model = xgb.XGBRegressor().fit(train_X[tr_idx], train_y.iloc[tr_idx])
    predictions = xgb_model.predict(train_X[val_idx])
    actuals = train_y.iloc[val_idx]
    err.append(mean_squared_error(actuals, predictions))
print("Errors: ", err)


Errors:  [0.013931335134448695, 0.021071190074793505, 0.018134963394421926, 0.014287029463296618, 0.019013999532209711]
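
These are mean squared errors on log prices; their square roots are RMSEs on the same scale as the competition's log-RMSE metric. A summary sketch:

In [ ]:
#average RMSE on log(SalePrice) across the five folds
print("mean RMSE: {:.4f}".format(np.sqrt(np.mean(err))))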

In [72]:
#parameter optimization
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2, 3],
                    'n_estimators': [800, 1000],
                    'learning_rate': [.075, .1],
                    'reg_lambda': [.75, 1],
                    'reg_alpha': [0, 1e-5]},
                   scoring='neg_mean_squared_error',
                   verbose=1)
clf.fit(train_X, train_y)
clf.best_score_, clf.best_params_


Fitting 3 folds for each of 32 candidates, totalling 96 fits
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:   35.9s finished
Out[72]:
(-0.015684356867696172,
 {'learning_rate': 0.1,
  'max_depth': 2,
  'n_estimators': 1000,
  'reg_alpha': 1e-05,
  'reg_lambda': 1})
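
The full grid lives in clf.cv_results_; a sketch to rank all candidates rather than just the best one:

In [ ]:
#rank parameter combinations by mean CV score
results = pd.DataFrame(clf.cv_results_)
results.sort_values('mean_test_score', ascending=False)[['mean_test_score', 'params']].head()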

In [73]:
#clf.get_params

In [74]:
#convert log predictions back to actual prices
logpredictions = clf.predict(test_X)
print(logpredictions)
predictions = np.exp(logpredictions)  #np.exp is vectorized; no list comprehension needed
print(predictions)


[ 11.72449303  11.96258926  12.14687061 ...,  12.03166389  11.61701775
  12.32793045]
[ 123561.34375    156778.5        188503.265625  ...,  167990.703125
  110970.2890625  225918.6875   ]

In [79]:
#export to csv: first sanity-check the saved test Ids
test_index.shape


Out[79]:
(1459,)

In [ ]:


In [80]:
pred_df = pd.DataFrame(predictions, index=test_index, columns=['SalePrice'])

In [81]:
pred_df.head(2)


Out[81]:
SalePrice
Id
1461 123561.34375
1462 156778.50000

In [82]:
pred_df.to_csv('output.csv', header=True, index_label='Id')
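
A quick read-back to confirm the submission format (a sketch):

In [ ]:
#the saved file should have exactly the Id and SalePrice columns
pd.read_csv('output.csv').head(2)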

In [115]:
#xgb.plot_importance(clf)  #doesn't work on a GridSearchCV wrapper; use the refit gbm below
#plt.show()

In [87]:
#refit a single model with the best grid-search parameters
gbm = xgb.XGBRegressor(max_depth=2, n_estimators=1000, learning_rate=.1,
                       reg_alpha=1e-05, reg_lambda=1).fit(train_X, train_y)

In [88]:
predictions = gbm.predict(test_X)

In [90]:
print(predictions)


[ 11.72449303  11.96258926  12.14687061 ...,  12.03166389  11.61701775
  12.32793045]

In [91]:
xgb.plot_importance(gbm)
plt.show()
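
plot_importance labels features as f0, f1, ..., which is why the next few cells decode indices by hand. A sketch to map scores back to column names automatically (get_booster() is the accessor in recent xgboost; very old versions used .booster()):

In [ ]:
#map XGBoost's fNN labels back to column names
fscore = gbm.get_booster().get_fscore()
imp = pd.Series({feature_labels[int(k[1:])]: v for k, v in fscore.items()})
imp.sort_values(ascending=False).head(10)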



In [92]:
X_train.head(2)


Out[92]:
MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour Utilities LotConfig LandSlope ... OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition
0 60 RL 65.0 8450 Pave Reg Lvl AllPub Inside Gtl ... 61 0 0 0 0 0 2 2008 WD Normal
1 20 RL 80.0 9600 Pave Reg Lvl AllPub FR2 Gtl ... 0 0 0 0 0 0 5 2007 WD Normal

2 rows × 74 columns


In [93]:
#decode the fNN indices from the importance plot by position
X_train.columns[[44, 32, 3, 15, 35]]


Out[93]:
Index(['GrLivArea', 'BsmtFinSF1', 'LotArea', 'OverallQual', 'BsmtUnfSF'], dtype='object')

In [ ]:


In [94]:
from sklearn.model_selection import cross_val_score
#nested CV: the grid search is re-run inside each outer fold, hence the fitting logs below
cross_val_score(clf, train_X, train_y, scoring='neg_mean_squared_error')


Fitting 3 folds for each of 32 candidates, totalling 96 fits
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:   30.8s finished
Fitting 3 folds for each of 32 candidates, totalling 96 fits
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:   31.9s finished
Fitting 3 folds for each of 32 candidates, totalling 96 fits
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:   30.8s finished
Out[94]:
array([-0.01262227, -0.02108672, -0.01498771])
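
The scores above are negated MSEs on log prices; RMSE is just sqrt(-score). Using the printed values:

In [ ]:
#convert the nested-CV scores to RMSE on log(SalePrice)
print(np.sqrt(np.array([0.01262227, 0.02108672, 0.01498771])))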

In [95]:
X_train.columns[[0,4,36,45, 37, 42, 60]]


Out[95]:
Index(['MSSubClass', 'Street', 'TotalBsmtSF', 'BsmtFullBath', 'Heating',
       '2ndFlrSF', 'GarageQual'],
      dtype='object')

In [96]:
X_train.columns[[21,61]]


Out[96]:
Index(['Exterior1st', 'GarageCond'], dtype='object')

In [97]:
#xgb.plot_tree(gbm)

In [98]:
#helper: tune n_estimators with xgb.cv early stopping, then fit and report training RMSE
from sklearn import metrics

def modelfit(alg, dtrain, predictors, target='SalePrice', useTrainCV=True,
             cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        #let xgb.cv pick the number of boosting rounds via early stopping
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='rmse',
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    #fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target])

    #predict the training set
    dtrain_predictions = alg.predict(dtrain[predictors])

    #print model report
    print("\nModel Report")
    print("RMSE (Train): %f" % np.sqrt(
        metrics.mean_squared_error(dtrain[target], dtrain_predictions)))

    #feature importances (get_booster() replaces the old .booster() accessor)
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
In [99]:
#modelfit expects a DataFrame holding both predictors and target:
#train_full = big_X_imputed.iloc[:X_train.shape[0]].copy()
#train_full['SalePrice'] = train_y.values
#modelfit(gbm, train_full, list(feature_labels))