In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
%matplotlib inline
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    The value is the square root of scikit-learn's ``mean_squared_error``
    evaluated on the same two arguments.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
In [15]:
from sklearn.model_selection import train_test_split

# Columns removed from the frame before any modelling is done.
dropped_columns = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
    'SaleCondition',
]

# Read the CSV and discard the listed columns in one expression.
df_train = pd.read_csv('train.csv').drop(columns=dropped_columns)

# Target is the SalePrice column; features are every remaining column.
y_data = df_train['SalePrice']
X_data = df_train.drop(columns='SalePrice')

# Hold out 30% of the rows; random_state=0 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.3,
    random_state=0,
)
In [16]:
import xgboost as xgb

# Gradient-boosted regressor configuration.
# A very small learning_rate is paired with a large n_estimators, so each
# tree contributes only a tiny correction; subsample and colsample_bytree
# make every tree see a fraction of the rows and columns.
regr = xgb.XGBRegressor(
    colsample_bytree=0.4,
    gamma=0.0,
    learning_rate=0.001,
    max_depth=4,
    min_child_weight=1.5,
    n_estimators=30000,
    reg_alpha=0.9,
    reg_lambda=0.6,
    subsample=0.2,
    # `random_state` replaces the deprecated `seed` argument of the
    # scikit-learn wrapper; the value is unchanged.
    random_state=42,
    # `verbosity=0` replaces the deprecated `silent=1`; both request
    # silent training, but `silent` is no longer a recognised parameter.
    verbosity=0,
)
In [17]:
# Fit on the training split only.
regr.fit(X_train, y_train)

# Predict on the held-out split and report its RMSE.
y_pred = regr.predict(X_test)

# The metric is computed from y_test / X_test, so it is labelled as a
# test-set RMSE (the previous label said "training set").
print("XGBoost RMSE on test set: ", rmse(y_test, y_pred))
In [ ]: