In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
%matplotlib inline
def rmse(y_true, y_pred):
    """Return the root mean squared error between two sets of values.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, same length as ``y_true``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
In [2]:
# Load the training data from the working directory.
train_csv = 'train.csv'
df_train = pd.read_csv(train_csv)
In [3]:
# Preview the first five rows of the training frame.
df_train.head(5)
Out[3]:
In [4]:
# Summary statistics for the SalePrice column.
sale_price = df_train['SalePrice']
sale_price.describe()
Out[4]:
In [5]:
# Plot the distribution of SalePrice.
sale_price = df_train['SalePrice']
sns.distplot(sale_price)
Out[5]:
In [6]:
# Correlation matrix of the numeric columns, shown as a heatmap.
# Numeric/bool columns are selected explicitly: DataFrame.corr() raises on
# string columns in pandas >= 2.0, while older releases dropped them silently.
numeric_train = df_train.select_dtypes(include=['number', 'bool'])
corrmat = numeric_train.corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above rather than relying on pyplot's current axes.
sns.heatmap(corrmat, ax=ax)
Out[6]:
In [7]:
# The k largest correlations with SalePrice (SalePrice itself included),
# in descending order. Note `cols` holds correlation values indexed by
# column name, not just the names.
k = 20
saleprice_corr = corrmat['SalePrice']
cols = saleprice_corr.nlargest(k)
print(cols)
In [8]:
# Missing-data overview: per-column null count and null fraction,
# both sorted largest first. 'Percent' is a fraction in [0, 1].
null_mask = df_train.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(25)
Out[8]:
In [9]:
# Normality check for SalePrice: distribution with a fitted normal curve,
# then a probability plot in a separate figure.
sale_price = df_train['SalePrice']
sns.distplot(sale_price, fit=norm)
fig = plt.figure()
res = stats.probplot(sale_price, plot=plt)
In [10]:
# Replace SalePrice with its natural log, then repeat the normality plots.
# NOTE: this overwrites the column in place, so re-running the cell applies
# the log a second time.
df_train['SalePrice'] = np.log(df_train['SalePrice'])
sale_price_log = df_train['SalePrice']
sns.distplot(sale_price_log, fit=norm)
fig = plt.figure()
res = stats.probplot(sale_price_log, plot=plt)
In [11]:
# Normality check for GrLivArea: distribution with a fitted normal curve,
# then a probability plot in a separate figure.
gr_liv_area = df_train['GrLivArea']
sns.distplot(gr_liv_area, fit=norm)
fig = plt.figure()
res = stats.probplot(gr_liv_area, plot=plt)
In [12]:
# Replace GrLivArea with its natural log, then repeat the normality plots.
# NOTE: this overwrites the column in place, so re-running the cell applies
# the log a second time.
df_train['GrLivArea'] = np.log(df_train['GrLivArea'])
gr_liv_area_log = df_train['GrLivArea']
sns.distplot(gr_liv_area_log, fit=norm)
fig = plt.figure()
res = stats.probplot(gr_liv_area_log, plot=plt)
In [13]:
# Normality check for GarageArea: distribution with a fitted normal curve,
# then a probability plot in a separate figure.
garage_area = df_train['GarageArea']
sns.distplot(garage_area, fit=norm)
fig = plt.figure()
res = stats.probplot(garage_area, plot=plt)
In [14]:
# Normality check for TotalBsmtSF: distribution with a fitted normal curve,
# then a probability plot in a separate figure.
total_bsmt_sf = df_train['TotalBsmtSF']
sns.distplot(total_bsmt_sf, fit=norm)
fig = plt.figure()
res = stats.probplot(total_bsmt_sf, plot=plt)
In [15]:
# Plot the distribution of the OverallQual column.
overall_qual = df_train['OverallQual']
sns.distplot(overall_qual)
Out[15]:
In [16]:
# Plot the distribution of the FullBath column.
full_bath = df_train['FullBath']
sns.distplot(full_bath)
Out[16]:
In [17]:
# Plot the distribution of the Fireplaces column.
fireplaces = df_train['Fireplaces']
sns.distplot(fireplaces)
Out[17]:
In [18]:
# Normality check for YearBuilt: distribution with a fitted normal curve,
# then a probability plot in a separate figure.
year_built = df_train['YearBuilt']
sns.distplot(year_built, fit=norm)
fig = plt.figure()
res = stats.probplot(year_built, plot=plt)
In [19]:
# Normality check for YearRemodAdd: distribution with a fitted normal curve,
# then a probability plot in a separate figure.
year_remod_add = df_train['YearRemodAdd']
sns.distplot(year_remod_add, fit=norm)
fig = plt.figure()
res = stats.probplot(year_remod_add, plot=plt)
In [20]:
# Assemble the modelling inputs from the features examined above;
# the target is the SalePrice column.
feature_columns = [
    'GrLivArea',
    'GarageArea',
    'TotalBsmtSF',
    'OverallQual',
    'FullBath',
    'Fireplaces',
    'YearBuilt',
    'YearRemodAdd',
]
X_data = df_train[feature_columns]
y_data = df_train['SalePrice']
In [21]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the
# split is repeatable.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts
In [22]:
# Fit an ordinary least-squares model and report RMSE on the held-out split.
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
linear_pred = reg.predict(X_test)
# Arguments follow rmse's (y_true, y_pred) signature; the value is unchanged
# because RMSE is symmetric in its two arguments.
print("Root Mean squared error: %.8f" % rmse(y_test, linear_pred))
In [23]:
# Ridge regression with the regularisation strength chosen by
# cross-validation from the candidate list; report RMSE on the held-out split.
from sklearn import linear_model
ridge_alphas = [0.1, 0.25, 0.3, 0.33, 0.375, 0.5, 1.0, 5.0, 10.0]
reg = linear_model.RidgeCV(alphas=ridge_alphas)
reg.fit(X_train, y_train)
# Alpha selected by cross-validation.
print(reg.alpha_)
ridge_pred = reg.predict(X_test)
# Arguments follow rmse's (y_true, y_pred) signature; the value is unchanged
# because RMSE is symmetric in its two arguments.
print("Root Mean squared error: %.8f" % rmse(y_test, ridge_pred))
In [24]:
# Lasso regression with the regularisation strength chosen by
# cross-validation from the candidate list; report RMSE on the held-out split.
from sklearn import linear_model
lasso_alphas = [0.0001, 0.001, 0.0125, 0.025, 0.05]
reg = linear_model.LassoCV(alphas=lasso_alphas)
reg.fit(X_train, y_train)
# Alpha selected by cross-validation.
print(reg.alpha_)
lasso_pred = reg.predict(X_test)
# Arguments follow rmse's (y_true, y_pred) signature; the value is unchanged
# because RMSE is symmetric in its two arguments.
print("Root Mean squared error: %.8f" % rmse(y_test, lasso_pred))
In [34]:
# Configure the gradient-boosted regressor (fitting happens in a later cell).
# NOTE(review): `silent` and `seed` may be legacy parameter names in newer
# xgboost releases -- confirm against the installed version.
import xgboost as xgb
xgb_params = dict(
    colsample_bytree=0.8,
    gamma=0.0,
    learning_rate=0.001,
    max_depth=4,
    min_child_weight=1.5,
    n_estimators=10000,
    reg_alpha=0.9,
    reg_lambda=0.6,
    subsample=0.8,
    seed=42,
    silent=False,
)
regr = xgb.XGBRegressor(**xgb_params)
In [35]:
# Fit the boosted model on the training split and score it on the
# held-out split.
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
# The score is computed on X_test / y_test, so the label says "test set"
# (it previously said "training set").
print("XGBoost score on test set: ", rmse(y_test, y_pred))
In [27]:
# Create the prediction CSV from the test data.
df_test = pd.read_csv('test.csv')

# Apply to the test rows the same log transform used on the training
# GrLivArea. The log must be taken of df_test's own column: the previous
# code took np.log of df_train['GrLivArea'], which had already been logged
# and holds training rows, so the model was scored on the wrong inputs.
df_test['GrLivArea'] = np.log(df_test['GrLivArea'])

# Same feature columns, in the same order, as used for training.
feature_columns = [
    'GrLivArea',
    'GarageArea',
    'TotalBsmtSF',
    'OverallQual',
    'FullBath',
    'Fireplaces',
    'YearBuilt',
    'YearRemodAdd',
]
X_data = df_test[feature_columns]

# The model was fitted on log(SalePrice), so exponentiate its output to
# get predictions back on the original price scale.
y_pred_data = regr.predict(X_data)
y_pred_data = np.exp(y_pred_data)

submission = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': y_pred_data})
submission.to_csv('result.csv', index=False)
In [ ]: