In [ ]:
# load libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
print(os.listdir("../input"))
In [ ]:
# load the datasets in
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sample_sub = pd.read_csv('../input/sample_submission.csv')
In [ ]:
# look at train dataset
train.head()
In [ ]:
print(train.shape)
print(train.info())
We have 1460 observations and 81 columns. There are quite a few string (object) columns too, which we will have to encode later on.
Let's have a look at the number of missing values.
In [ ]:
tmp = train.isnull().sum()
In [ ]:
# get top 10 results
tmp.sort_values(ascending=False).head(10).plot(kind='bar', figsize=(8,8))
In [ ]:
drop_cols = ['PoolQC','MiscFeature','Alley','Fence']
In [ ]:
# write a custom transformer to drop these 4 cols, for use in a Pipeline later
from sklearn.base import BaseEstimator, TransformerMixin
class DropColumnsTransform(BaseEstimator, TransformerMixin):
    def __init__(self, attribs_drop):
        self.attribs_drop = attribs_drop
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.drop(self.attribs_drop, axis=1).values
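As a quick sanity check, here is a hypothetical usage sketch of the transformer on its own (it would normally be wired into a Pipeline):
In [ ]:
# hypothetical usage sketch: drop the four sparsest columns from train
dropper = DropColumnsTransform(drop_cols)
dropped = dropper.fit_transform(train)
print(dropped.shape)  # expect 81 - 4 = 77 columns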
In [ ]:
# look at categorical data
train_cat = train.select_dtypes(include=['object'])
train_cat.shape
# use this to impute missing values as "?"
train_cat = train_cat.fillna("?")
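An optional quick check of how many placeholder values we just introduced:
In [ ]:
# count the filled-in "?" placeholders per column and show the top 5
(train_cat == "?").sum().sort_values(ascending=False).head(5)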
In [ ]:
print("43/%d or %.2f%% of columns are categorical" % (train.shape[1], 43/train.shape[1]*100))
In [ ]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.impute import SimpleImputer  # Imputer was removed from sklearn.preprocessing
lb = LabelBinarizer()
# loop to apply the binarizer to each column individually, then combine the results
list_cols = []
for col in train_cat.columns:
    x = train_cat[col].values
    x_trans = lb.fit_transform(x)
    list_cols.append(x_trans)
train_cat_transformed = np.concatenate(list_cols, axis=1)
train_cat_transformed
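For intuition, a toy sketch (illustrative values only, not drawn from the dataset) of what the binarizer does to a single column:
In [ ]:
# three distinct levels expand into three 0/1 indicator columns
lb.fit_transform(np.array(['Pave', 'Grvl', 'Pave', '?']))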
In [ ]:
# numerical data now
Imp = SimpleImputer(strategy="median")
train_num = train.select_dtypes(include=['number'])
train_num.shape
In [ ]:
# look at correlation
cor = train_num.corr()
f = plt.figure(figsize=(15,15))
sns.heatmap(cor, cmap='plasma')
Many features (e.g. LotArea, GarageCars) are indeed highly correlated with SalePrice.
In [ ]:
tmp = cor['SalePrice'].sort_values(ascending=False)
# skip index 0 (SalePrice's perfect correlation with itself) and plot the next 10
tmp[1:11].plot(kind='bar', figsize=(8,8))
In [ ]:
# remove the target SalePrice before imputing the features
train_num_wsp = train_num.drop('SalePrice',axis=1)
train_num_tr = Imp.fit_transform(train_num_wsp)
train_num_tr
In [ ]:
X = np.concatenate([train_num_tr, train_cat_transformed],axis=1)
y = train_num['SalePrice'].values
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)
In [ ]:
from sklearn.model_selection import train_test_split
# hold out 10% for final validation (fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
In [ ]:
# Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
linreg = LinearRegression()
scores = cross_val_score(linreg, X_train, y_train, scoring="neg_mean_squared_error", cv=10, verbose=1)
def printscorespretty(scores):
    # cross_val_score returns negated MSE; flip the sign and take the root to get RMSE
    sc = np.sqrt(-scores)
    print("Scores:", sc)
    print("Mean:", np.mean(sc))
    print("SD:", np.std(sc))
printscorespretty(scores)
In [ ]:
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
scores = cross_val_score(dtr, X_train, y_train, scoring="neg_mean_squared_error", cv=10, verbose=1)
printscorespretty(scores)
The Decision Tree Regressor performs much better than Linear Regression here, perhaps capturing some non-linearity in the data.
In [ ]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_train, y_train, scoring="neg_mean_squared_error", cv=10, verbose=1)
printscorespretty(scores)
The best performance so far comes from the Random Forest.
In [ ]:
# XGBoost
from xgboost import XGBRegressor
XGB = XGBRegressor()
scores = cross_val_score(XGB, X_train, y_train, scoring="neg_mean_squared_error", cv=10, verbose=1)
printscorespretty(scores)
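Finally, a minimal sketch of an end check (assuming the Random Forest remains the best cross-validated model on your run): fit it on the training split and score the 10% hold-out.
In [ ]:
# fit the strongest CV model and evaluate RMSE on the held-out 10%
rf.fit(X_train, y_train)
holdout_rmse = np.sqrt(mse(y_test, rf.predict(X_test)))
print("Hold-out RMSE:", holdout_rmse)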
In [ ]: