In [ ]:
# load libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import os
print(os.listdir("../input"))

In [ ]:
# load the datasets in
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sample_sub = pd.read_csv('../input/sample_submission.csv')

In [ ]:
# look at train dataset
train.head()

In [ ]:
print(train.shape)
print(train.info())

We have 1460 observations and 81 columns. There are quite a few string (object) columns too, which we will have to encode later on.

Let's have a look at the number of missing values.


In [ ]:
tmp = train.isnull().sum()

In [ ]:
# get top 10 results
tmp.sort_values(ascending=False).head(10).plot(kind='bar', figsize=(8,8))

One way to handle this is to drop the top 4 columns, since they are missing for almost all observations.

In [ ]:
drop_cols = ['PoolQC','MiscFeature','Alley','Fence']

In [ ]:
# write custom transformer to drop these 4 cols for use in Pipeline later
from sklearn.base import BaseEstimator, TransformerMixin

class DropColumnsTransform(BaseEstimator, TransformerMixin):
    """Drop the given columns from a DataFrame and return the result as an array."""
    def __init__(self, attribs_drop):
        self.attribs_drop = attribs_drop
    def fit(self, X, y=None):
        # nothing to learn; this transformer is stateless
        return self
    def transform(self, X):
        return X.drop(self.attribs_drop, axis=1).values
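
A minimal usage sketch follows (the transformer is not actually applied elsewhere in this notebook); the Pipeline step name is illustrative only.

In [ ]:
# sketch: wrap the dropper in a Pipeline and apply it to the raw train frame
from sklearn.pipeline import Pipeline

cleaning_pipeline = Pipeline([
    ('drop_sparse', DropColumnsTransform(drop_cols)),  # drop PoolQC, MiscFeature, Alley, Fence
])
train_dropped = cleaning_pipeline.fit_transform(train)  # ndarray without the 4 sparse columns
print(train_dropped.shape)  # expect (1460, 77)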

In [ ]:
# look at categorical data
train_cat = train.select_dtypes(include=['object'])
print(train_cat.shape)
# impute missing values as "?" so they become their own category
train_cat = train_cat.fillna("?")

In [ ]:
print("43/%d or %.2f%% of columns are categorical" % (train.shape[1], 43/train.shape[1]*100))

In [ ]:
from sklearn.preprocessing import LabelBinarizer, Imputer

lb = LabelBinarizer()
# loop to apply the binarizer to each column individually, then combine the results
list_cols = []
for col in train_cat.columns:
    x = train_cat[col].values
    x_trans = lb.fit_transform(x)
    list_cols.append(x_trans)
train_cat_transformed = np.concatenate(list_cols, axis=1)
train_cat_transformed
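
A similar result can be obtained with pandas' get_dummies, which one-hot encodes every object column in a single call (though a two-level column becomes two indicator columns rather than the single 0/1 column LabelBinarizer produces). A quick sketch, not used further below:

In [ ]:
# alternative sketch: one-hot encode all categorical columns with pandas
train_cat_dummies = pd.get_dummies(train_cat)
print(train_cat_dummies.shape)  # same rows as train_cat, one column per category level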

In [ ]:
# numerical data now

Imp = Imputer(strategy="median")
train_num = train.select_dtypes(include=['number'])
train_num.shape

In [ ]:
# look at correlation
cor = train_num.corr()
f = plt.figure(figsize=(15,15))
sns.heatmap(cor, cmap='plasma')

Several features (e.g. OverallQual, GrLivArea, GarageCars) are indeed highly correlated with SalePrice.


In [ ]:
tmp = cor['SalePrice'].sort_values(ascending=False)
tmp[1:11].plot(kind='bar', figsize=(8,8))

In [ ]:
# we will have to remove SalePrice before imputing
train_num_wsp = train_num.drop('SalePrice',axis=1)
train_num_tr = Imp.fit_transform(train_num_wsp)
train_num_tr

In [ ]:
X = np.concatenate([train_num_tr, train_cat_transformed],axis=1)
y = train_num['SalePrice'].values
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Fit Models

  1. Linear Regression
  2. Decision Tree
  3. Random Forest
  4. XGBoost

In [ ]:
from sklearn.model_selection import train_test_split
# split into 10% for validation at end
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.1)

In [ ]:
# Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse

linreg = LinearRegression()
scores = cross_val_score(linreg, X_train, y_train, scoring="neg_mean_squared_error", cv=10, verbose=1)

def printscorespretty(scores):
    # cross_val_score returns negative MSE, so negate and take the square root to get RMSE
    sc = np.sqrt(-scores)
    print("Scores:", sc)
    print("Mean:", np.mean(sc))
    print("SD:", np.std(sc))

printscorespretty(scores)

In [ ]:
#Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
scores = cross_val_score(dtr, X_train, y_train, scoring="neg_mean_squared_error", cv=10, verbose=1)
printscorespretty(scores)

The Decision Tree Regressor performs much better than Linear Regression here, perhaps because it captures some non-linearity in the data.


In [ ]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
scores = cross_val_score(rf, X_train, y_train, scoring="neg_mean_squared_error", cv=10, verbose=1)
printscorespretty(scores)

The best performance so far comes from the Random Forest.


In [ ]:
# XGBoost
from xgboost import XGBRegressor

XGB = XGBRegressor()
scores = cross_val_score(XGB, X_train, y_train, scoring="neg_mean_squared_error", cv=10, verbose=1)
printscorespretty(scores)
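
The 10% held out earlier was reserved for a final validation. A minimal sketch of that check, assuming we keep the random forest (or whichever model scores best above):

In [ ]:
# sketch: fit the chosen model on the training split and score the hold-out set
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
print("Hold-out RMSE:", np.sqrt(mse(y_test, preds)))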
