In [80]:
import math
import warnings

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf

from IPython import display
from scipy import stats
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from tensorflow.python.data import Dataset

warnings.filterwarnings('ignore')
%matplotlib inline

# Citations: Feature selection section adapted from Comprehensive Data Exploration with Python by Pedro Marcelino

In [81]:
# Load training examples
df_train = pd.read_csv('../datasets/house_prices/train.csv')
# Randomize dataset to ensure that we draw observations idependently and identically
# from training distribution (i.i.d property)
df_train = df_train.reindex(np.random.permutation(df_train.index))

# Load test examples
df_test = pd.read_csv('../datasets/house_prices/test.csv')
df_test = df_test.reindex(np.random.permutation(df_test.index))

In [82]:
df_train.columns


Out[82]:
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')

In [83]:
# PART 1: Analyze label field, SalePrice
df_train['SalePrice'].describe()


Out[83]:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [84]:
# Look at distribution
sns.distplot(df_train['SalePrice'])
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)



In [85]:
# Looks much better!
# PART 2: Feature selection
# First, let's look at a correlation matrix between all of our variables
correlation_matrix = df_train.corr() # Calculates pairwise correlation of columns; defauls to using 'pearson' correlation coefficient.
_ = plt.subplots(figsize=(12, 9))
sns.heatmap(correlation_matrix, vmax=0.8, square=True)


Out[85]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a3234d1d0>

In [86]:
# Observations
# 1) TotalBsmtSF and 1stFlrSF are highly correlated; likely encode the same information,
# therefore if we were to choose them as features in our model, we would only
# need to choose one of them.
# 2) GarageCars and GarageArea are also highly corralated; same as above.
# 3) etc. there are other pairs of columns that are highly correlated.

In [87]:
# Let's look at a correlation matrix between the k variables that are
# most highly correlated with a label variable, SalePrice:
k = 10
variable_columns = correlation_matrix.nlargest(k, 'SalePrice')['SalePrice'].index
label_correlation_matrix = np.corrcoef(df_train[variable_columns].values.T)
sns.set(font_scale=1.25)
heatmap = sns.heatmap(label_correlation_matrix, cbar=True, annot=True, square=True, fmt='0.2f', annot_kws={'size': 10}, yticklabels=variable_columns.values, xticklabels=variable_columns.values)



In [88]:
# Selected features
# 1) OverallQual: High correlation with SalePrice and not much correlation
# with other features
# 2) GrLivArea: Same as above
# 3) GarageCars: Same as above, don't need GarageArea since it encodes same
# information as this variable.
# 4) TotalBsmtSF: Same as above, don't need 1stFlrSF
# 5) Let's use YearBuilt, too.

In [89]:
# Now that we've selected features, let's look at correlation between features and label, SalePrice
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt']
sns.pairplot(df_train[cols], size=2.5)
plt.show()



In [90]:
# PART 3: Clean missing data
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum() / df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)


Out[90]:
Total Percent
PoolQC 1453 0.995205
MiscFeature 1406 0.963014
Alley 1369 0.937671
Fence 1179 0.807534
FireplaceQu 690 0.472603
LotFrontage 259 0.177397
GarageCond 81 0.055479
GarageType 81 0.055479
GarageYrBlt 81 0.055479
GarageFinish 81 0.055479
GarageQual 81 0.055479
BsmtExposure 38 0.026027
BsmtFinType2 38 0.026027
BsmtFinType1 37 0.025342
BsmtCond 37 0.025342
BsmtQual 37 0.025342
MasVnrArea 8 0.005479
MasVnrType 8 0.005479
Electrical 1 0.000685
Utilities 0 0.000000

In [91]:
# Let's delete variables (columns) with missing data. Note that all of the
# variables with missing data aren't variables we really care about.
df_train = df_train.drop((missing_data[missing_data['Total'] > 1]).index,1)

# Let's also drop the one observation missing an 'Electrical' field.
df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)

# Double check that there no data is missing anymore
df_train.isnull().sum().sort_values(ascending=False)


Out[91]:
SalePrice        0
OverallQual      0
BsmtUnfSF        0
BsmtFinSF2       0
BsmtFinSF1       0
Foundation       0
ExterCond        0
ExterQual        0
Exterior2nd      0
Exterior1st      0
RoofMatl         0
RoofStyle        0
YearRemodAdd     0
YearBuilt        0
OverallCond      0
HouseStyle       0
Heating          0
BldgType         0
Condition2       0
Condition1       0
Neighborhood     0
LandSlope        0
LotConfig        0
Utilities        0
LandContour      0
LotShape         0
Street           0
LotArea          0
MSZoning         0
MSSubClass       0
                ..
Fireplaces       0
SaleType         0
YrSold           0
MoSold           0
MiscVal          0
PoolArea         0
ScreenPorch      0
3SsnPorch        0
EnclosedPorch    0
OpenPorchSF      0
WoodDeckSF       0
PavedDrive       0
GarageArea       0
GarageCars       0
Functional       0
CentralAir       0
TotRmsAbvGrd     0
KitchenQual      0
KitchenAbvGr     0
BedroomAbvGr     0
HalfBath         0
FullBath         0
BsmtHalfBath     0
BsmtFullBath     0
GrLivArea        0
LowQualFinSF     0
2ndFlrSF         0
1stFlrSF         0
Electrical       0
Id               0
Length: 63, dtype: int64

In [92]:
# PART 4: Remove outliers
# First, let's look at SalePrice to determine if there are any outliers that
# we need to remove
saleprice_scaled = preprocessing.StandardScaler().fit_transform(df_train['SalePrice'][:, np.newaxis])
low_range = saleprice_scaled[saleprice_scaled[:,0].argsort()][:10]
high_range= saleprice_scaled[saleprice_scaled[:,0].argsort()][-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)


outer range (low) of the distribution:
[[-1.83820775]
 [-1.83303414]
 [-1.80044422]
 [-1.78282123]
 [-1.77400974]
 [-1.62295562]
 [-1.6166617 ]
 [-1.58519209]
 [-1.58519209]
 [-1.57269236]]

outer range (high) of the distribution:
[[3.82758058]
 [4.0395221 ]
 [4.49473628]
 [4.70872962]
 [4.728631  ]
 [5.06034585]
 [5.42191907]
 [5.58987866]
 [7.10041987]
 [7.22629831]]

In [93]:
# Hmm, looks like there are some outliers (the 7.somethings)
# They are following the overall trend so we'll keep them.
# Let's look at GrLivArea vs SalePrice now:
data = pd.concat([df_train['SalePrice'], df_train['GrLivArea']], axis=1)
data.plot.scatter(x='GrLivArea', y='SalePrice', ylim=(0, 800000))


'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
Out[93]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a326b32e8>

In [94]:
# Looks like there are some GrLivArea outliers (the two farthest right points
# that don't seem to be following the overall trend). Let's drop those
# observations:
df_train.sort_values(by = 'GrLivArea', ascending = False)[:2]

# Moved outlier removal to preprocess_features function.
df_train = df_train.drop(df_train[df_train['Id'] == 1299].index)
df_train = df_train.drop(df_train[df_train['Id'] == 524].index)

In [95]:
# PART 5: Scale label field if needed
# Observations:
# 1) Hmm, isn't quite a normal distribution (not symmetrical about the center, mean > median > mode)
# 2) Distribution has positive skewness (longer right tail), therefore scaling
# data can be transformed with a log function:
df_train['SalePrice'] = np.log(df_train['SalePrice'])

# Distribution with log transformation (f=stats.norm also graphs normal curve)
sns.distplot(df_train['SalePrice'], fit=stats.norm)
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)



In [96]:
# PART 6: Scale features if needed
sns.distplot(df_train['GrLivArea'], fit=stats.norm)
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)



In [97]:
# Looks like GrLivArea also has a bit of a postive skew to its distribution.
df_train['GrLivArea'] = np.log(df_train['GrLivArea'])
sns.distplot(df_train['GrLivArea'], fit=stats.norm)
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)



In [98]:
sns.distplot(df_train['TotalBsmtSF'], fit=stats.norm);
fig = plt.figure()
res = stats.probplot(df_train['TotalBsmtSF'], plot=plt)



In [99]:
#create column for new variable (one is enough because it's a binary categorical feature)
#if area>0 it gets 1, for area==0 it gets 0
df_train['HasBsmt'] = pd.Series(len(df_train['TotalBsmtSF']), index=df_train.index)
df_train['HasBsmt'] = 0 
df_train.loc[df_train['TotalBsmtSF']>0,'HasBsmt'] = 1

In [100]:
# Log transform TotalBsmtSF for observations that have basements.
df_train.loc[df_train['HasBsmt']==1,'TotalBsmtSF'] = np.log(df_train['TotalBsmtSF'])

In [101]:
sns.distplot(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], fit=stats.norm);
fig = plt.figure()
res = stats.probplot(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], plot=plt)



In [102]:
# Check homescedasticity between variables:
# Linear regression models depend on another assumption: homescedacity.
# Homescedacity (means "same variance") describes the property that the variance
# between the label variable and all feature variables exhibit similar levels of variance.
# Heteroscedasticity (opposite of homescedacitiy) can be detected using
# a bivariate graphical approach: if there is a cone shape between two variables,
# they exchibit heteroscedasticity.
# Note: Most of the time, if you can ensure normality, homescedacity follows.
plt.scatter(df_train['GrLivArea'], df_train['SalePrice'])


Out[102]:
<matplotlib.collections.PathCollection at 0x1a33e896a0>

In [103]:
plt.scatter(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], df_train[df_train['TotalBsmtSF']>0]['SalePrice']);



In [205]:
# Load training examples
df_train = pd.read_csv('../datasets/house_prices/train.csv')
# Randomize dataset to ensure that we draw observations idependently and identically
# from training distribution (i.i.d property)
df_train = df_train.reindex(np.random.permutation(df_train.index))

# Load test examples
df_test = pd.read_csv('../datasets/house_prices/test.csv')
df_test = df_test.reindex(np.random.permutation(df_test.index))

In [224]:
# PART 7: Training models
FEATURE_FIELDS = ['OverallQual',
                  'GrLivArea',
                  'GarageCars',
                  'TotalBsmtSF',
                  'YearBuilt']

TARGET_FIELD = 'SalePrice'

# Need min and max years to one-hot encode year.
# There are cases when certain years only show up in either the train or test subsets which
# will cause the number of one-hot encoded features to be different leading to errors in prediction.
MIN_YEAR = min(df_train['YearBuilt'].min(), df_test['YearBuilt'].min())
MAX_YEAR = max(df_train['YearBuilt'].max(), df_test['YearBuilt'].max())

MIN_GARAGE_CARS = int(min(df_train['GarageCars'].min(), df_test['GarageCars'].min()))
MAX_GARAGE_CARS = int(max(df_train['GarageCars'].max(), df_test['GarageCars'].max()))

# Write some helper functions to be reused throughout
def preprocess_features(dataframe):
    """Prepares features from dataframe for model training.
    
    Args:
        dataframe: A Pandas DataFrame containing housing data.
    
    Returns:
        A DataFrame containing preprocessed features to be used for model.
    """
    processed_features = dataframe[FEATURE_FIELDS].copy()
    
    # Numerical features (scale them if needed to achieve normality)
    processed_features['GrLivArea'] = np.log(processed_features['GrLivArea'])
    
    # Some TotalBsmtSF values are 0 for houses without a basement.
    # Since we cannot log transform 0 values, we need to add a temporary
    # variable HasBsmt to determine with values to log transform, leaving 0 values
    # as 0.
    processed_features['HasBsmt'] = pd.Series(len(processed_features['TotalBsmtSF']), index=processed_features.index)
    processed_features['HasBsmt'] = 0 
    processed_features.loc[processed_features['TotalBsmtSF']>0,'HasBsmt'] = 1
    
    # Log transform TotalBsmtSF value if not 0 (house has basement)
    processed_features.loc[processed_features['HasBsmt']==1,'TotalBsmtSF'] = np.log(processed_features['TotalBsmtSF'])
    
    # HasBsmt field no longer needed so remove it
    processed_features = processed_features.drop(labels='HasBsmt', axis=1)

    # Remove observations with missing fields
    processed_features = processed_features.fillna(processed_features.mean())

    # Categorical features (need to one-hot encode them)
    # First, fill in categorical values that aren't represented in train dataset but are in test dataset
#     processed_features['YearBuilt'] = processed_features['YearBuilt'].astype(
#         'category', 
#         categories=[year for year in range(MIN_YEAR, MAX_YEAR + 1)],
#     )
#     processed_features['GarageCars'] = processed_features['GarageCars'].astype(int)
#     processed_features['GarageCars'] = processed_features['GarageCars'].astype(
#         'category', 
#         categories=[year for year in range(MIN_GARAGE_CARS, MAX_GARAGE_CARS + 1)],
#     )
#     year_built_one_hot = pd.get_dummies(processed_features['YearBuilt'], prefix='YearBuilt', drop_first=True)
#     garage_cars_one_hot = pd.get_dummies(processed_features['GarageCars'], prefix='GarageCars', drop_first=True)
#     processed_features = processed_features.drop(labels=['YearBuilt', 'GarageCars'], axis=1)
#     processed_features = pd.concat([processed_features, year_built_one_hot, garage_cars_one_hot], axis=1)
    return processed_features

def _remove_missing_data(dataframe):
    """Removes observations that have missing fields from dataframe."""
    before_length = len(dataframe.index)
    rows_with_missing_data = dataframe.loc[dataframe[FEATURE_FIELDS].isnull().sum(axis=1) > 0].index
    print("Rows with missing data: %s" % rows_with_missing_data)
    dataframe = dataframe.drop(rows_with_missing_data)
    after_length = len(dataframe.index)
    num_rows_removed = before_length - after_length
    print("Removed %d rows with missing features." % num_rows_removed)
    return dataframe

def preprocess_targets(dataframe):
    """Prepares target features (labels) from dataframe for model training.
    
    Args:
        dataframe: A Pandas DataFrame containing housing data.
    
    Returns:
        A DataFrame containing preprocessed targets to be used for model.
    """
    processed_targets = pd.DataFrame()
    processed_targets[TARGET_FIELD] = np.log(dataframe[TARGET_FIELD])
    return processed_targets

# Prepare train targets (labels)
training_targets = preprocess_targets(df_train)

# Split training dataset into training and validation subsets
train_without_missing = (df_train)
training_examples = preprocess_features(df_train)
training_examples, validation_examples, training_targets, validation_targets = model_selection.train_test_split(training_examples, training_targets, test_size=0.25)

# Prepate test examples to make predictions on
test_examples = preprocess_features(df_test)

print("Training examples summary:")
display.display(training_examples.describe())
print("Training targets summary:")
display.display(training_targets.describe())

print("Validation examples summary:")
display.display(validation_examples.describe())
print("Validation targets summary:")
display.display(validation_targets.describe())

print("Test examples summary:")
display.display(test_examples.describe())
print("Test targets summary:")
display.display(test_targets.describe())


Training examples summary:
OverallQual GrLivArea GarageCars TotalBsmtSF YearBuilt
count 1095.000000 1095.000000 1095.000000 1095.000000 1095.000000
mean 6.114155 7.263901 1.777169 6.710815 1971.856621
std 1.387701 0.333896 0.751372 1.235947 30.065110
min 1.000000 5.811141 0.000000 0.000000 1872.000000
25% 5.000000 7.021084 1.000000 6.675823 1954.500000
50% 6.000000 7.292337 2.000000 6.887553 1974.000000
75% 7.000000 7.480146 2.000000 7.152269 2001.000000
max 10.000000 8.637994 4.000000 8.717682 2010.000000
Training targets summary:
SalePrice
count 1095.000000
mean 12.025229
std 0.397141
min 10.460242
25% 11.775290
50% 12.000892
75% 12.273731
max 13.521139
Validation examples summary:
OverallQual GrLivArea GarageCars TotalBsmtSF YearBuilt
count 365.000000 365.000000 365.000000 365.000000 365.00000
mean 6.054795 7.279395 1.736986 6.865692 1969.50137
std 1.369712 0.332698 0.735203 0.807504 30.58609
min 2.000000 6.173786 0.000000 0.000000 1880.00000
25% 5.000000 7.076654 1.000000 6.692084 1950.00000
50% 6.000000 7.283448 2.000000 6.933423 1970.00000
75% 7.000000 7.497762 2.000000 7.188413 2000.00000
max 10.000000 8.370084 4.000000 7.875879 2009.00000
Validation targets summary:
SalePrice
count 365.000000
mean 12.020518
std 0.406836
min 10.471950
25% 11.767180
50% 12.004568
75% 12.271392
max 13.534473
Test examples summary:
OverallQual GrLivArea GarageCars TotalBsmtSF YearBuilt
count 1459.000000 1459.000000 1459.000000 1459.000000 1459.000000
mean 6.078821 7.253745 1.766118 6.714643 1971.357779
std 1.436812 0.316152 0.775679 1.205425 30.390071
min 1.000000 6.008813 0.000000 0.000000 1879.000000
25% 5.000000 7.018849 1.000000 6.664409 1953.000000
50% 6.000000 7.266827 2.000000 6.895683 1973.000000
75% 7.000000 7.450661 2.000000 7.173191 2001.000000
max 10.000000 8.536015 5.000000 8.536015 2010.000000
Test targets summary:
SalePrice
count 365.000000
mean 12.000411
std 0.384291
min 10.542706
25% 11.744037
50% 11.964001
75% 12.230765
max 13.345507

In [225]:
def output_model_eval(training_targets, training_predictions, validation_targets, validation_predictions):
    # Calculate root mean squared error
    mse = metrics.mean_squared_error(training_targets, training_predictions)
    rsme = math.sqrt(mse)
    print("Training mean squared error: %.4f" % mse)
    print("Training root mean squared error: %.2f" % rsme)

    mse = metrics.mean_squared_error(validation_targets, validation_predictions)
    rsme = math.sqrt(mse)
    print("Validation mean squared error: %.4f" % mse)
    print("Validation root mean squared error: %.2f" % rsme)

    # Variance score: 1 is a perfect prediction
    r2_score = metrics.r2_score(training_targets, training_predictions)
    print("Training variance (r2) score: %.4f" % r2_score)

    r2_score = metrics.r2_score(validation_targets, validation_predictions)
    print("Validation variance (r2) score: %.4f" % r2_score)

    # Plot actual values vs. predictions for both training and validation sets and
    # identity line for reference
    fig, ax = plt.subplots(figsize=(12, 9))
    validation_plt = plt.scatter(validation_targets, validation_predictions, color='red', s=10)
    training_plt = plt.scatter(training_targets, training_predictions, color='blue', s=10)
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]

    # now plot both limits against each other
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    plt.legend([validation_plt, training_plt], ['Validation', 'Training'])
    plt.title('Actual vs. predicted')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.show()
    return model

In [226]:
def linear_regression(training_examples,
                      training_targets,
                      validation_examples,
                      validation_targets):
    
    # For models with one-hot encoded variables, you need to set 'fit_intercept' to False to avoid the
    # dummy variable trap.
    model = linear_model.LinearRegression()
    model.fit(training_examples, training_targets)
    
    training_predictions = model.predict(training_examples)
    validation_predictions = model.predict(validation_examples)
    
    output_model_eval(training_targets, training_predictions, validation_targets, validation_predictions)
    return model

def ridgeCV_regression(training_examples,
                     training_targets,
                     validation_examples,
                     validation_targets):

    model = linear_model.RidgeCV()
    model.fit(training_examples, training_targets)
    
    training_predictions = model.predict(training_examples)
    validation_predictions = model.predict(validation_examples)
    
    output_model_eval(training_targets, training_predictions, validation_targets, validation_predictions)
    return model

model = linear_regression(training_examples, training_targets, validation_examples, validation_targets)

# Have to apply np.exp to undo log transform done on label, SalePrice
test_predictions = np.exp(model.predict(test_examples))
test_predictions_df = pd.DataFrame(test_predictions)


Training mean squared error: 0.0282
Training root mean squared error: 0.17
Validation mean squared error: 0.0328
Validation root mean squared error: 0.18
Training variance (r2) score: 0.8208
Validation variance (r2) score: 0.8011

In [245]:
# Prepare final dataframe for submission
test_examples_with_id = pd.concat([df_test['Id'], test_examples], axis=1)
test_examples_with_id['SalePrice'] = test_predictions_df
submission_df = pd.DataFrame({'Id': test_examples_with_id.Id, 'SalePrice': test_examples_with_id.SalePrice})
test_examples_with_id.loc[test_examples_with_id['Id'] == 2036]


Out[245]:
Id OverallQual GrLivArea GarageCars TotalBsmtSF YearBuilt SalePrice
1419 2036 8 7.36897 2.0 6.866933 1999 123563.284089

In [246]:
# Convert submission_df to csv
submission_df.to_csv('submission.csv', index=False)

In [ ]: