In [80]:
import math
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf
from IPython import display
from scipy import stats
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from tensorflow.python.data import Dataset
warnings.filterwarnings('ignore')
%matplotlib inline
# Citations: Feature selection section adapted from Comprehensive Data Exploration with Python by Pedro Marcelino
In [81]:
# Load training examples
df_train = pd.read_csv('../datasets/house_prices/train.csv')
# Randomize the dataset so that observations are drawn independently and
# identically from the training distribution (the i.i.d. property)
df_train = df_train.reindex(np.random.permutation(df_train.index))
# Load test examples
df_test = pd.read_csv('../datasets/house_prices/test.csv')
df_test = df_test.reindex(np.random.permutation(df_test.index))
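# A seedable alternative (a sketch, not part of the original run): sample(frac=1)
# shuffles the whole frame, and random_state makes the shuffle reproducible.
df_train_shuffled = df_train.sample(frac=1, random_state=42)  # 42 is an arbitrary example seed; df_train_shuffled is illustrative only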
In [82]:
df_train.columns
Out[82]:
In [83]:
# PART 1: Analyze label field, SalePrice
df_train['SalePrice'].describe()
Out[83]:
In [84]:
# Look at distribution
sns.distplot(df_train['SalePrice'])
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
In [85]:
# The plots above show SalePrice deviates from a normal distribution; we'll address that in Part 5.
# PART 2: Feature selection
# First, let's look at a correlation matrix between all of our variables
correlation_matrix = df_train.corr() # Calculates pairwise correlation of columns; defaults to the 'pearson' correlation coefficient.
_ = plt.subplots(figsize=(12, 9))
sns.heatmap(correlation_matrix, vmax=0.8, square=True)
Out[85]:
In [86]:
# Observations
# 1) TotalBsmtSF and 1stFlrSF are highly correlated; they likely encode the
# same information, so if we were to choose them as features for our model,
# we would only need one of them.
# 2) GarageCars and GarageArea are also highly correlated; same as above.
# 3) There are other pairs of highly correlated columns as well.
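# A quick numeric check of the pairs called out above (a sketch, using the
# same training frame):
print('TotalBsmtSF vs 1stFlrSF: %.2f' % df_train['TotalBsmtSF'].corr(df_train['1stFlrSF']))
print('GarageCars vs GarageArea: %.2f' % df_train['GarageCars'].corr(df_train['GarageArea']))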
In [87]:
# Let's look at a correlation matrix between the k variables that are
# most highly correlated with the label variable, SalePrice:
k = 10
variable_columns = correlation_matrix.nlargest(k, 'SalePrice')['SalePrice'].index
label_correlation_matrix = np.corrcoef(df_train[variable_columns].values.T)
sns.set(font_scale=1.25)
heatmap = sns.heatmap(label_correlation_matrix, cbar=True, annot=True, square=True, fmt='0.2f', annot_kws={'size': 10}, yticklabels=variable_columns.values, xticklabels=variable_columns.values)
In [88]:
# Selected features
# 1) OverallQual: high correlation with SalePrice and not much correlation
# with other features
# 2) GrLivArea: same as above
# 3) GarageCars: same as above; we don't need GarageArea since it encodes the
# same information as this variable.
# 4) TotalBsmtSF: same as above; we don't need 1stFlrSF
# 5) Let's use YearBuilt, too.
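# A quick check of the choices above (a sketch): each selected feature's
# correlation with the label, read straight from the matrix computed earlier.
print(correlation_matrix['SalePrice'][['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt']])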
In [89]:
# Now that we've selected features, let's look at correlation between features and label, SalePrice
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt']
sns.pairplot(df_train[cols], size=2.5)
plt.show()
In [90]:
# PART 3: Clean missing data
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum() / df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
Out[90]:
In [91]:
# Let's delete the variables (columns) with missing data. Note that none of
# the variables with missing data are ones we selected as features.
df_train = df_train.drop(missing_data[missing_data['Total'] > 1].index, axis=1)
# Let's also drop the one observation missing an 'Electrical' field.
df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)
# Double-check that no data is missing anymore
df_train.isnull().sum().sort_values(ascending=False)
Out[91]:
In [92]:
# PART 4: Remove outliers
# First, let's look at SalePrice to determine if there are any outliers that
# we need to remove
saleprice_scaled = preprocessing.StandardScaler().fit_transform(df_train['SalePrice'].values[:, np.newaxis])
low_range = saleprice_scaled[saleprice_scaled[:, 0].argsort()][:10]
high_range = saleprice_scaled[saleprice_scaled[:, 0].argsort()][-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)
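# A programmatic version of the same check (a sketch; the |z| > 3 cutoff is a
# common convention, not from the original analysis):
print('observations with |z| > 3: %d' % (np.abs(saleprice_scaled[:, 0]) > 3).sum())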
In [93]:
# Hmm, looks like there are some outliers (the z-scores above 7).
# They still follow the overall trend, though, so we'll keep them.
# Let's look at GrLivArea vs SalePrice now:
data = pd.concat([df_train['SalePrice'], df_train['GrLivArea']], axis=1)
data.plot.scatter(x='GrLivArea', y='SalePrice', ylim=(0, 800000))
Out[93]:
In [94]:
# Looks like there are some GrLivArea outliers (the two farthest right points
# that don't seem to be following the overall trend). Let's drop those
# observations:
# Identify the Ids of the two largest-GrLivArea observations:
display.display(df_train.sort_values(by='GrLivArea', ascending=False)[:2])
df_train = df_train.drop(df_train[df_train['Id'] == 1299].index)
df_train = df_train.drop(df_train[df_train['Id'] == 524].index)
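# An Id-independent alternative (a sketch; the 4000 sq ft and $300,000 cutoffs
# are illustrative thresholds, not from the original): drop large-area,
# low-price points by condition instead of by hard-coded Id.
df_train = df_train.drop(df_train[(df_train['GrLivArea'] > 4000) & (df_train['SalePrice'] < 300000)].index)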
In [95]:
# PART 5: Scale label field if needed
# Observations:
# 1) Hmm, SalePrice isn't quite normally distributed (not symmetrical about
# the center; mean > median > mode)
# 2) The distribution has positive skewness (a longer right tail), so the
# data can be transformed with a log function to reduce the skew:
df_train['SalePrice'] = np.log(df_train['SalePrice'])
# Distribution after the log transformation (fit=stats.norm also graphs the normal curve)
sns.distplot(df_train['SalePrice'], fit=stats.norm)
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
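# A numeric confirmation (a sketch): after the log transform, the skewness
# should be close to 0.
print('SalePrice skewness after log transform: %.2f' % df_train['SalePrice'].skew())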
In [96]:
# PART 6: Scale features if needed
sns.distplot(df_train['GrLivArea'], fit=stats.norm)
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)
In [97]:
# Looks like GrLivArea also has a bit of a positive skew to its distribution.
df_train['GrLivArea'] = np.log(df_train['GrLivArea'])
sns.distplot(df_train['GrLivArea'], fit=stats.norm)
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)
In [98]:
sns.distplot(df_train['TotalBsmtSF'], fit=stats.norm);
fig = plt.figure()
res = stats.probplot(df_train['TotalBsmtSF'], plot=plt)
In [99]:
# Create a column for a new binary categorical variable, HasBsmt:
# 1 if TotalBsmtSF > 0 (the house has a basement), 0 otherwise.
df_train['HasBsmt'] = 0
df_train.loc[df_train['TotalBsmtSF'] > 0, 'HasBsmt'] = 1
In [100]:
# Log transform TotalBsmtSF for observations that have basements.
df_train.loc[df_train['HasBsmt']==1,'TotalBsmtSF'] = np.log(df_train['TotalBsmtSF'])
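# An alternative worth noting (a sketch, not used above): np.log1p computes
# log(1 + x), which handles the zero-basement rows directly and avoids the
# HasBsmt flag. Shown on a fresh copy, since df_train's TotalBsmtSF has
# already been transformed at this point.
raw_bsmt = pd.read_csv('../datasets/house_prices/train.csv')['TotalBsmtSF']  # raw_bsmt is illustrative only
bsmt_log1p = np.log1p(raw_bsmt)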
In [101]:
sns.distplot(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], fit=stats.norm);
fig = plt.figure()
res = stats.probplot(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], plot=plt)
In [102]:
# Check homoscedasticity between variables:
# Linear regression models depend on another assumption: homoscedasticity.
# Homoscedasticity (meaning "same variance") is the property that the label
# variable shows a similar level of variance across the range of each feature
# variable.
# Heteroscedasticity (the opposite of homoscedasticity) can be detected using
# a bivariate graphical approach: a cone shape in the scatter plot of two
# variables indicates heteroscedasticity.
# Note: most of the time, if you can ensure normality, homoscedasticity follows.
plt.scatter(df_train['GrLivArea'], df_train['SalePrice'])
Out[102]:
In [103]:
plt.scatter(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], df_train[df_train['TotalBsmtSF']>0]['SalePrice']);
In [205]:
# Load training examples
df_train = pd.read_csv('../datasets/house_prices/train.csv')
# Randomize the dataset so that observations are drawn independently and
# identically from the training distribution (the i.i.d. property)
df_train = df_train.reindex(np.random.permutation(df_train.index))
# Load test examples
df_test = pd.read_csv('../datasets/house_prices/test.csv')
df_test = df_test.reindex(np.random.permutation(df_test.index))
In [224]:
# PART 7: Training models
FEATURE_FIELDS = ['OverallQual',
                  'GrLivArea',
                  'GarageCars',
                  'TotalBsmtSF',
                  'YearBuilt']
TARGET_FIELD = 'SalePrice'
# We need the min and max years to one-hot encode YearBuilt.
# Certain years show up in only one of the train or test subsets, which would
# make the number of one-hot encoded features differ, leading to errors at prediction time.
MIN_YEAR = min(df_train['YearBuilt'].min(), df_test['YearBuilt'].min())
MAX_YEAR = max(df_train['YearBuilt'].max(), df_test['YearBuilt'].max())
MIN_GARAGE_CARS = int(min(df_train['GarageCars'].min(), df_test['GarageCars'].min()))
MAX_GARAGE_CARS = int(max(df_train['GarageCars'].max(), df_test['GarageCars'].max()))
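# A minimal sketch of why these bounds matter (not part of the original
# pipeline): pinning the category list keeps one-hot columns aligned between
# train and test even when a year appears in only one of them.
year_categories = list(range(MIN_YEAR, MAX_YEAR + 1))  # year_categories/aligned are illustrative names
aligned = pd.get_dummies(
    pd.Series(pd.Categorical(df_train['YearBuilt'], categories=year_categories)),
    prefix='YearBuilt')
print(aligned.shape)  # column count is fixed by year_categories, not by the data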
# Write some helper functions to be reused throughout
def preprocess_features(dataframe):
    """Prepares features from dataframe for model training.

    Args:
        dataframe: A Pandas DataFrame containing housing data.
    Returns:
        A DataFrame containing preprocessed features to be used for the model.
    """
    processed_features = dataframe[FEATURE_FIELDS].copy()
    # Numerical features (scale them if needed to achieve normality)
    processed_features['GrLivArea'] = np.log(processed_features['GrLivArea'])
    # Some TotalBsmtSF values are 0 for houses without a basement.
    # Since we cannot log transform 0 values, we add a temporary HasBsmt
    # indicator to determine which values to log transform, leaving 0 values
    # as 0.
    processed_features['HasBsmt'] = 0
    processed_features.loc[processed_features['TotalBsmtSF'] > 0, 'HasBsmt'] = 1
    # Log transform TotalBsmtSF if it is not 0 (the house has a basement)
    processed_features.loc[processed_features['HasBsmt'] == 1, 'TotalBsmtSF'] = np.log(processed_features['TotalBsmtSF'])
    # The HasBsmt field is no longer needed, so remove it
    processed_features = processed_features.drop(labels='HasBsmt', axis=1)
    # Impute missing fields with the column mean
    processed_features = processed_features.fillna(processed_features.mean())
    # Categorical features (one-hot encoding, currently disabled):
    # First, pin the category lists so that values which appear in only one of
    # the train/test subsets still produce the same columns.
    # processed_features['YearBuilt'] = pd.Categorical(
    #     processed_features['YearBuilt'],
    #     categories=range(MIN_YEAR, MAX_YEAR + 1))
    # processed_features['GarageCars'] = pd.Categorical(
    #     processed_features['GarageCars'].astype(int),
    #     categories=range(MIN_GARAGE_CARS, MAX_GARAGE_CARS + 1))
    # year_built_one_hot = pd.get_dummies(processed_features['YearBuilt'], prefix='YearBuilt', drop_first=True)
    # garage_cars_one_hot = pd.get_dummies(processed_features['GarageCars'], prefix='GarageCars', drop_first=True)
    # processed_features = processed_features.drop(labels=['YearBuilt', 'GarageCars'], axis=1)
    # processed_features = pd.concat([processed_features, year_built_one_hot, garage_cars_one_hot], axis=1)
    return processed_features

def _remove_missing_data(dataframe):
    """Removes observations that have missing fields from dataframe."""
    before_length = len(dataframe.index)
    rows_with_missing_data = dataframe.loc[dataframe[FEATURE_FIELDS].isnull().sum(axis=1) > 0].index
    print("Rows with missing data: %s" % rows_with_missing_data)
    dataframe = dataframe.drop(rows_with_missing_data)
    after_length = len(dataframe.index)
    num_rows_removed = before_length - after_length
    print("Removed %d rows with missing features." % num_rows_removed)
    return dataframe
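# Usage note (a sketch; this helper is defined but not invoked above): it
# offers a row-dropping alternative to the mean imputation in
# preprocess_features, e.g.:
# df_train = _remove_missing_data(df_train)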
def preprocess_targets(dataframe):
    """Prepares target features (labels) from dataframe for model training.

    Args:
        dataframe: A Pandas DataFrame containing housing data.
    Returns:
        A DataFrame containing preprocessed targets to be used for model.
    """
    processed_targets = pd.DataFrame()
    processed_targets[TARGET_FIELD] = np.log(dataframe[TARGET_FIELD])
    return processed_targets

# Prepare train targets (labels)
training_targets = preprocess_targets(df_train)
# Split the training dataset into training and validation subsets
training_examples = preprocess_features(df_train)
training_examples, validation_examples, training_targets, validation_targets = model_selection.train_test_split(
    training_examples, training_targets, test_size=0.25)
# Prepare test examples to make predictions on
test_examples = preprocess_features(df_test)
print("Training examples summary:")
display.display(training_examples.describe())
print("Training targets summary:")
display.display(training_targets.describe())
print("Validation examples summary:")
display.display(validation_examples.describe())
print("Validation targets summary:")
display.display(validation_targets.describe())
print("Test examples summary:")
display.display(test_examples.describe())
print("Test targets summary:")
display.display(test_targets.describe())
In [225]:
def output_model_eval(training_targets, training_predictions, validation_targets, validation_predictions):
    # Calculate root mean squared error
    mse = metrics.mean_squared_error(training_targets, training_predictions)
    rmse = math.sqrt(mse)
    print("Training mean squared error: %.4f" % mse)
    print("Training root mean squared error: %.2f" % rmse)
    mse = metrics.mean_squared_error(validation_targets, validation_predictions)
    rmse = math.sqrt(mse)
    print("Validation mean squared error: %.4f" % mse)
    print("Validation root mean squared error: %.2f" % rmse)
    # Variance score: 1 is a perfect prediction
    r2_score = metrics.r2_score(training_targets, training_predictions)
    print("Training variance (r2) score: %.4f" % r2_score)
    r2_score = metrics.r2_score(validation_targets, validation_predictions)
    print("Validation variance (r2) score: %.4f" % r2_score)
    # Plot actual values vs. predictions for both training and validation sets,
    # with the identity line for reference
    fig, ax = plt.subplots(figsize=(12, 9))
    validation_plt = plt.scatter(validation_targets, validation_predictions, color='red', s=10)
    training_plt = plt.scatter(training_targets, training_predictions, color='blue', s=10)
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]
    # Plot both limits against each other (the identity line)
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    plt.legend([validation_plt, training_plt], ['Validation', 'Training'])
    plt.title('Actual vs. predicted')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.show()
In [226]:
def linear_regression(training_examples,
                      training_targets,
                      validation_examples,
                      validation_targets):
    # For models with one-hot encoded variables, you need to set
    # 'fit_intercept' to False to avoid the dummy variable trap.
    model = linear_model.LinearRegression()
    model.fit(training_examples, training_targets)
    training_predictions = model.predict(training_examples)
    validation_predictions = model.predict(validation_examples)
    output_model_eval(training_targets, training_predictions, validation_targets, validation_predictions)
    return model

def ridgeCV_regression(training_examples,
                       training_targets,
                       validation_examples,
                       validation_targets):
    model = linear_model.RidgeCV()
    model.fit(training_examples, training_targets)
    training_predictions = model.predict(training_examples)
    validation_predictions = model.predict(validation_examples)
    output_model_eval(training_targets, training_predictions, validation_targets, validation_predictions)
    return model

model = linear_regression(training_examples, training_targets, validation_examples, validation_targets)
# Apply np.exp to undo the log transform done on the label, SalePrice
test_predictions = np.exp(model.predict(test_examples))
# Keep test_examples' index so the predictions align with the right rows later
test_predictions_df = pd.DataFrame(test_predictions, index=test_examples.index)
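# For comparison (a sketch, not run in the original): the RidgeCV model
# defined above can be trained and evaluated the same way.
ridge_model = ridgeCV_regression(training_examples, training_targets, validation_examples, validation_targets)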
In [245]:
# Prepare final dataframe for submission
test_examples_with_id = pd.concat([df_test['Id'], test_examples], axis=1)
test_examples_with_id['SalePrice'] = test_predictions_df
submission_df = pd.DataFrame({'Id': test_examples_with_id.Id, 'SalePrice': test_examples_with_id.SalePrice})
# Spot-check a single prediction by Id
test_examples_with_id.loc[test_examples_with_id['Id'] == 2036]
Out[245]:
In [246]:
# Convert submission_df to csv
submission_df.to_csv('submission.csv', index=False)