Predicting House Sale Prices with Python MICE + Regularized Linear Models


Author: Mizio Andersen

My notebook is on Kaggle

This notebook implements MICE (Multiple Imputation by Chained Equations) in fields with missing values.

Models used: LassoCV, RidgeCV, xgbRegressor combined with feature selection.

Feature engineering: one-hot encoding

Feature scaling: standardization (centering and scaling) of dataset that removes mean and scales to unit variance. It is necessary for machine learning estimators.

Outlier identification: based on LassoCV it compares rmse with residual of every point predicted vs. actual sale price.

Validation of models by split of training set and plotting predicted vs. actual sale price.

Data Cleaning using Class (python OOP)

We will use a class to hold all our important methods, which is the easiest way to generalize our analysis to treat many cases with few code changes.


In [1]:
# Predict the SalePrice
__author__ = 'mizio'
# import csv as csv
import numpy as np
import pandas as pd
# import matplotlib
# matplotlib.use('TkAgg')
import pylab as plt
from fancyimpute import MICE
# import sys
# sys.path.append('/custom/path/to/modules')
import random
# from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import skew
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LassoCV
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn_pandas import DataFrameMapper
import xgboost as xgb
from matplotlib.backends.backend_pdf import PdfPages
import datetime
from sklearn.cluster import FeatureAgglomeration
import seaborn as sns
# import math


Using TensorFlow backend.
/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
class HousePrices(object):
    def __init__(self):
        self.df = HousePrices.df
        self.df_test = HousePrices.df_test
        self.df_all_feature_var_names = []
        self.df_test_all_feature_var_names = []
        self.timestamp = datetime.datetime.now().strftime('%Y%m%d_%Hh%Mm%Ss')
        self.is_with_log1p_SalePrice = 0

    # Private variables
    _non_numerical_feature_names = []
    _numerical_feature_names = []
    _is_one_hot_encoder = 0
    _feature_names_num = []
    _save_path = '/home/mizio/Documents/Kaggle/HousePrices/prepared_data_train_and_test_saved/'
    _is_not_import_data = 0
    _is_dataframe_with_sale_price = 1

    ''' Pandas Data Frame '''
    df = pd.read_csv('/home/mizio/Documents/Kaggle/HousePrices/train.csv', header=0)
    df_test = pd.read_csv('/home/mizio/Documents/Kaggle/HousePrices/test.csv', header=0)

    @staticmethod
    def square_feet_to_meters(area):
        square_meter_per_square_feet = 0.3048**2
        return area*square_meter_per_square_feet

    @staticmethod
    def extract_numerical_features(df):
        df = df.copy()
        # Identify numerical columns which are of type object
        numerical_features = pd.Series(data=False, index=df.columns, dtype=bool)

        for feature in df.columns:
            if any(tuple(df[feature].apply(lambda x: type(x)) == int)) or \
                            any(tuple(df[feature].apply(lambda x: type(x)) == float)) & \
                            (not any(tuple(df[feature].apply(lambda x: type(x)) == str))):
                numerical_features[feature] = 1
        return numerical_features[numerical_features == 1].index

    @staticmethod
    def extract_non_numerical_features(df):
        df = df.copy()
        return df.select_dtypes(exclude=[np.number])

    def clean_data(self, df):
        df = df.copy()
        is_with_MICE = 1
        if df.isnull().sum().sum() > 0:
            if is_with_MICE:
                # Imputation using MICE
                numerical_features_names = self.extract_numerical_features(df)
                df.loc[:, tuple(numerical_features_names)] = self.estimate_by_mice(
                    df[numerical_features_names])
            else:
                if any(tuple(df.columns == 'SalePrice')):
                    df = df.dropna()
                else:
                    df = df.dropna(1)
                    HousePrices._feature_names_num = pd.Series(data=np.intersect1d(
                        HousePrices._feature_names_num.values, df.columns), dtype=object)
        return df

    @staticmethod
    def encode_labels_in_numeric_format(df, estimated_var):
        # Transform non-numeric labels into numerical values
        # Cons.: gives additional unwanted structure to data, since some values are high and 
        # others low, despite labels
        # where no such comparing measure exists.
        # Alternative: use one-hot-encoding giving all labels their own column represented with 
        # only binary values.
        feature_name_num = ''.join([estimated_var, 'Num'])
        mask = ~df[estimated_var].isnull()
        df[feature_name_num] = df[estimated_var]
        df.loc[mask, tuple([feature_name_num])] = df[estimated_var].factorize()[0][
            mask[mask == 1].index]

    @staticmethod
    def label_classes(df, estimated_var):
        le = LabelEncoder()
        le.fit(df[estimated_var].values)
        return le.classes_

    @staticmethod
    def one_hot_encoder(df, estimated_var):
        df_class = df.copy()
        ohe = OneHotEncoder()
        label_classes = df_class[estimated_var].factorize()[1]
        new_one_hot_encoded_features = [''.join([estimated_var, '_', x]) for x in label_classes]
        mask = ~df[estimated_var].isnull()
        feature_var_values = ohe.fit_transform(np.reshape(np.array(
            df[''.join([estimated_var, 'Num'])][mask].values), 
                                                          (df[mask].shape[0], 
                                                           1))).toarray().astype(int)
        # Create new feature_var columns with one-hot encoded values
        for ite in new_one_hot_encoded_features:
            df[ite] = df[estimated_var]
        df.loc[mask, tuple(new_one_hot_encoded_features)] = feature_var_values

    @staticmethod
    def add_feature_var_name_with_zeros(df, feature_var_name):
        df[feature_var_name] = np.zeros((df.shape[0], 1), dtype=int)
        pass

    @staticmethod
    def feature_var_names_in_training_set_not_in_test_set(feature_var_names_training, 
                                                          feature_var_names_test):
        feature_var_name_addition_list = []
        for feature_var_name in feature_var_names_training:
            if not any(tuple(feature_var_name == feature_var_names_test)):
                feature_var_name_addition_list.append(feature_var_name)
        return np.array(feature_var_name_addition_list)

    def feature_mapping_to_numerical_values(self, df):
        HousePrices._is_one_hot_encoder = 1
        mask = ~df.isnull()
        # Assume that training set has all possible feature_var_names
        # Although it may occur in real life that a training set may hold a feature_var_name. 
        # But it is probably
        # avoided since such features cannot
        # be part of the trained learning algo.
        # Add missing feature_var_names of traning set not occuring in test set. Add these with 
        # zeros in columns.
        if not any(tuple(df.columns == 'SalePrice')):
            # All one-hot encoded feature var names occuring in test data is assigned the private 
            # public varaible
            # df_test_all_feature_var_names.
            self.df_test_all_feature_var_names = df.columns

        _feature_names_num = np.zeros((HousePrices._non_numerical_feature_names.shape[0],), 
                                      dtype=object)
        ith = 0
        for feature_name in HousePrices._non_numerical_feature_names:
            # Create a feature_nameNum list
            feature_name_num = ''.join([feature_name, 'Num'])
            _feature_names_num[ith] = feature_name_num
            ith += 1
            HousePrices.encode_labels_in_numeric_format(df, feature_name)

            if HousePrices._is_one_hot_encoder:
                is_with_label_binarizer = 0
                if is_with_label_binarizer:
                    if feature_name == 'MasVnrType':
                        print('debug')
                    # feature_var_values = mapper_df.fit_transform(df[feature_name][
                    #     mask[feature_name]])
                    mapper_df = DataFrameMapper([(feature_name, LabelBinarizer())], df_out=True)
                    # Check if we need to merge our result into df
                    feature_var_values = mapper_df.fit_transform(df.copy())
                    print(df[feature_name].isnull().sum().sum())
                    print(df[feature_name][mask[feature_name]].isnull().sum().sum())
                    for ite in feature_var_values.columns:
                        df[ite] = feature_var_values[ite]
                else:
                    HousePrices.one_hot_encoder(df, feature_name)
        HousePrices._feature_names_num = pd.Series(data=_feature_names_num, dtype=object)

    @staticmethod
    def feature_agglomeration(df, number_of_clusters=int(df.shape[1] / 1.2)):
        df = df.copy()
        # Todo: find optimal number of clusters for the feature clustering
        # number_of_clusters = int(df.shape[1]/2)

        agglomerated_features = FeatureAgglomeration(n_clusters=number_of_clusters)
        # mask = ~df[features].isnull()
        # mask_index = mask[mask == 1].index
        if any(tuple(df.columns == 'SalePrice')):
            res = agglomerated_features.fit_transform(np.reshape(np.array(df.dropna().values), 
                                                                 df.dropna().shape), 
                                                      y=df.SalePrice.values)
        else:
            res = agglomerated_features.fit_transform(np.reshape(np.array(df.values), df.shape))

        # Obs. case of adding values using df.loc[], remember mask is only possible for a 
        # single feature at a time.
        # print(''.join(['labels:', str(agglomerated_features.labels_)]))
        # print(''.join(['Children:', str(agglomerated_features.children_)]))
        # print(''.join(['number of leaves in the hierarchical tree:', 
        #                str(agglomerated_features.n_leaves_)]))
        # HousePrices.dendrogram(df)
        df = pd.DataFrame(data=res)
        return df

    @staticmethod
    def dendrogram(df, number_of_clusters=int(df.shape[1] / 1.2)):
        # Create Dendrogram
        agglomerated_features = FeatureAgglomeration(n_clusters=number_of_clusters)
        used_networks = np.arange(0, number_of_clusters, dtype=int)
        # used_networks = np.unique(agglomerated_features.labels_)

        # In our case all columns are clustered, which means used_columns is true in every element
        # used_columns = (df.columns.get_level_values(None)
                        # .astype(int)
                        # .isin(used_networks))
        # used_columns = (agglomerated_feature_labels.astype(int).isin(used_networks))
        # df = df.loc[:, used_columns]

        # Create a custom palette to identify the networks
        network_pal = sns.cubehelix_palette(len(used_networks),
                                            light=.9, dark=.1, reverse=True,
                                            start=1, rot=-2)
        network_lut = dict(zip(map(str, df.columns), network_pal))

        # Convert the palette to vectors that will be drawn on the side of the matrix
        networks = df.columns.get_level_values(None)
        network_colors = pd.Series(networks, index=df.columns).map(network_lut)
        sns.set(font="monospace")
        # Create custom colormap
        cmap = sns.diverging_palette(h_neg=210, h_pos=350, s=90, l=30, as_cmap=True)
        cg = sns.clustermap(df.astype(float).corr(), cmap=cmap, linewidths=.5, 
                            row_colors=network_colors, col_colors=network_colors)
        plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
        plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
        plt.show()


    def feature_engineering(self, df):
        # df['LotAreaSquareMeters'] = self.square_feet_to_meters(df.LotArea.values)

        is_skewness_correction_for_all_features = 1
        if is_skewness_correction_for_all_features:
            # Correcting for skewness
            # Treat all numerical variables that were not one-hot encoded
            if any(tuple(df.columns == 'SalePrice')):
                self.is_with_log1p_SalePrice = 1

            numerical_feature_names_of_non_modified_df = HousePrices._numerical_feature_names

            if HousePrices._is_one_hot_encoder:
                numerical_feature_names_of_non_modified_df = numerical_feature_names_of_non_modified_df.values
            else:
                numerical_feature_names_of_non_modified_df = np.concatenate([
                    HousePrices._feature_names_num.values, 
                    numerical_feature_names_of_non_modified_df.values])

            relevant_features = df[numerical_feature_names_of_non_modified_df].columns[
                (df[numerical_feature_names_of_non_modified_df].columns != 'Id')]
            self.skew_correction(df, relevant_features)
        else:
            # Only scale down sale price, since all leave other numerical features standardized.
            if any(tuple(df.columns == 'SalePrice')):
                # self.skew_correction(df, 'SalePrice')  # dropna complaining since no nulls
                self.is_with_log1p_SalePrice = 1
                df.loc[:, tuple(['SalePrice'])] = np.log1p(df.SalePrice)

    @staticmethod
    def skew_correction(df, numerical_features):
        # Skew correction
        # compute skewness
        skewed_feats = df[numerical_features].apply(lambda x: skew(x.dropna()))  
        skewed_feats = skewed_feats[skewed_feats > 0.75]
        skewed_feats = skewed_feats.index
        df.loc[:, tuple(skewed_feats)] = np.log1p(np.asarray(df[skewed_feats], dtype=float))
        # df[skewed_feats] = np.log1p(np.asarray(df[skewed_feats], dtype=float))

    @staticmethod
    def outlier_prediction(x_train, y_train):
        # Use built-in isolation forest or use predicted vs. actual
        # Compute squared residuals of every point
        # Make a threshold criteria for inclusion

        # The prediction returns 1 if sample point is inlier. If outlier prediction returns -1
        rng = np.random.RandomState(42)
        clf_all_features = IsolationForest(max_samples=100, random_state=rng)
        clf_all_features.fit(x_train)

        # Predict if a particular sample is an outlier using all features for higher dimensional 
        # data set.
        y_pred_train = clf_all_features.predict(x_train)

        # Exclude suggested outlier samples for improvement of prediction power/score
        outlier_map_out_train = np.array(map(lambda x: x == 1, y_pred_train))
        x_train_modified = x_train[outlier_map_out_train, ]
        y_train_modified = y_train[outlier_map_out_train, ]

        return x_train_modified, y_train_modified

    def drop_variable_before_preparation(self, df):
        # Acceptable limit of NaN in features
        limit_of_nans = 0.3*df.shape[0]
        # limit_of_nans = 800
        for feature in self.features_with_missing_values_in_dataframe(df).index:
            if df[feature].isnull().sum() > limit_of_nans:
                df = df.drop([feature], axis=1)

        # df = df.drop(['Alley'], axis=1)
        # df = df.drop(['MasVnrType'], axis=1)
        # df = df.drop(["Utilities", "LotFrontage", "Alley", "MasVnrType", "MasVnrArea", 
        #               "BsmtQual",
        #               "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
        #               "Electrical", "FireplaceQu", "GarageType", "GarageYrBlt",
        #               "GarageFinish", "GarageQual", "GarageCond", "PoolQC",
        #               "Fence", "MiscFeature"], axis=1)
        return df

    def drop_variable(self, df):
        # if HousePrices._is_one_hot_encoder:
            # Drop all categorical feature helping columns ('Num')
            # Todo: is it defined when importing data set? _feature_names_num
            # for feature_name in HousePrices._feature_names_num:
            #     df = df.drop([feature_name], axis=1)

        # is_with_feature_agglomeration = 0
        # if is_with_feature_agglomeration:
        #     print(df.shape)
        #     df = HousePrices.feature_agglomeration(df)
        #     print(df.shape)

        # df = df.drop(['Fireplaces'], axis=1)
        df = df.drop(['Id'], axis=1)

        if not any(tuple(df.columns == 'SalePrice')):
            # All feature var names occuring in test data is assigned the public varaible 
            # df_test_all_feature_var_names.
            self.df_test_all_feature_var_names = df.columns
        return df

    def save_dataframe(self, df):
        if HousePrices._is_dataframe_with_sale_price:
            df.to_csv(''.join([HousePrices._save_path, 'train_debug', self.timestamp, '.csv']), 
                      columns=df.columns, index=False)
        else:
            df.to_csv(''.join([HousePrices._save_path, 'test_debug', self.timestamp, '.csv']), 
                      columns=df.columns, index=False)

    @staticmethod
    def load_dataframe():
        if HousePrices._is_dataframe_with_sale_price:
            dataframe_name = 'train_debug'
        else:
            dataframe_name = 'test_debug'

        # one-hot encoded
        # date_time = '20170227_11h46m53s'  # has num col
        # date_time = '20170227_12h19m16s'  # has num col
        # date_time = '20170227_15h36m21s'  # has num col
        # corrected below
        # date_time = '20170227_16h50m45s'  # has bug in prediction
        date_time = '20170228_10h50m23s'
        # not one-hot
        # date_time = '20170226_19h53m38s'
        # date_time = '20170227_14h51m53s'
        # date_time = '20170227_15h04m15s'
        # date_time = '20170227_15h57m09s'
        # date_time = '20170227_16h04m23s'
        # corrected below
        # date_time = '20170228_00h05m40s'
        return pd.read_csv(''.join([HousePrices._save_path, dataframe_name, date_time, '.csv']), 
                           header=0)

    @staticmethod
    def drop_num_features(df):
        # Drop all categorical feature helping columns ('Num')
        # Todo: is it defined when importing data set? _feature_names_num
        for feature_name in HousePrices._feature_names_num:
            df = df.drop([feature_name], axis=1)
        return df

    def prepare_data_random_forest(self, df):
        df = df.copy()
        df = self.drop_variable_before_preparation(df)

        # Todo: correct extract_non_numerical_features() and check if similar things are 
        # new in python 3.6
        HousePrices._non_numerical_feature_names = HousePrices.extract_non_numerical_features(df)._get_axis(1)
        HousePrices._numerical_feature_names = HousePrices.extract_numerical_features(df)
        # HousePrices._non_numerical_feature_names = ['MSZoning', 'LotShape', 'Neighborhood', 
        # 'BldgType', 'HouseStyle',
        # 'Foundation', 'Heating']

        HousePrices._is_not_import_data = 1
        if HousePrices._is_not_import_data:
            self.feature_mapping_to_numerical_values(df)
            if HousePrices._is_one_hot_encoder:
                df = HousePrices.drop_num_features(df)
            self.feature_engineering(df)
            df = self.clean_data(df)
            df = self.feature_scaling(df)

            is_save_dataframe = 1
            if is_save_dataframe:
                self.save_dataframe(df)
                HousePrices._is_dataframe_with_sale_price = 0
        else:
            # Todo: create and save dataframe for debuggin in case of one-hot encoding
            # if not HousePrices._is_not_import_data:
            df = HousePrices.load_dataframe()
            # HousePrices._non_numerical_feature_names = HousePrices.extract_non_numerical_features(df)._get_axis(1)
            # HousePrices._numerical_feature_names = HousePrices.extract_numerical_features(df)
            HousePrices._is_dataframe_with_sale_price = 0

        df = self.drop_variable(df)
        return df

    @staticmethod
    def features_with_null_logical(df, axis=1):
        row_length = len(df._get_axis(0))
        # Axis to count non null values in. aggregate_axis=0 implies counting for every feature
        aggregate_axis = 1 - axis
        features_non_null_series = df.count(axis=aggregate_axis)
        # Whenever count() differs from row_length it implies a null value exists in feature 
        # column and a False in mask
        mask = row_length == features_non_null_series
        return mask

    @staticmethod
    def estimate_by_mice(df):
        df_estimated_var = df.copy()
        random.seed(129)
        mice = MICE()  # model=RandomForestClassifier(n_estimators=100))
        res = mice.complete(np.asarray(df.values, dtype=float))
        df_estimated_var.loc[:, df.columns] = res[:][:]
        return df_estimated_var

    def feature_scaling(self, df):
        df = df.copy()
        # Standardization (centering and scaling) of dataset that removes mean and scales to 
        # unit variance
        standard_scaler = StandardScaler()
        numerical_feature_names_of_non_modified_df = HousePrices._numerical_feature_names
        if any(tuple(df.columns == 'SalePrice')):
            if not HousePrices._is_one_hot_encoder:
                numerical_feature_names_of_non_modified_df = np.concatenate(
                    [HousePrices._feature_names_num.values, 
                     numerical_feature_names_of_non_modified_df.values])
            # Include scaling of SalePrice
            y = df.SalePrice.values
            relevant_features = df[numerical_feature_names_of_non_modified_df].columns[
                (df[numerical_feature_names_of_non_modified_df].columns != 'SalePrice')
                & (df[numerical_feature_names_of_non_modified_df].columns != 'Id')]
            mask = ~df[relevant_features].isnull()
            res = standard_scaler.fit_transform(X=df[relevant_features][mask].values, y=y)
            if (~mask).sum().sum() > 0:
                df = self.standardize_relevant_features(df, relevant_features, res)
            else:
                df.loc[:, tuple(relevant_features)] = res
        else:
            if not HousePrices._is_one_hot_encoder:
                numerical_feature_names_of_non_modified_df = np.concatenate(
                    [HousePrices._feature_names_num.values, 
                     numerical_feature_names_of_non_modified_df.values])

            relevant_features = df[numerical_feature_names_of_non_modified_df].columns[
                (df[numerical_feature_names_of_non_modified_df].columns != 'Id')]
            mask = ~df[relevant_features].isnull()
            res = standard_scaler.fit_transform(df[relevant_features][mask].values)
            if mask.sum().sum() > 0:
                df = self.standardize_relevant_features(df, relevant_features, res)
            else:
                df.loc[:, tuple(relevant_features)] = res
        return df

    @staticmethod
    def standardize_relevant_features(df, relevant_features, res):
        i_column = 0
        for feature in relevant_features:
            mask = ~df[feature].isnull()
            mask_index = mask[mask == 1].index
            df.loc[mask_index, tuple([feature])] = res[:, i_column]
            i_column += 1
        return df

    def missing_values_in_dataframe(self, df):
        mask = self.features_with_null_logical(df)
        print(df[mask[mask == 0].index.values].isnull().sum())
        print('\n')

    def features_with_missing_values_in_dataframe(self, df):
        df = df.copy()
        mask = self.features_with_null_logical(df)
        return df[mask[mask == 0].index.values].isnull().sum()

    @staticmethod
    def rmse_cv(model, x_train, y_train):
        rmse = np.sqrt(-cross_val_score(model, x_train, y_train, 
                                        scoring='neg_mean_squared_error', cv=5))
        return rmse

    @staticmethod
    def rmse(y_pred, y_actual):
        n_samples = np.shape(y_pred)[0]
        squared_residuals_summed = 0.5*sum((y_pred - y_actual)**2)
        return np.sqrt(2.0*squared_residuals_summed/n_samples)

    def outlier_identification(self, model, x_train, y_train):
        # Split the training data into an extra set of test
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train,
                                                                                    y_train)
        print('\nOutlier shapes')
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), 
              np.shape(y_test_split))
        model.fit(x_train_split, y_train_split)
        y_predicted = model.predict(x_test_split)
        residuals = np.absolute(y_predicted - y_test_split)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        outliers_mask = residuals >= rmse_pred_vs_actual
        # outliers_mask = np.insert(np.zeros((np.shape(y_train_split)[0],), dtype=np.int), 
        # np.shape(y_train_split)[0], outliers_mask)
        outliers_mask = np.concatenate([np.zeros((np.shape(y_train_split)[0],), dtype=bool), 
                                        outliers_mask])
        not_an_outlier = outliers_mask == 0
        # Resample the training set from split, since the set was randomly split
        x_out = np.insert(x_train_split, np.shape(x_train_split)[0], x_test_split, axis=0)
        y_out = np.insert(y_train_split, np.shape(y_train_split)[0], y_test_split, axis=0)
        return x_out[not_an_outlier, ], y_out[not_an_outlier, ]

    def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
        # Split the training data into an extra set of test
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, 
                                                                                    y_train)
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), 
              np.shape(y_test_split))
        lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 
                                0.1, 0.3, 0.6, 1],
                        max_iter=50000, cv=10)
        # lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 
        #                          0.1, 0.3, 0.6, 1], cv=10)

        lasso.fit(x_train_split, y_train_split)
        y_predicted = lasso.predict(X=x_test_split)
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', 
                           str(rmse_pred_vs_actual)]))
        plt.xlabel('Actual Sale Price')
        plt.ylabel('Predicted Sale Price')
        plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
        plt.tight_layout()

    def predicted_vs_actual_sale_price_input_model(self, model, x_train, y_train, title_name):
        # Split the training data into an extra set of test
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, 
                                                                                    y_train)
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), 
              np.shape(y_test_split))
        model.fit(x_train_split, y_train_split)
        y_predicted = model.predict(x_test_split)
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', 
                           str(rmse_pred_vs_actual)]))
        plt.xlabel('Actual Sale Price')
        plt.ylabel('Predicted Sale Price')
        plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
        plt.tight_layout()

    def predicted_vs_actual_sale_price_xgb(self, xgb, best_nrounds, xgb_params, x_train, y_train, 
                                           title_name):
        # Split the training data into an extra set of test
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, 
                                                                                    y_train)
        dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
        dtest_split = xgb.DMatrix(x_test_split)

        # res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed, 
        # stratified=False,
        #              early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
        #
        # best_nrounds = res.shape[0] - 1
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), 
              np.shape(y_test_split))
        gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
        y_predicted = gbdt.predict(dtest_split)
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', 
                           str(rmse_pred_vs_actual)]))
        plt.xlabel('Actual Sale Price')
        plt.ylabel('Predicted Sale Price')
        plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
        plt.tight_layout()

    @staticmethod
    def multipage(filename, figs=None):
        pp = PdfPages(filename)
        if figs is None:
            figs = [plt.figure(n) for n in plt.get_fignums()]
        for fig in figs:
            fig.savefig(pp, format='pdf')
        pp.close()

In [3]:
if __name__ == '__main__':
    import xgboost as xgb
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import SGDRegressor
    from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
    # from sklearn.linear_model import LogisticRegression
    from sklearn.feature_selection import SelectFromModel
    # from sklearn.naive_bayes import GaussianNB
    # from sklearn import svm
    # from collections import OrderedDict
    # from sklearn.ensemble import IsolationForest
    import seaborn as sns
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import GridSearchCV
    # from sklearn.model_selection import KFold, train_test_split

Prepare Data

Before we start to do any predictions using our machine learning estimators there are a couple of important steps that needs to be done.

  • Transform categorical features to numerical values
  • Fill in missing values in our data using Multiple Impuations by Chained Equations (MICE).
  • Correct skewness and standardize our data by subtraction of mean and transforming to unit variance
  • Later we will also subtract potential outliers

In [4]:
''' Prepare data '''

    house_prices = HousePrices()
    df_publ = house_prices.df.copy()
    df_test_publ = house_prices.df_test.copy()

    df = house_prices.prepare_data_random_forest(df_publ)
    house_prices.df_all_feature_var_names = df[df.columns[df.columns != 'SalePrice']].columns
    print('\n TRAINING DATA:----------------------------------------------- \n')
    print(df.head(3))
    print('\n')
    print(df.info())
    print('\n')
    print(df.describe())

    # Test data
    Id_df_test = house_prices.df_test['Id']  # Submission column
    df_test = house_prices.prepare_data_random_forest(df_test_publ)
    print('\n TEST DATA:----------------------------------------------- \n')
    print(df_test.info())
    print('\n')
    print(df_test.describe())
    print('\n')

    # Check if feature_var_names of test exist that do not appear in training set
    # feature_var_names_addition_to_training_set = 
    # house_prices.feature_var_names_in_training_set_not_in_test_set(
    #     df_test.columns, df.columns)

    df = df[house_prices.df_test_all_feature_var_names.insert(
        np.shape(house_prices.df_test_all_feature_var_names)[0], 'SalePrice')]
    df_test = df_test[house_prices.df_test_all_feature_var_names]
    df_test_num_features = house_prices.extract_numerical_features(df_test)


[MICE] Completing matrix with shape (1460, 272)
[MICE] Starting imputation round 1/110, elapsed time 0.004
[MICE] Starting imputation round 2/110, elapsed time 0.598
[MICE] Starting imputation round 3/110, elapsed time 1.026
[MICE] Starting imputation round 4/110, elapsed time 1.426
[MICE] Starting imputation round 5/110, elapsed time 1.979
[MICE] Starting imputation round 6/110, elapsed time 2.378
[MICE] Starting imputation round 7/110, elapsed time 2.989
[MICE] Starting imputation round 8/110, elapsed time 3.432
[MICE] Starting imputation round 9/110, elapsed time 3.869
[MICE] Starting imputation round 10/110, elapsed time 4.644
[MICE] Starting imputation round 11/110, elapsed time 5.090
[MICE] Starting imputation round 12/110, elapsed time 5.554
[MICE] Starting imputation round 13/110, elapsed time 5.993
[MICE] Starting imputation round 14/110, elapsed time 6.462
[MICE] Starting imputation round 15/110, elapsed time 6.956
[MICE] Starting imputation round 16/110, elapsed time 7.482
[MICE] Starting imputation round 17/110, elapsed time 7.987
[MICE] Starting imputation round 18/110, elapsed time 8.646
[MICE] Starting imputation round 19/110, elapsed time 9.238
[MICE] Starting imputation round 20/110, elapsed time 9.794
[MICE] Starting imputation round 21/110, elapsed time 10.214
[MICE] Starting imputation round 22/110, elapsed time 10.617
[MICE] Starting imputation round 23/110, elapsed time 11.008
[MICE] Starting imputation round 24/110, elapsed time 11.473
[MICE] Starting imputation round 25/110, elapsed time 12.015
[MICE] Starting imputation round 26/110, elapsed time 12.411
[MICE] Starting imputation round 27/110, elapsed time 12.824
[MICE] Starting imputation round 28/110, elapsed time 13.256
[MICE] Starting imputation round 29/110, elapsed time 13.697
[MICE] Starting imputation round 30/110, elapsed time 14.249
[MICE] Starting imputation round 31/110, elapsed time 14.762
[MICE] Starting imputation round 32/110, elapsed time 15.187
[MICE] Starting imputation round 33/110, elapsed time 15.834
[MICE] Starting imputation round 34/110, elapsed time 16.466
[MICE] Starting imputation round 35/110, elapsed time 16.991
[MICE] Starting imputation round 36/110, elapsed time 17.592
[MICE] Starting imputation round 37/110, elapsed time 18.034
[MICE] Starting imputation round 38/110, elapsed time 18.460
[MICE] Starting imputation round 39/110, elapsed time 18.891
[MICE] Starting imputation round 40/110, elapsed time 19.306
[MICE] Starting imputation round 41/110, elapsed time 19.813
[MICE] Starting imputation round 42/110, elapsed time 20.238
[MICE] Starting imputation round 43/110, elapsed time 20.641
[MICE] Starting imputation round 44/110, elapsed time 21.047
[MICE] Starting imputation round 45/110, elapsed time 21.452
[MICE] Starting imputation round 46/110, elapsed time 21.865
[MICE] Starting imputation round 47/110, elapsed time 22.299
[MICE] Starting imputation round 48/110, elapsed time 22.703
[MICE] Starting imputation round 49/110, elapsed time 23.100
[MICE] Starting imputation round 50/110, elapsed time 23.692
[MICE] Starting imputation round 51/110, elapsed time 24.147
[MICE] Starting imputation round 52/110, elapsed time 24.577
[MICE] Starting imputation round 53/110, elapsed time 25.008
[MICE] Starting imputation round 54/110, elapsed time 25.463
[MICE] Starting imputation round 55/110, elapsed time 25.873
[MICE] Starting imputation round 56/110, elapsed time 26.284
[MICE] Starting imputation round 57/110, elapsed time 26.731
[MICE] Starting imputation round 58/110, elapsed time 27.128
[MICE] Starting imputation round 59/110, elapsed time 27.541
[MICE] Starting imputation round 60/110, elapsed time 27.945
[MICE] Starting imputation round 61/110, elapsed time 28.388
[MICE] Starting imputation round 62/110, elapsed time 28.790
[MICE] Starting imputation round 63/110, elapsed time 29.192
[MICE] Starting imputation round 64/110, elapsed time 29.617
[MICE] Starting imputation round 65/110, elapsed time 30.063
[MICE] Starting imputation round 66/110, elapsed time 30.459
[MICE] Starting imputation round 67/110, elapsed time 30.894
[MICE] Starting imputation round 68/110, elapsed time 31.350
[MICE] Starting imputation round 69/110, elapsed time 31.775
[MICE] Starting imputation round 70/110, elapsed time 32.181
[MICE] Starting imputation round 71/110, elapsed time 32.585
[MICE] Starting imputation round 72/110, elapsed time 33.020
[MICE] Starting imputation round 73/110, elapsed time 33.432
[MICE] Starting imputation round 74/110, elapsed time 33.835
[MICE] Starting imputation round 75/110, elapsed time 34.240
[MICE] Starting imputation round 76/110, elapsed time 34.707
[MICE] Starting imputation round 77/110, elapsed time 35.171
[MICE] Starting imputation round 78/110, elapsed time 35.599
[MICE] Starting imputation round 79/110, elapsed time 36.105
[MICE] Starting imputation round 80/110, elapsed time 36.542
[MICE] Starting imputation round 81/110, elapsed time 37.022
[MICE] Starting imputation round 82/110, elapsed time 37.437
[MICE] Starting imputation round 83/110, elapsed time 37.929
[MICE] Starting imputation round 84/110, elapsed time 38.338
[MICE] Starting imputation round 85/110, elapsed time 38.738
[MICE] Starting imputation round 86/110, elapsed time 39.158
[MICE] Starting imputation round 87/110, elapsed time 39.571
[MICE] Starting imputation round 88/110, elapsed time 39.988
[MICE] Starting imputation round 89/110, elapsed time 40.382
[MICE] Starting imputation round 90/110, elapsed time 40.792
[MICE] Starting imputation round 91/110, elapsed time 41.194
[MICE] Starting imputation round 92/110, elapsed time 41.586
[MICE] Starting imputation round 93/110, elapsed time 42.013
[MICE] Starting imputation round 94/110, elapsed time 42.444
[MICE] Starting imputation round 95/110, elapsed time 42.845
[MICE] Starting imputation round 96/110, elapsed time 43.263
[MICE] Starting imputation round 97/110, elapsed time 43.679
[MICE] Starting imputation round 98/110, elapsed time 44.112
[MICE] Starting imputation round 99/110, elapsed time 44.530
[MICE] Starting imputation round 100/110, elapsed time 44.939
[MICE] Starting imputation round 101/110, elapsed time 45.378
[MICE] Starting imputation round 102/110, elapsed time 45.795
[MICE] Starting imputation round 103/110, elapsed time 46.186
[MICE] Starting imputation round 104/110, elapsed time 46.601
[MICE] Starting imputation round 105/110, elapsed time 47.061
[MICE] Starting imputation round 106/110, elapsed time 47.494
[MICE] Starting imputation round 107/110, elapsed time 47.896
[MICE] Starting imputation round 108/110, elapsed time 48.286
[MICE] Starting imputation round 109/110, elapsed time 48.725
[MICE] Starting imputation round 110/110, elapsed time 49.110

 TRAINING DATA:----------------------------------------------- 

   MSSubClass MSZoning  LotFrontage   LotArea Street LotShape LandContour  \
0    0.424462       RL    -0.053910 -0.133270   Pave      Reg         Lvl   
1   -1.125202       RL     0.594751  0.113413   Pave      Reg         Lvl   
2    0.424462       RL     0.086885  0.420049   Pave      IR1         Lvl   

  Utilities LotConfig LandSlope         ...          SaleType_CWD  \
0    AllPub    Inside       Gtl         ...                   0.0   
1    AllPub       FR2       Gtl         ...                   0.0   
2    AllPub    Inside       Gtl         ...                   0.0   

  SaleType_ConLw SaleType_Con SaleType_Oth SaleCondition_Normal  \
0            0.0          0.0          0.0                  1.0   
1            0.0          0.0          0.0                  1.0   
2            0.0          0.0          0.0                  1.0   

   SaleCondition_Abnorml  SaleCondition_Partial  SaleCondition_AdjLand  \
0                    0.0                    0.0                    0.0   
1                    0.0                    0.0                    0.0   
2                    0.0                    0.0                    0.0   

   SaleCondition_Alloca SaleCondition_Family  
0                   0.0                  0.0  
1                   0.0                  0.0  
2                   0.0                  0.0  

[3 rows x 309 columns]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 309 entries, MSSubClass to SaleCondition_Family
dtypes: float64(271), object(38)
memory usage: 3.4+ MB
None


         MSSubClass   LotFrontage       LotArea   OverallQual   OverallCond  \
count  1.460000e+03  1.460000e+03  1.460000e+03  1.460000e+03  1.460000e+03   
mean  -4.659895e-16 -5.329071e-16 -1.947909e-15  1.387018e-16  3.540547e-16   
std    1.000343e+00  1.000343e+00  1.000343e+00  1.000343e+00  1.000343e+00   
min   -1.125202e+00 -3.533631e+00 -3.751177e+00 -3.688413e+00 -4.112970e+00   
25%   -1.125202e+00 -3.034390e-01 -3.500987e-01 -7.951515e-01 -5.171998e-01   
50%    1.642623e-01  4.064547e-02  8.878817e-02 -7.183611e-02 -5.171998e-01   
75%    6.450728e-01  5.554044e-01  4.795310e-01  6.514792e-01  3.817427e-01   
max    2.083189e+00  4.886371e+00  6.126490e+00  2.821425e+00  3.078570e+00   

          YearBuilt  YearRemodAdd    MasVnrArea    BsmtFinSF1    BsmtFinSF2  \
count  1.460000e+03  1.460000e+03  1.460000e+03  1.460000e+03  1.460000e+03   
mean   1.046347e-15  4.496860e-15  2.433366e-17  1.581688e-17  9.855130e-17   
std    1.000343e+00  1.000343e+00  1.000343e+00  1.000343e+00  1.000343e+00   
min   -3.287824e+00 -1.689368e+00 -8.133884e-01 -1.414140e+00 -3.553424e-01   
25%   -5.719226e-01 -8.656586e-01 -8.133884e-01 -1.414140e+00 -3.553424e-01   
50%    5.737148e-02  4.425864e-01 -8.133884e-01  5.757928e-01 -3.553424e-01   
75%    9.516316e-01  9.271216e-01  1.133405e+00  7.823736e-01 -3.553424e-01   
max    1.282839e+00  1.217843e+00  1.999008e+00  1.474007e+00  3.600612e+00   

               ...           SaleType_CWD  SaleType_ConLw  SaleType_Con  \
count          ...            1460.000000     1460.000000   1460.000000   
mean           ...               0.002740        0.003425      0.001370   
std            ...               0.052289        0.058440      0.036999   
min            ...               0.000000        0.000000      0.000000   
25%            ...               0.000000        0.000000      0.000000   
50%            ...               0.000000        0.000000      0.000000   
75%            ...               0.000000        0.000000      0.000000   
max            ...               1.000000        1.000000      1.000000   

       SaleType_Oth  SaleCondition_Normal  SaleCondition_Abnorml  \
count   1460.000000           1460.000000            1460.000000   
mean       0.002055              0.820548               0.069178   
std        0.045299              0.383862               0.253844   
min        0.000000              0.000000               0.000000   
25%        0.000000              1.000000               0.000000   
50%        0.000000              1.000000               0.000000   
75%        0.000000              1.000000               0.000000   
max        1.000000              1.000000               1.000000   

       SaleCondition_Partial  SaleCondition_AdjLand  SaleCondition_Alloca  \
count            1460.000000            1460.000000           1460.000000   
mean                0.085616               0.002740              0.008219   
std                 0.279893               0.052289              0.090317   
min                 0.000000               0.000000              0.000000   
25%                 0.000000               0.000000              0.000000   
50%                 0.000000               0.000000              0.000000   
75%                 0.000000               0.000000              0.000000   
max                 1.000000               1.000000              1.000000   

       SaleCondition_Family  
count           1460.000000  
mean               0.013699  
std                0.116277  
min                0.000000  
25%                0.000000  
50%                0.000000  
75%                0.000000  
max                1.000000  

[8 rows x 271 columns]
[MICE] Completing matrix with shape (1459, 255)
[MICE] Starting imputation round 1/110, elapsed time 0.006
[MICE] Starting imputation round 2/110, elapsed time 0.992
[MICE] Starting imputation round 3/110, elapsed time 1.762
[MICE] Starting imputation round 4/110, elapsed time 2.588
[MICE] Starting imputation round 5/110, elapsed time 3.453
[MICE] Starting imputation round 6/110, elapsed time 4.352
[MICE] Starting imputation round 7/110, elapsed time 5.251
[MICE] Starting imputation round 8/110, elapsed time 6.055
[MICE] Starting imputation round 9/110, elapsed time 6.836
[MICE] Starting imputation round 10/110, elapsed time 7.626
[MICE] Starting imputation round 11/110, elapsed time 8.414
[MICE] Starting imputation round 12/110, elapsed time 9.223
[MICE] Starting imputation round 13/110, elapsed time 9.996
[MICE] Starting imputation round 14/110, elapsed time 10.818
[MICE] Starting imputation round 15/110, elapsed time 11.589
[MICE] Starting imputation round 16/110, elapsed time 12.383
[MICE] Starting imputation round 17/110, elapsed time 13.138
[MICE] Starting imputation round 18/110, elapsed time 13.918
[MICE] Starting imputation round 19/110, elapsed time 14.654
[MICE] Starting imputation round 20/110, elapsed time 15.470
[MICE] Starting imputation round 21/110, elapsed time 16.225
[MICE] Starting imputation round 22/110, elapsed time 17.003
[MICE] Starting imputation round 23/110, elapsed time 17.775
[MICE] Starting imputation round 24/110, elapsed time 18.725
[MICE] Starting imputation round 25/110, elapsed time 19.614
[MICE] Starting imputation round 26/110, elapsed time 20.505
[MICE] Starting imputation round 27/110, elapsed time 21.313
[MICE] Starting imputation round 28/110, elapsed time 22.053
[MICE] Starting imputation round 29/110, elapsed time 22.838
[MICE] Starting imputation round 30/110, elapsed time 23.578
[MICE] Starting imputation round 31/110, elapsed time 24.363
[MICE] Starting imputation round 32/110, elapsed time 25.111
[MICE] Starting imputation round 33/110, elapsed time 25.907
[MICE] Starting imputation round 34/110, elapsed time 26.648
[MICE] Starting imputation round 35/110, elapsed time 27.451
[MICE] Starting imputation round 36/110, elapsed time 28.202
[MICE] Starting imputation round 37/110, elapsed time 28.986
[MICE] Starting imputation round 38/110, elapsed time 29.735
[MICE] Starting imputation round 39/110, elapsed time 30.501
[MICE] Starting imputation round 40/110, elapsed time 31.261
[MICE] Starting imputation round 41/110, elapsed time 32.003
[MICE] Starting imputation round 42/110, elapsed time 32.782
[MICE] Starting imputation round 43/110, elapsed time 33.581
[MICE] Starting imputation round 44/110, elapsed time 34.331
[MICE] Starting imputation round 45/110, elapsed time 35.138
[MICE] Starting imputation round 46/110, elapsed time 35.882
[MICE] Starting imputation round 47/110, elapsed time 36.649
[MICE] Starting imputation round 48/110, elapsed time 37.412
[MICE] Starting imputation round 49/110, elapsed time 38.257
[MICE] Starting imputation round 50/110, elapsed time 39.119
[MICE] Starting imputation round 51/110, elapsed time 40.022
[MICE] Starting imputation round 52/110, elapsed time 40.827
[MICE] Starting imputation round 53/110, elapsed time 41.595
[MICE] Starting imputation round 54/110, elapsed time 42.328
[MICE] Starting imputation round 55/110, elapsed time 43.085
[MICE] Starting imputation round 56/110, elapsed time 43.803
[MICE] Starting imputation round 57/110, elapsed time 44.613
[MICE] Starting imputation round 58/110, elapsed time 45.355
[MICE] Starting imputation round 59/110, elapsed time 46.164
[MICE] Starting imputation round 60/110, elapsed time 46.918
[MICE] Starting imputation round 61/110, elapsed time 47.722
[MICE] Starting imputation round 62/110, elapsed time 48.490
[MICE] Starting imputation round 63/110, elapsed time 49.311
[MICE] Starting imputation round 64/110, elapsed time 50.087
[MICE] Starting imputation round 65/110, elapsed time 50.935
[MICE] Starting imputation round 66/110, elapsed time 51.698
[MICE] Starting imputation round 67/110, elapsed time 52.518
[MICE] Starting imputation round 68/110, elapsed time 53.280
[MICE] Starting imputation round 69/110, elapsed time 54.066
[MICE] Starting imputation round 70/110, elapsed time 54.827
[MICE] Starting imputation round 71/110, elapsed time 55.589
[MICE] Starting imputation round 72/110, elapsed time 56.348
[MICE] Starting imputation round 73/110, elapsed time 57.130
[MICE] Starting imputation round 74/110, elapsed time 57.894
[MICE] Starting imputation round 75/110, elapsed time 58.773
[MICE] Starting imputation round 76/110, elapsed time 59.657
[MICE] Starting imputation round 77/110, elapsed time 60.469
[MICE] Starting imputation round 78/110, elapsed time 61.289
[MICE] Starting imputation round 79/110, elapsed time 62.077
[MICE] Starting imputation round 80/110, elapsed time 62.809
[MICE] Starting imputation round 81/110, elapsed time 63.578
[MICE] Starting imputation round 82/110, elapsed time 64.295
[MICE] Starting imputation round 83/110, elapsed time 65.043
[MICE] Starting imputation round 84/110, elapsed time 65.783
[MICE] Starting imputation round 85/110, elapsed time 66.513
[MICE] Starting imputation round 86/110, elapsed time 67.260
[MICE] Starting imputation round 87/110, elapsed time 68.023
[MICE] Starting imputation round 88/110, elapsed time 68.745
[MICE] Starting imputation round 89/110, elapsed time 69.492
[MICE] Starting imputation round 90/110, elapsed time 70.212
[MICE] Starting imputation round 91/110, elapsed time 70.966
[MICE] Starting imputation round 92/110, elapsed time 71.666
[MICE] Starting imputation round 93/110, elapsed time 72.399
[MICE] Starting imputation round 94/110, elapsed time 73.119
[MICE] Starting imputation round 95/110, elapsed time 73.894
[MICE] Starting imputation round 96/110, elapsed time 74.620
[MICE] Starting imputation round 97/110, elapsed time 75.349
[MICE] Starting imputation round 98/110, elapsed time 76.117
[MICE] Starting imputation round 99/110, elapsed time 76.839
[MICE] Starting imputation round 100/110, elapsed time 77.615
[MICE] Starting imputation round 101/110, elapsed time 78.350
[MICE] Starting imputation round 102/110, elapsed time 79.258
[MICE] Starting imputation round 103/110, elapsed time 80.184
[MICE] Starting imputation round 104/110, elapsed time 81.049
[MICE] Starting imputation round 105/110, elapsed time 81.854
[MICE] Starting imputation round 106/110, elapsed time 82.571
[MICE] Starting imputation round 107/110, elapsed time 83.333
[MICE] Starting imputation round 108/110, elapsed time 84.060
[MICE] Starting imputation round 109/110, elapsed time 84.835
[MICE] Starting imputation round 110/110, elapsed time 85.566

 TEST DATA:----------------------------------------------- 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 292 entries, MSSubClass to SaleCondition_AdjLand
dtypes: float64(254), object(38)
memory usage: 3.3+ MB
None


         MSSubClass   LotFrontage       LotArea   OverallQual   OverallCond  \
count  1.459000e+03  1.459000e+03  1.459000e+03  1.459000e+03  1.459000e+03   
mean  -2.167180e-16 -7.195524e-16 -4.748315e-16 -2.678537e-16 -3.530798e-17   
std    1.000343e+00  1.000343e+00  1.000343e+00  1.000343e+00  1.000343e+00   
min   -1.116585e+00 -2.310566e+00 -3.557719e+00 -3.535998e+00 -4.090153e+00   
25%   -1.116585e+00 -4.236499e-01 -3.401957e-01 -7.511012e-01 -4.974178e-01   
50%    1.573775e-01 -1.826211e-02  1.387318e-01 -5.487716e-02 -4.974178e-01   
75%    6.324078e-01  4.956167e-01  5.437875e-01  6.413469e-01  4.007660e-01   
max    2.053236e+00  6.349894e+00  3.716701e+00  2.730019e+00  3.095317e+00   

          YearBuilt  YearRemodAdd    MasVnrArea    BsmtFinSF1    BsmtFinSF2  \
count  1.459000e+03  1.459000e+03  1.459000e+03  1.459000e+03  1.459000e+03   
mean  -1.241867e-16  2.140394e-15 -7.548603e-17  2.264581e-16  9.740133e-17   
std    1.000343e+00  1.000343e+00  1.000343e+00  1.000343e+00  1.000343e+00   
min   -3.040119e+00 -1.593638e+00 -7.910846e-01 -1.421895e+00 -3.700647e-01   
25%   -6.042787e-01 -9.782021e-01 -7.910846e-01 -1.421895e+00 -3.700647e-01   
50%    5.405660e-02  3.946944e-01 -7.910846e-01  5.516894e-01 -3.700647e-01   
75%    9.757261e-01  9.627894e-01  1.164030e+00  8.087163e-01 -3.700647e-01   
max    1.271977e+00  1.246837e+00  1.958328e+00  1.372001e+00  3.466167e+00   

               ...            SaleType_Con  SaleType_ConLw  SaleType_ConLI  \
count          ...             1459.000000     1459.000000     1459.000000   
mean           ...                0.002057        0.002055        0.002740   
std            ...                0.045314        0.045314        0.052307   
min            ...                0.000000       -0.001054       -0.002395   
25%            ...                0.000000        0.000000        0.000000   
50%            ...                0.000000        0.000000        0.000000   
75%            ...                0.000000        0.000000        0.000000   
max            ...                1.000000        1.000000        1.000000   

       SaleType_CWD  SaleCondition_Normal  SaleCondition_Partial  \
count   1459.000000           1459.000000            1459.000000   
mean       0.005490              0.825223               0.082248   
std        0.073871              0.379907               0.274837   
min        0.000000              0.000000               0.000000   
25%        0.000000              1.000000               0.000000   
50%        0.000000              1.000000               0.000000   
75%        0.000000              1.000000               0.000000   
max        1.000000              1.000000               1.000000   

       SaleCondition_Abnorml  SaleCondition_Family  SaleCondition_Alloca  \
count            1459.000000           1459.000000           1459.000000   
mean                0.061001              0.017820              0.008225   
std                 0.239414              0.132344              0.090348   
min                 0.000000              0.000000              0.000000   
25%                 0.000000              0.000000              0.000000   
50%                 0.000000              0.000000              0.000000   
75%                 0.000000              0.000000              0.000000   
max                 1.000000              1.000000              1.000000   

       SaleCondition_AdjLand  
count            1459.000000  
mean                0.005483  
std                 0.073871  
min                 0.000000  
25%                 0.000000  
50%                 0.000000  
75%                 0.000000  
max                 1.000000  

[8 rows x 254 columns]


Agglomerated features in clusters


In [5]:
# Agglomeration of features
    is_with_agglomeration = 1
    if is_with_agglomeration:
        df_merged_train_and_test = pd.DataFrame(data=np.concatenate(
            (df[df.columns[df.columns != 'SalePrice']].values, df_test.values)), 
                                                columns=house_prices.df_test_all_feature_var_names)
        df_merged_train_and_test = df_merged_train_and_test[df_test_num_features]
        df_merged_train_and_test_agglom = HousePrices.feature_agglomeration(df_merged_train_and_test)
        train_data = np.concatenate((df_merged_train_and_test_agglom.values[:df.shape[0], 0::],
                                     np.reshape(df.SalePrice.values, 
                                                (df.SalePrice.values.shape[0], 1))), axis=1)
        test_data = df_merged_train_and_test_agglom.values[df.shape[0]::, 0::]
    else:
        df[np.concatenate((df_test_num_features, ['SalePrice']))].values
        test_data = df_test[df_test_num_features].values
        # print(sum(np.isnan(train_data)).sum()) # 348 is nan

In [6]:
is_simple_model = 0
    if is_simple_model:
        df_simple_model = house_prices.clean_data(df_publ)
        # df_simple_model = house_prices.prepare_data_random_forest(df_publ)

        # Prepare simple model
        df_test_simple_model = house_prices.extract_numerical_features(df_test_publ)
        is_remove_null = 0
        if is_remove_null:
            df_test_simple_model = df_test_simple_model.dropna(axis=1)
        else:
            df_test_simple_model = house_prices.estimate_by_mice(df_test_simple_model)

        df_simple_model = df_simple_model[df_test_simple_model.columns.insert(
            np.shape(df_test_simple_model.columns)[0], 'SalePrice')]

        train_data_simple = df_simple_model.values
        test_data_simple = df_test_simple_model.values
        x_train = train_data_simple[0::, :-1]
        y_train = train_data_simple[0::, -1]

        forest = RandomForestClassifier(max_features='sqrt')  
        # n_estimators=100), n_jobs=-1), max_depth=None,
        # min_samples_split=2, random_state=0)#, max_features=np.sqrt(5))
        parameter_grid = {'max_depth': [4, 5, 6, 7, 8], 'n_estimators': [200, 210, 240, 250], 
                          'criterion': ['gini', 'entropy']}
        cross_validation = StratifiedKFold(random_state=None, shuffle=False)  # , n_folds=10)
        grid_search = GridSearchCV(forest, param_grid=parameter_grid, cv=cross_validation, 
                                   n_jobs=24)
        grid_search.fit(x_train, y_train)
        # output = grid_search.predict(X=test_data_simple)
        print('Best score: {}'.format(grid_search.best_score_))
        print('Best parameters: {}'.format(grid_search.best_params_))

Explore Data and Missing Values


It is always a good idea to check if your data munging has worked. Below we check if there are any forgotten missing values in our numerical features concerning also transformed categorical features.


In [7]:
''' Explore data '''
    explore_data = 1
    if explore_data:

        is_missing_value_exploration = 1
        if is_missing_value_exploration:
            # Imputation for the 11 columns with none or nan values in the test data.
            # Using only numerical feature columns as first approach.

            # Train Data: numeric feature columns with none or nan in test data
            print('\nColumns in train data with none/nan values:\n')
            print('\nTraining set numerical features\' missing values')
            df_publ_numerical_features = house_prices.extract_numerical_features(df_publ)
            house_prices.missing_values_in_dataframe(df_publ[df_publ_numerical_features])

            # Test Data: Print numeric feature columns with none/nan in test data
            print('\nColumns in test data with none/nan values:\n')
            print('\nTest set numerical features\' missing values')
            df_test_publ_numerical_features = house_prices.extract_numerical_features(df_test_publ)
            house_prices.missing_values_in_dataframe(df_test_publ[df_test_publ_numerical_features])

            # Imputation method applied to numeric columns in test data with none/nan values
            # print("Training set missing values after imputation")
            # df_imputed = house_prices.estimate_by_mice(df_publ_numerical_features)
            # house_prices.missing_values_in_dataframe(df_imputed)
            # print("Testing set missing values after imputation")
            # df_test_imputed = house_prices.estimate_by_mice(df_test_publ_numerical_features)
            # house_prices.missing_values_in_dataframe(df_test_imputed)

            print('\nTotal Records for values: {}\n'.format(house_prices.df.count().sum() + 
                                                            house_prices.df_test.count().sum()))
            print('Total Records for missing values: {}\n'
                  .format(house_prices.df.isnull().sum().sum() + 
                          house_prices.df_test.isnull().sum().sum()))

            print('All Training set missing values')
            house_prices.missing_values_in_dataframe(house_prices.df)

            print('All Test set missing values')
            house_prices.missing_values_in_dataframe(house_prices.df_test)

            print("\n=== AFTER IMPUTERS ===\n")
            print("=== Check for missing values in set ===")
            # Todo: fix the bug that "Total Records for missing values" stays unchanged while
            # "Total Records for values" changes
            print('\nTotal Records for values: {}\n'.format(df.count().sum() + 
                                                            df_test.count().sum()))
            print('Total Records for missing values: {}\n'.format(df.isnull().sum().sum() + 
                                                                  df_test.isnull().sum().sum()))

            # Train Data: numeric feature columns with none or nan in test data
            print('\nColumns in train data with none/nan values:\n')
            print('\nTraining set numerical features\' missing values')
            df_numerical_features = house_prices.extract_numerical_features(df)
            house_prices.missing_values_in_dataframe(df[df_numerical_features])

            # Test Data: Print numeric feature columns with none/nan in test data
            print('\nColumns in test data with none/nan values:\n')
            print('\nTest set numerical features\' missing values')
            df_test_numerical_features = house_prices.extract_numerical_features(df_test)
            house_prices.missing_values_in_dataframe(df_test[df_test_numerical_features])

            # SalePrice square meter plot
            # Overview of data with histograms
            # feature_to_plot = ['LotAreaSquareMeters', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt']
            # feature_to_plot = ['YearBuilt', 'SalePrice', 'LotAreaSquareMeters', 'OverallCond', 
            #                   'TotalBsmtSF']
            # df_imputed_prepared = df_imputed.copy()
            # house_prices.feature_engineering(df_imputed_prepared)
            # bin_number = 25
            # df[df.LotAreaSquareMeters <= 2500.0][feature_to_plot].hist(bins=bin_number, alpha=.5)
            # df_imputed_prepared[df_imputed_prepared.LotAreaSquareMeters <= 
            #                    2500.0][feature_to_plot]
            # .hist(bins=bin_number, alpha=.5)


Columns in train data with none/nan values:


Training set numerical features' missing values
LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64



Columns in test data with none/nan values:


Test set numerical features' missing values
LotFrontage     227
MasVnrArea       15
BsmtFinSF1        1
BsmtFinSF2        1
BsmtUnfSF         1
TotalBsmtSF       1
BsmtFullBath      2
BsmtHalfBath      2
GarageYrBlt      78
GarageCars        1
GarageArea        1
dtype: int64



Total Records for values: 221015

Total Records for missing values: 13965

All Training set missing values
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


All Test set missing values
MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        16
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars         1
GarageArea         1
GarageQual        78
GarageCond        78
PoolQC          1456
Fence           1169
MiscFeature     1408
SaleType           1
dtype: int64



=== AFTER IMPUTERS ===

=== Check for missing values in set ===

Total Records for values: 852733

Total Records for missing values: 1075


Columns in train data with none/nan values:


Training set numerical features' missing values
Series([], dtype: float64)



Columns in test data with none/nan values:


Test set numerical features' missing values
Series([], dtype: float64)


Visualizing Data


Histograms are quick ways to see how our data is distributed. We notice that we need to correct for skewness and that the scale of sale price may be much larger than other features.


In [8]:
# Histogram of sale prices
            plt.figure()
            house_prices.df[['SalePrice']].hist(bins='auto', alpha=.5)
            plt.show()


<matplotlib.figure.Figure at 0x7faf61921fd0>

In [9]:
# Histogram of sale prices after data munging
            plt.figure()
            df[['SalePrice']].hist(bins='auto', alpha=.5)
            plt.show()


<matplotlib.figure.Figure at 0x7faef7fffbe0>

We could ask the question in what month or in what year were most houses sold?


In [10]:
# We expect more houses to be sold in the summer. Which is also the 
            # case month MM, year YYYY.
            # Sale tops in juli
            plt.figure()
            house_prices.df[['MoSold', 'YrSold']].hist(bins='auto', alpha=.5)
            plt.show()


<matplotlib.figure.Figure at 0x7faf28166748>

Two heatmaps showing correlations between features before and after data munging. In the second plot a lot more features occur, since categorical features have been transformed to numerical. Although it may be a little difficult to read the names of the many features in the second plot, it can still give an overview. Notice that sale price is the last feature occurring on both axes.


In [11]:
# Categorical plot with seaborn
        is_categorical_plot = 1
        if is_categorical_plot:
            # sns.countplot(y='MSZoning', hue='MSSubClass', data=df, palette='Greens_d')
            # plt.show()
            # sns.stripplot(x='SalePrice', y='MSZoning', data=df, jitter=True, hue='LandContour')
            # plt.show()
            # sns.boxplot(x='SalePrice', y='MSZoning', data=df, hue='MSSubClass')
            # plt.show()
            
            # Heatmap of feature correlations
            plt.figure(figsize=(10, 8))
            correlations = house_prices.df.corr()
            sns.heatmap(correlations, vmax=0.8, square=True)
            plt.show()

            # plt.figure()
            # sns.stripplot(x='SalePrice', y='OverallQual', data=house_prices.df, jitter=True)
            # plt.show()



In [12]:
# Heatmap of feature correlations after data munging
            plt.figure(figsize=(22, 14))
            correlations = df[house_prices.extract_numerical_features(df)].corr()
            sns.heatmap(correlations, vmax=0.8, square=True)
            plt.show()


Some features are highly correlated and it may therefore improve predictions if we apply feature agglomeration to our data


In [13]:
# Plot of agglomerated features in heatmap before data munging
            num_features = house_prices.extract_numerical_features(house_prices.df_test)
            
            
            df_merged_train_and_test_before_munging = pd.DataFrame(
                data=np.concatenate((house_prices.df[house_prices.df.columns[
                    house_prices.df.columns != 'SalePrice']].values, 
                                     house_prices.df_test.values)), columns=house_prices.df_test.columns)
            HousePrices.dendrogram(df_merged_train_and_test_before_munging[num_features])


/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [14]:
# Plot of agglomerated features in heatmap after data munging
            HousePrices.dendrogram(df_merged_train_and_test)


/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

With a heatmap we are able to discover how much some features depend on the sale price. Sale price is indicated inside each small box and color scale is one the right.


In [15]:
# Produce a heatmap with coefficients
            plt.figure()
            heat_data = house_prices.df.pivot_table(values='SalePrice', index=['OverallQual'], 
                                                    columns=['GarageCars'])
            htmp = sns.heatmap(heat_data, annot=True, cmap='YlGn')
            plt.show()


Another convenient way to use a heatmap is by showing correlation coefficients between features. This makes it easy to see what is very much correlated with sale price and what features are less correlated. Two plots are shown for cases before and after data munging. Notice the white empty lines in the first plot occur due to missing values.


In [16]:
# Zoom of heatmap with coefficients
            plt.figure(figsize=(20, 12))
            top_features = 10
            columns = correlations.nlargest(top_features, 'SalePrice')['SalePrice'].index
            correlation_coeff = np.corrcoef(house_prices.df[columns].values.T)
            sns.set(font_scale=1.20)
            coeff_heatmap = sns.heatmap(correlation_coeff, annot=True, cmap='YlGn', cbar=True, 
                                        square=True, fmt='.2f', annot_kws={'size': 10}, 
                                        yticklabels=columns.values, xticklabels=columns.values)
            plt.show()
            plt.close()



In [17]:
# Zoom of heatmap with coefficients after data munging
            plt.figure(figsize=(20, 12))
            top_features = 10
            columns = correlations.nlargest(top_features, 'SalePrice')['SalePrice'].index
            correlation_coeff = np.corrcoef(df[columns].values.T)
            sns.set(font_scale=1.20)
            coeff_heatmap = sns.heatmap(correlation_coeff, annot=True, cmap='YlGn', cbar=True, 
                                        square=True, fmt='.2f', annot_kws={'size': 10}, 
                                        yticklabels=columns.values, xticklabels=columns.values)
            plt.show()
            plt.close()


A boxplot is a good way to visualize how our data is positioned with respect to sale price and to indicate the mean value, which is shown by a straight line inside each box.


In [18]:
plt.figure()
            sns.boxplot(y='SalePrice', x='OverallQual', data=house_prices.df)
            plt.show()



In [19]:
plt.figure()
            sns.boxplot(x='SalePrice', y='MSZoning', data=df)
            plt.show()


We expect that house price may highly depend on neighborhood. In estate business location is a very important element.


In [20]:
plt.figure()
            sns.boxplot(x='SalePrice', y='Neighborhood', data=df)
            plt.show()



In [21]:
plt.figure()
            sns.boxplot(x='SalePrice', y='HouseStyle', data=df)
            plt.show()



In [22]:
plt.figure()
            sns.boxplot(x='SalePrice', y='SaleCondition', data=df)
            plt.show()



In [23]:
# sns.violinplot(x='SalePrice', y='MSZoning', data=df)
            # plt.show()
            # sns.violinplot(x='SalePrice', y='Neighborhood', data=df)
            # plt.show()

            # Arbitrary estimate, using the mean by default.
            # It also uses bootstrapping to compute a confidence interval around the estimate 
            # and plots that using error bars
            # sns.barplot(x='SalePrice', y='MSZoning', hue='LotShape', data=df)
            # plt.show()
            # sns.barplot(x='SalePrice', y='Neighborhood', data=df)#, hue='LotShape')
            # plt.show()
            # sns.barplot(x='SalePrice', y='SaleCondition', data=df)#, hue='LotShape')
            # plt.show()
            
            plt.figure()
            sns.barplot(x='SalePrice', y='HouseStyle', data=df)#, hue='LotShape')
            plt.show()


Let us plot a few features and study them with respect to SalePrice and GrLivArea.


In [24]:
sns.pointplot(x='SalePrice', y='MSZoning', hue='LotShape', data=df,
                          palette={"Reg": "g", "IR1": "m", "IR2": "b", "IR3": "r"}, 
                          markers=["^", "o", 'x', '<'], 
                          linestyles=["-", "--", '-.', ':'])
            plt.show()

            g = sns.PairGrid(df, x_vars=['SalePrice', 'GrLivArea'], y_vars=['MSZoning', 'Utilities', 
                                                                          'LotShape'], 
                             aspect=.75, size=3.5)
            g.map(sns.violinplot, palette='pastel')
            plt.show()

            # Quite slow
            # sns.swarmplot(x='MSZoning', y='MSSubClass', data=df, hue='LandContour')
            # plt.show()


It is important to study if our regularized linear models are able to find optimal regularization parameters alpha. A good parameter is able to prevent overfitting which is also called high variance.


In [25]:
is_choose_optimal_regularization_param = 1
       if is_choose_optimal_regularization_param:
            # Choose optimal value for alpha (regularization parameter) in Lasso and Ridge
            x_train = train_data[0::, :-1]
            y_train = train_data[0::, -1]
            alphas = [0.05, 0.1, 0.3, 1, 3, 4, 10, 15, 30, 50, 75, 100, 110, 130]

            ridge = RidgeCV(alphas=alphas)
            ridge.fit(x_train, y_train)
            alpha = ridge.alpha_
            print("Best Ridge alpha:", alpha)

            alphas_lasso = [1e-6, 1e-5, 0.00005, 0.0001, 0.0005, 0.001, 0.01, 0.03, 0.06, 0.09, 
                            0.1, 0.15] 
            # [1, 0.1, 0.001, 0.0005]
            lasso = LassoCV(alphas=alphas_lasso)
            lasso.fit(x_train, y_train)
            alpha = lasso.alpha_
            print("Best Lasso alpha:", alpha)

            cv_ridge = [house_prices.rmse_cv(Ridge(alpha=alpha), x_train, y_train).mean() 
                        for alpha in alphas]
            cv_ridge = pd.Series(np.expm1(cv_ridge), index=alphas)
            cv_ridge = pd.Series(cv_ridge, index=alphas)
            plt.figure()
            cv_ridge.plot(title = "Ridge, Validation")
            plt.xlabel('alpha')
            plt.ylabel('rmse')
            plt.show()
            print("\nRidge optimal regularization parameter alpha has rmse = ")
            print(cv_ridge.min())


Best Ridge alpha: 0.3
Best Lasso alpha: 0.0005
/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Ridge optimal regularization parameter alpha has rmse = 
0.142770377379

In [26]:
# cv_lasso = [house_prices.rmse_cv(LassoCV(alphas=[alpha]), x_train, y_train).mean() 
            # for alpha in alphas_lasso]
            cv_lasso = [house_prices.rmse_cv(Lasso(alpha=alpha), x_train, y_train).mean() 
                        for alpha in alphas_lasso]
            # cv_lasso = pd.Series(np.expm1(cv_lasso), index=alphas_lasso)
            cv_lasso = pd.Series(cv_lasso, index=alphas_lasso)
            plt.figure()
            cv_lasso.plot(title="Lasso, Validation")
            plt.xlabel('alpha')
            plt.ylabel('rmse')
            plt.show()


/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)

In [27]:
print("\nLasso optimal regularization parameter alpha has rmse = ")
            print(cv_lasso.min())

            print("\nMean lasso rmse:")
            model_lasso = LassoCV(alphas=alphas_lasso).fit(x_train, y_train)
            print(house_prices.rmse_cv(model_lasso, x_train, y_train).mean())
            print("\nbest lasso alpha:", model_lasso.alpha_)


Lasso optimal regularization parameter alpha has rmse = 
0.13292346762

Mean lasso rmse:
/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
0.133013867877

best lasso alpha: 0.0005
/home/mizio/anaconda2/envs/kaggle_env_pyth36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)

We want to make sure that a potential machine learning estimator will form a model based on some features that are highly correlated with sale price.


In [28]:
if is_with_agglomeration:
                coefficient_lasso = pd.Series(model_lasso.coef_, 
                                              index=df_merged_train_and_test_agglom
                                              .columns).sort_values()
            else:
                coefficient_lasso = pd.Series(model_lasso.coef_, 
                                              index=df_test_num_features).sort_values()
            
            importance_coeff = pd.concat([coefficient_lasso.head(10), coefficient_lasso.tail(10)])
            plt.figure()
            importance_coeff.plot(kind='barh')
            plt.title('Coefficients Lasso')
            plt.show()
            plt.close()


Make Predictions using Machine Learning Estimators

Let us finally make predictions on sale price using our machine learning estimators LassoCV and RidgeCV. To see how well our model does, we have created a test using cross validation, where we split our training data set in two parts to compare predicted vs. actual sale price. The points should be close to the line and we want lowest possible rmse (root mean square error). Furthermore, we also examine if feature selection (remove features with low variance) with our regularized linear models improves results.


In [29]:
is_make_a_prediction = 1
    if is_make_a_prediction:
        ''' XGBoost and Regularized Linear Models and Random Forest '''
        print("\nPrediction Stats:")
        x_train = train_data[0::, :-1]
        y_train = train_data[0::, -1]

        # Regularized linear regression is needed to avoid overfitting even if you 
        # have lots of features
        lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 
                                0.1, 0.3, 0.6, 1],
                        max_iter=50000, cv=10)

        # Exclude outliers
        x_train, y_train = house_prices.outlier_identification(lasso, x_train, y_train)
        # plt.show()
        # Feature selection with Lasso
        # Make comparison plot using only the train data.
        # Predicted vs. Actual Sale price
        title_name = 'LassoCV'
        house_prices.predicted_vs_actual_sale_price_input_model(lasso, x_train, y_train, 
                                                                title_name)
        plt.show()
        # plt.close()
        lasso.fit(x_train, y_train)
        alpha = lasso.alpha_
        print('best LassoCV alpha:', alpha)
        score = lasso.score(x_train, y_train)
        output_lasso = lasso.predict(test_data)
        print('\nSCORE Lasso linear model:---------------------------------------------------')
        print(score)


Prediction Stats:

Outlier shapes
(1095, 67) (365, 67) (1095,) (365,)
(1030, 67) (344, 67) (1030,) (344,)
best LassoCV alpha: 0.0003

SCORE Lasso linear model:---------------------------------------------------
0.907116485409

In [30]:
is_ridge_estimator = 1
        if is_ridge_estimator:
            ridge = RidgeCV(alphas=[0.06, 0.1, 0.3, 0.6, 1, 10, 100, 110], cv=10)
            title_name = 'RidgeCV'
            house_prices.predicted_vs_actual_sale_price_input_model(ridge, x_train, y_train, 
                                                                    title_name)
            plt.show()
            ridge.fit(x_train, y_train)
            alpha = ridge.alpha_
            print('best RidgeCV alpha:', alpha)
            score = ridge.score(x_train, y_train)
            output_ridge = ridge.predict(test_data)
            print('\nSCORE Ridge linear model:--------------------------------------------------')
            print(score)


(1030, 67) (344, 67) (1030,) (344,)
best RidgeCV alpha: 0.3

SCORE Ridge linear model:--------------------------------------------------
0.908076902144

In [31]:
# Make comparison plot using only the train data.
            # Predicted vs. Actual Sale price
            add_name_of_regressor = 'RidgeCV'
            forest_feature_selection = ridge

            # Select most important features
            feature_selection_model = SelectFromModel(forest_feature_selection, prefit=True)
            x_train_new = feature_selection_model.transform(x_train)
            print(x_train_new.shape)
            test_data_new = feature_selection_model.transform(test_data)
            print(test_data_new.shape)
            # We get that 21 features are selected

            title_name = ''.join([add_name_of_regressor, ' Feature Selection'])
            house_prices.predicted_vs_actual_sale_price_input_model(forest_feature_selection, 
                                                                    x_train_new, y_train, 
                                                                    title_name)
            plt.show()
            forest_feature_selected = forest_feature_selection.fit(x_train_new, y_train)
            score = forest_feature_selected.score(x_train_new, y_train)
            output_feature_selection_ridge = forest_feature_selection.predict(test_data_new)
            print('\nSCORE {0} regressor (feature select):--------------------------------------'
                  .format(add_name_of_regressor))
            print(score)


(1374, 17)
(1459, 17)
(1030, 17) (344, 17) (1030,) (344,)
SCORE RidgeCV regressor (feature select):--------------------------------------
0.895001253172

In [32]:
is_grid_search_RF_prediction = 0
        if is_grid_search_RF_prediction:
            # Fit the training data to the survived labels and create the decision trees

            # Create the random forest object which will include all the parameters for the fit
            forest = RandomForestRegressor()
            # forest = SGDRegressor()
            parameter_grid = {'max_depth': [4,5,6,7,8], 'n_estimators': [200,210,240,250]}  
            # ,'criterion': ['gini', 'entropy']}
            cross_validation = StratifiedKFold(random_state=None, shuffle=False)  # , n_folds=10)
            grid_search = GridSearchCV(forest, param_grid=parameter_grid, cv=cross_validation, 
                                       n_jobs=24)
            title_name = 'Random Forest with GridSearchCV'
            house_prices.predicted_vs_actual_sale_price_input_model(grid_search, x_train, y_train, 
                                                                    title_name)
            plt.show()
            grid_search.fit(x_train, y_train)
            # output = grid_search.predict(test_data)

            print('Best score: {}'.format(grid_search.best_score_))
            print('Best parameters: {}'.format(grid_search.best_params_))

In [33]:
is_feature_selection_prediction = 1
        if is_feature_selection_prediction:

            is_feature_selection_with_lasso = 1
            if is_feature_selection_with_lasso:
                forest_feature_selection = lasso
                add_name_of_regressor = 'Lasso'
            else:
                add_name_of_regressor = 'Random Forest'
                # Random forest (rf) regressor for feature selection
                forest_feature_selection = RandomForestRegressor(n_estimators=240, max_depth=8)
                forest_feature_selection = forest_feature_selection.fit(x_train, y_train)

                # Evaluate variable importance with no cross validation
                importances = forest_feature_selection.feature_importances_
                std = np.std([tree.feature_importances_ for tree 
                              in forest_feature_selection.estimators_], axis=0)
                indices = np.argsort(importances)[::-1]

                print('\nFeatures:')
                df_test_num_features = house_prices.extract_numerical_features(df_test)
                print(np.reshape(
                    np.append(np.array(list(df_test_num_features)), 
                              np.arange(0, len(list(df_test_num_features)))), 
                    (len(list(df_test_num_features)), 2), 'F'))  # , 2, len(list(df_test)))

                print('\nFeature ranking:')
                for f in range(x_train.shape[1]):
                    print('%d. feature %d (%f)' % (f + 1, indices[f], importances[indices[f]]))

            # Select most important features
            feature_selection_model = SelectFromModel(forest_feature_selection, prefit=True)
            x_train_new = feature_selection_model.transform(x_train)
            print(x_train_new.shape)
            test_data_new = feature_selection_model.transform(test_data)
            print(test_data_new.shape)
            # We get that 21 features are selected

            title_name = ''.join([add_name_of_regressor, ' Feature Selection'])
            house_prices.predicted_vs_actual_sale_price_input_model(forest_feature_selection, 
                                                                    x_train_new, y_train, 
                                                                    title_name)
            plt.show()
            forest_feature_selected = forest_feature_selection.fit(x_train_new, y_train)
            score = forest_feature_selected.score(x_train_new, y_train)
            output_feature_selection_lasso = forest_feature_selection.predict(test_data_new)
            print('\nSCORE {0} regressor (feature select):-------------------------------------'
                  .format(add_name_of_regressor))
            print(score)


(1374, 58)
(1459, 58)
(1030, 58) (344, 58) (1030,) (344,)
SCORE Lasso regressor (feature select):-------------------------------------
0.907116598095

In [34]:
''' xgboost '''
        is_xgb_cv = 1
        if is_xgb_cv:
            seed = 0
            dtrain = xgb.DMatrix(x_train, label=y_train)
            dtest = xgb.DMatrix(test_data)

            xgb_params = {
                'seed': 0,
                'colsample_bytree': 0.8,
                'silent': 1,
                'subsample': 0.6,
                'learning_rate': 0.01,
                # 'booster': 'gblinear',  # default is gbtree
                'objective': 'reg:linear',
                'max_depth': 1,
                'num_parallel_tree': 1,
                'min_child_weight': 1,
                'eval_metric': 'rmse',
            }

            res = xgb.cv(xgb_params, dtrain, num_boost_round=10000, nfold=10, seed=seed, 
                         stratified=False, early_stopping_rounds=100, verbose_eval=10, 
                         show_stdv=True)

            best_nrounds = res.shape[0] - 1
            cv_mean = res.iloc[-1, 0]
            cv_std = res.iloc[-1, 1]

            print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))
            title_name = 'xgb.cv'
            # house_prices.predicted_vs_actual_sale_price_xgb(xgb, best_nrounds, xgb_params, 
            # x_train, y_train, title_name)
            gbdt = xgb.train(xgb_params, dtrain, best_nrounds)
            output_xgb_cv = gbdt.predict(dtest)


[0]	train-rmse:11.4241+0.002984	test-rmse:11.424+0.0270848
[10]	train-rmse:10.3344+0.00278957	test-rmse:10.3343+0.027345
[20]	train-rmse:9.34892+0.00255348	test-rmse:9.34888+0.0276269
[30]	train-rmse:8.45773+0.00233371	test-rmse:8.45769+0.0279303
[40]	train-rmse:7.6519+0.0020887	test-rmse:7.65185+0.0282333
[50]	train-rmse:6.92314+0.00182087	test-rmse:6.92311+0.0280162
[60]	train-rmse:6.26405+0.00162323	test-rmse:6.26409+0.0270195
[70]	train-rmse:5.66805+0.00141476	test-rmse:5.66813+0.0259912
[80]	train-rmse:5.12885+0.00130114	test-rmse:5.12906+0.024809
[90]	train-rmse:4.64125+0.00129331	test-rmse:4.64149+0.0237196
[100]	train-rmse:4.20015+0.00114083	test-rmse:4.20065+0.022828
[110]	train-rmse:3.80134+0.00105484	test-rmse:3.80192+0.0219125
[120]	train-rmse:3.44064+0.000932172	test-rmse:3.44147+0.0210432
[130]	train-rmse:3.11443+0.000842399	test-rmse:3.11551+0.0202767
[140]	train-rmse:2.81943+0.000801855	test-rmse:2.82031+0.0195329
[150]	train-rmse:2.55268+0.000688373	test-rmse:2.55344+0.018821
[160]	train-rmse:2.31156+0.000767254	test-rmse:2.31229+0.0180669
[170]	train-rmse:2.0936+0.000665197	test-rmse:2.09424+0.0177461
[180]	train-rmse:1.89652+0.00057469	test-rmse:1.8972+0.0173176
[190]	train-rmse:1.71843+0.000599941	test-rmse:1.71898+0.0168534
[200]	train-rmse:1.55753+0.000615166	test-rmse:1.55814+0.0164422
[210]	train-rmse:1.41218+0.000645246	test-rmse:1.41279+0.0162039
[220]	train-rmse:1.28089+0.000609404	test-rmse:1.28162+0.0159372
[230]	train-rmse:1.16241+0.000672307	test-rmse:1.16322+0.0154461
[240]	train-rmse:1.05538+0.000732195	test-rmse:1.05625+0.0150184
[250]	train-rmse:0.958885+0.000727057	test-rmse:0.959938+0.0148161
[260]	train-rmse:0.871858+0.000724146	test-rmse:0.87313+0.0147249
[270]	train-rmse:0.793516+0.000754219	test-rmse:0.794765+0.0146818
[280]	train-rmse:0.722927+0.000739331	test-rmse:0.724249+0.0146801
[290]	train-rmse:0.659447+0.000835396	test-rmse:0.660907+0.0147742
[300]	train-rmse:0.602305+0.000884822	test-rmse:0.603791+0.01472
[310]	train-rmse:0.550949+0.000956032	test-rmse:0.552684+0.0147762
[320]	train-rmse:0.505031+0.00093959	test-rmse:0.506886+0.0150274
[330]	train-rmse:0.463913+0.000975694	test-rmse:0.465869+0.0151583
[340]	train-rmse:0.427149+0.00101289	test-rmse:0.429279+0.0154838
[350]	train-rmse:0.394323+0.00103829	test-rmse:0.396819+0.0158399
[360]	train-rmse:0.365037+0.00107769	test-rmse:0.367656+0.0161919
[370]	train-rmse:0.339109+0.00107457	test-rmse:0.341883+0.0166624
[380]	train-rmse:0.316057+0.00114384	test-rmse:0.319002+0.0171694
[390]	train-rmse:0.295706+0.0012244	test-rmse:0.2989+0.0178021
[400]	train-rmse:0.277743+0.00129456	test-rmse:0.281059+0.0182821
[410]	train-rmse:0.261968+0.00137248	test-rmse:0.265447+0.0187599
[420]	train-rmse:0.248055+0.00150388	test-rmse:0.251779+0.0192107
[430]	train-rmse:0.235877+0.00149895	test-rmse:0.239884+0.0196919
[440]	train-rmse:0.225206+0.00154974	test-rmse:0.229512+0.0202258
[450]	train-rmse:0.215845+0.0016008	test-rmse:0.220455+0.0206217
[460]	train-rmse:0.2077+0.00159451	test-rmse:0.212568+0.021184
[470]	train-rmse:0.200595+0.00158625	test-rmse:0.205671+0.0215599
[480]	train-rmse:0.194376+0.00159486	test-rmse:0.199751+0.0219225
[490]	train-rmse:0.18894+0.00162417	test-rmse:0.194618+0.0222216
[500]	train-rmse:0.18418+0.00164005	test-rmse:0.190058+0.0224984
[510]	train-rmse:0.179987+0.00163874	test-rmse:0.185994+0.0226447
[520]	train-rmse:0.176326+0.00167145	test-rmse:0.182467+0.022934
[530]	train-rmse:0.173073+0.00171751	test-rmse:0.179344+0.0230991
[540]	train-rmse:0.170203+0.00172252	test-rmse:0.176594+0.0231901
[550]	train-rmse:0.167614+0.00174394	test-rmse:0.17402+0.0232553
[560]	train-rmse:0.165312+0.00176668	test-rmse:0.171733+0.0233021
[570]	train-rmse:0.163268+0.0017731	test-rmse:0.169823+0.0233125
[580]	train-rmse:0.161394+0.00176424	test-rmse:0.167945+0.0233825
[590]	train-rmse:0.159701+0.00173304	test-rmse:0.166293+0.0234632
[600]	train-rmse:0.15813+0.00174691	test-rmse:0.164765+0.0233924
[610]	train-rmse:0.156701+0.00175475	test-rmse:0.163506+0.023378
[620]	train-rmse:0.155375+0.00175812	test-rmse:0.162153+0.0232848
[630]	train-rmse:0.154136+0.00175659	test-rmse:0.161087+0.0232937
[640]	train-rmse:0.152968+0.0017531	test-rmse:0.159946+0.0232944
[650]	train-rmse:0.151889+0.00175121	test-rmse:0.158854+0.0232392
[660]	train-rmse:0.150851+0.00173879	test-rmse:0.157935+0.0231635
[670]	train-rmse:0.149881+0.00174694	test-rmse:0.157099+0.023105
[680]	train-rmse:0.148951+0.00176245	test-rmse:0.156282+0.0230896
[690]	train-rmse:0.148071+0.00175251	test-rmse:0.155569+0.0230358
[700]	train-rmse:0.147236+0.00174915	test-rmse:0.15474+0.0229966
[710]	train-rmse:0.146441+0.00175355	test-rmse:0.15406+0.0229223
[720]	train-rmse:0.14567+0.00174522	test-rmse:0.153351+0.0228559
[730]	train-rmse:0.144938+0.00174784	test-rmse:0.152762+0.0228329
[740]	train-rmse:0.144235+0.00173491	test-rmse:0.152215+0.0229167
[750]	train-rmse:0.143569+0.00174421	test-rmse:0.151715+0.022813
[760]	train-rmse:0.142921+0.00173638	test-rmse:0.151106+0.0228648
[770]	train-rmse:0.142292+0.00172577	test-rmse:0.15064+0.0227957
[780]	train-rmse:0.141673+0.00171781	test-rmse:0.150198+0.022769
[790]	train-rmse:0.141094+0.00174628	test-rmse:0.149832+0.0227427
[800]	train-rmse:0.140525+0.00176214	test-rmse:0.149448+0.022676
[810]	train-rmse:0.139962+0.00176285	test-rmse:0.148945+0.0225837
[820]	train-rmse:0.13943+0.00177197	test-rmse:0.148561+0.0225369
[830]	train-rmse:0.138915+0.00177206	test-rmse:0.148244+0.0226184
[840]	train-rmse:0.138392+0.00177583	test-rmse:0.148022+0.0226257
[850]	train-rmse:0.137915+0.0017847	test-rmse:0.147748+0.0225902
[860]	train-rmse:0.137417+0.00178965	test-rmse:0.147367+0.0225823
[870]	train-rmse:0.136943+0.00179952	test-rmse:0.14716+0.0226582
[880]	train-rmse:0.136468+0.00179516	test-rmse:0.146907+0.02275
[890]	train-rmse:0.136+0.00181544	test-rmse:0.146643+0.0226377
[900]	train-rmse:0.135565+0.00182367	test-rmse:0.146464+0.0226537
[910]	train-rmse:0.135125+0.00181499	test-rmse:0.146285+0.0227042
[920]	train-rmse:0.134703+0.00183364	test-rmse:0.146056+0.0227013
[930]	train-rmse:0.134287+0.00184286	test-rmse:0.145797+0.022761
[940]	train-rmse:0.133882+0.00183883	test-rmse:0.145581+0.0227509
[950]	train-rmse:0.133506+0.00185845	test-rmse:0.145417+0.022844
[960]	train-rmse:0.133095+0.00184828	test-rmse:0.145188+0.0229366
[970]	train-rmse:0.132706+0.00186158	test-rmse:0.145006+0.0229125
[980]	train-rmse:0.132314+0.00185161	test-rmse:0.144738+0.0228529
[990]	train-rmse:0.131945+0.00183902	test-rmse:0.144636+0.0230079
[1000]	train-rmse:0.131581+0.00184906	test-rmse:0.144513+0.023035
[1010]	train-rmse:0.131216+0.00184106	test-rmse:0.144319+0.0231011
[1020]	train-rmse:0.130855+0.00183562	test-rmse:0.144072+0.0231187
[1030]	train-rmse:0.130496+0.00182985	test-rmse:0.144032+0.0232845
[1040]	train-rmse:0.130142+0.00182975	test-rmse:0.143894+0.0232857
[1050]	train-rmse:0.129793+0.00182327	test-rmse:0.143729+0.0233179
[1060]	train-rmse:0.129449+0.00181848	test-rmse:0.143574+0.0233238
[1070]	train-rmse:0.129115+0.00182247	test-rmse:0.143452+0.0233213
[1080]	train-rmse:0.128797+0.00181128	test-rmse:0.143389+0.023352
[1090]	train-rmse:0.128484+0.00180932	test-rmse:0.143348+0.0234484
[1100]	train-rmse:0.128152+0.00180786	test-rmse:0.143119+0.0234239
[1110]	train-rmse:0.127836+0.00180734	test-rmse:0.14294+0.0234023
[1120]	train-rmse:0.127518+0.00181521	test-rmse:0.14274+0.0233393
[1130]	train-rmse:0.127209+0.00180665	test-rmse:0.142624+0.0233443
[1140]	train-rmse:0.126909+0.00179806	test-rmse:0.142511+0.0233301
[1150]	train-rmse:0.126617+0.00177864	test-rmse:0.142345+0.0233597
[1160]	train-rmse:0.126334+0.00177074	test-rmse:0.142282+0.0234111
[1170]	train-rmse:0.126034+0.00176245	test-rmse:0.142102+0.0234322
[1180]	train-rmse:0.125741+0.00176738	test-rmse:0.141962+0.0234175
[1190]	train-rmse:0.125464+0.00176406	test-rmse:0.141813+0.0233716
[1200]	train-rmse:0.125192+0.00176223	test-rmse:0.141691+0.0233683
[1210]	train-rmse:0.1249+0.00174968	test-rmse:0.141515+0.0233222
[1220]	train-rmse:0.124635+0.00174944	test-rmse:0.141363+0.0232917
[1230]	train-rmse:0.124358+0.00174878	test-rmse:0.141189+0.0232233
[1240]	train-rmse:0.124097+0.00174485	test-rmse:0.141081+0.0232689
[1250]	train-rmse:0.123836+0.00174459	test-rmse:0.140974+0.0233239
[1260]	train-rmse:0.123583+0.00174295	test-rmse:0.140854+0.0233355
[1270]	train-rmse:0.123317+0.00173058	test-rmse:0.140769+0.0233658
[1280]	train-rmse:0.123062+0.00172413	test-rmse:0.140645+0.0234086
[1290]	train-rmse:0.122816+0.00171101	test-rmse:0.140517+0.0234456
[1300]	train-rmse:0.122577+0.00171094	test-rmse:0.140407+0.0234268
[1310]	train-rmse:0.122358+0.00170346	test-rmse:0.140409+0.0235058
[1320]	train-rmse:0.122117+0.00169727	test-rmse:0.140337+0.0235404
[1330]	train-rmse:0.121884+0.0016912	test-rmse:0.140286+0.023583
[1340]	train-rmse:0.121665+0.00169004	test-rmse:0.140126+0.0235065
[1350]	train-rmse:0.121431+0.0016929	test-rmse:0.140011+0.0235179
[1360]	train-rmse:0.121203+0.00168527	test-rmse:0.139873+0.0234853
[1370]	train-rmse:0.120983+0.00167707	test-rmse:0.13976+0.0234497
[1380]	train-rmse:0.120763+0.00167806	test-rmse:0.139702+0.0234852
[1390]	train-rmse:0.120549+0.00166881	test-rmse:0.139618+0.0235018
[1400]	train-rmse:0.120331+0.00166184	test-rmse:0.139574+0.0236193
[1410]	train-rmse:0.120123+0.00165596	test-rmse:0.139545+0.0236671
[1420]	train-rmse:0.119917+0.00165008	test-rmse:0.139437+0.0236727
[1430]	train-rmse:0.119708+0.00164112	test-rmse:0.139378+0.0237131
[1440]	train-rmse:0.119509+0.00163456	test-rmse:0.139283+0.023715
[1450]	train-rmse:0.119308+0.0016328	test-rmse:0.139205+0.0237571
[1460]	train-rmse:0.119105+0.00162955	test-rmse:0.13915+0.0237807
[1470]	train-rmse:0.118916+0.00162121	test-rmse:0.139088+0.0237984
[1480]	train-rmse:0.118724+0.00161049	test-rmse:0.138991+0.0238387
[1490]	train-rmse:0.118523+0.00160744	test-rmse:0.138871+0.0238372
[1500]	train-rmse:0.118328+0.00160552	test-rmse:0.138774+0.023843
[1510]	train-rmse:0.118139+0.00159818	test-rmse:0.138709+0.0238437
[1520]	train-rmse:0.117952+0.00159147	test-rmse:0.138675+0.0238595
[1530]	train-rmse:0.117769+0.00158192	test-rmse:0.138633+0.0238985
[1540]	train-rmse:0.117581+0.00157545	test-rmse:0.138573+0.0239408
[1550]	train-rmse:0.117401+0.00156943	test-rmse:0.138515+0.0239631
[1560]	train-rmse:0.11723+0.00156696	test-rmse:0.138436+0.0238763
[1570]	train-rmse:0.117061+0.00156436	test-rmse:0.138347+0.0239221
[1580]	train-rmse:0.116897+0.00157028	test-rmse:0.13832+0.0240068
[1590]	train-rmse:0.11672+0.00155114	test-rmse:0.138275+0.024044
[1600]	train-rmse:0.116565+0.00154664	test-rmse:0.138294+0.0240969
[1610]	train-rmse:0.116407+0.00154402	test-rmse:0.138197+0.0240792
[1620]	train-rmse:0.116257+0.0015354	test-rmse:0.138164+0.0240986
[1630]	train-rmse:0.116094+0.00153432	test-rmse:0.138123+0.0241349
[1640]	train-rmse:0.115937+0.00152885	test-rmse:0.138053+0.024111
[1650]	train-rmse:0.115784+0.00152692	test-rmse:0.138006+0.0241324
[1660]	train-rmse:0.115625+0.00152262	test-rmse:0.137944+0.0241092
[1670]	train-rmse:0.115476+0.00152121	test-rmse:0.137857+0.0240752
[1680]	train-rmse:0.11532+0.0015172	test-rmse:0.137802+0.0240518
[1690]	train-rmse:0.11517+0.00150861	test-rmse:0.137744+0.024044
[1700]	train-rmse:0.115018+0.00149583	test-rmse:0.137752+0.0240517
[1710]	train-rmse:0.114872+0.0014875	test-rmse:0.137711+0.0240366
[1720]	train-rmse:0.114723+0.00148234	test-rmse:0.137642+0.0240346
[1730]	train-rmse:0.114583+0.00148163	test-rmse:0.137629+0.0240915
[1740]	train-rmse:0.114439+0.00148162	test-rmse:0.137582+0.024031
[1750]	train-rmse:0.114308+0.00147366	test-rmse:0.137609+0.0240887
[1760]	train-rmse:0.114171+0.00147187	test-rmse:0.137523+0.024085
[1770]	train-rmse:0.114041+0.00145788	test-rmse:0.13747+0.0240796
[1780]	train-rmse:0.113903+0.00145934	test-rmse:0.137491+0.0241359
[1790]	train-rmse:0.113777+0.00145775	test-rmse:0.137492+0.0241634
[1800]	train-rmse:0.11365+0.00145189	test-rmse:0.137377+0.0241708
[1810]	train-rmse:0.113523+0.00144515	test-rmse:0.137323+0.024144
[1820]	train-rmse:0.113391+0.00143837	test-rmse:0.137372+0.0242898
[1830]	train-rmse:0.113271+0.00142787	test-rmse:0.13734+0.0243259
[1840]	train-rmse:0.113147+0.0014248	test-rmse:0.13727+0.0242734
[1850]	train-rmse:0.113025+0.00142404	test-rmse:0.137244+0.024281
[1860]	train-rmse:0.112898+0.00140816	test-rmse:0.137142+0.0242824
[1870]	train-rmse:0.112788+0.00140762	test-rmse:0.137089+0.0242472
[1880]	train-rmse:0.112669+0.00140034	test-rmse:0.136996+0.0242115
[1890]	train-rmse:0.11256+0.00140174	test-rmse:0.13694+0.0241842
[1900]	train-rmse:0.112445+0.00139743	test-rmse:0.136903+0.0241803
[1910]	train-rmse:0.112331+0.00139	test-rmse:0.136893+0.0242107
[1920]	train-rmse:0.112215+0.00138469	test-rmse:0.136928+0.0242743
[1930]	train-rmse:0.112114+0.00136987	test-rmse:0.136965+0.0243538
[1940]	train-rmse:0.112001+0.00136428	test-rmse:0.136956+0.024443
[1950]	train-rmse:0.111892+0.00136184	test-rmse:0.136909+0.0243975
[1960]	train-rmse:0.111789+0.00136287	test-rmse:0.13695+0.0244868
[1970]	train-rmse:0.111693+0.00136475	test-rmse:0.136954+0.0244588
[1980]	train-rmse:0.111595+0.00136028	test-rmse:0.136984+0.0244937
[1990]	train-rmse:0.111485+0.00135378	test-rmse:0.13693+0.0245427
Ensemble-CV: 0.13687560000000001+0.024148062925212037

In [35]:
use_xgb_regressor = 0
        if use_xgb_regressor:
            # Is a parallel job
            xgb_model = xgb.XGBRegressor()
            # xgb_model = xgb.XGBRegressor(n_estimators = 360, max_depth = 2, learning_rate = 0.1)
            # XGBClassifier gives the best prediction
            # xgb_model = xgb.XGBClassifier()
            cross_validation = StratifiedKFold(n_splits=10, shuffle=False, random_state=None)  
            # , n_folds=10)
            parameter_grid = {'max_depth': [4, 5, 6, 7, 8], 'n_estimators': [200, 210, 240, 250]}
            # parameter_grid = {'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200]}  
            # , 'criterion': ['gini'
            # , 'entropy']}
            clf = GridSearchCV(xgb_model, param_grid=parameter_grid, cv=cross_validation)  
            # verbose=1)
            title_name = 'xgbRegressor'
            house_prices.predicted_vs_actual_sale_price_input_model(clf, x_train, y_train, 
                                                                    title_name)
            clf.fit(x_train, y_train)
            output_xgb_regressor = clf.predict(test_data)
            print('\nSCORE XGBRegressor train data:-------------------------------------------')
            print(clf.best_score_)
            print(clf.best_params_)

In [36]:
house_prices.timestamp = datetime.datetime.now().strftime('%Y%m%d_%Hh%Mm%Ss')
        save_path = '/home/mizio/Documents/Kaggle/HousePrices/house_prices_clone_0/predicted_vs_actual/'
        house_prices.multipage(''.join([save_path, 'Overview_estimators_rmse_', 
                                        house_prices.timestamp, '.pdf']))
        plt.close()

In [37]:
# Averaging the output using four different machine learning estimators
        # output = (output_feature_selection_lasso + output_feature_selection_ridge + output_xgb_cv
        # + output_xgb_regressor)/4.0
        # output = (output_feature_selection_lasso + output_ridge + output_xgb_regressor) / 3.0
        # output = (output_feature_selection_lasso + output_ridge) / 2.0
        output = (output_feature_selection_lasso + output_xgb_cv) / 2.0
        # print np.shape(output_ridge) == np.shape(output_lasso)

Submission



In [38]:
if is_simple_model or is_make_a_prediction:
        ''' Submission '''
        save_path = '/home/mizio/Documents/Kaggle/HousePrices/submission/'
        # Submission requires a csv file with Id and SalePrice columns.
        # dfBestScore = pd.read_csv(''.join([save_path, 'submission_house_prices.csv']), header=0)

        # We do not expect all to be equal since the learned model differs from time to time.
        # print (dfBestScore.values[0::, 1::].ravel() == output.astype(int))
        # print np.array_equal(dfBestScore.values[0::, 1::].ravel(), output.astype(int))  
        # But they are almost never all equal

        # Exp() is needed in order to get the correct sale price, since we took a log() earlier
        # if not is_simple_model:
        if house_prices.is_with_log1p_SalePrice:
            output = np.expm1(output)

        submission = pd.DataFrame({'Id': Id_df_test, 'SalePrice': output})
        submission.to_csv(''.join([save_path, 'submission_house_prices_', house_prices.timestamp, 
                                   '.csv']), index=False)

In [39]:
# import version_information, load_ext

    %reload_ext version_information

    %version_information numpy, scipy, matplotlib, sklearn, version_information


Out[39]:
SoftwareVersion
Python3.6.0 64bit [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
IPython5.2.2
OSLinux 3.13.0 106 generic x86_64 with debian jessie sid
numpy1.11.3
scipy0.18.1
matplotlib2.0.0
sklearn0.18.1
version_information1.0.3
Wed Mar 01 10:47:28 2017 CET

In [ ]: