What can you tell me about the data?
In [5]:
import matplotlib.pyplot as plt
%matplotlib inline
In [6]:
import pandas as pd
df = pd.read_csv('../data/energy/energy.csv')
df.shape
Out[6]:
In [7]:
df.describe()
Out[7]:
X2, X3, X4 might be candidates for normalization;
X5, X6, X8 are likely discrete values;
Y1, Y2 are within the same range
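If we do normalize, a minimal sketch (not part of the original analysis) using scikit-learn's MinMaxScaler on those candidate columns:
from sklearn.preprocessing import MinMaxScaler
cols = ['X2', 'X3', 'X4']   # the candidates spotted above
df_norm = df.copy()
df_norm[cols] = MinMaxScaler().fit_transform(df[cols])
df_norm.describe()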
In [8]:
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.plotting import scatter_matrix
scatter_matrix(df, alpha=0.2, figsize=(18,18), diagonal='kde')
plt.show()
Observations: maybe X5 is binary? X1 and X2 seem to have a strong correlation.
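A quick way to check those guesses (a sketch against the df loaded above):
print(df.nunique())   # distinct values per column; 2 means binary
print(df.corr())      # correlation matrix to confirm the X1/X2 relationship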
In [9]:
import matplotlib.pyplot as plt
df.plot()
plt.show()
In [10]:
# Individual elements in the DataFrame
df['X1'].plot()
plt.show()
In [11]:
# Use the 'kind' keyword for different variations
# Other kinds: 'bar', 'hist', 'box', 'kde',
# 'area', 'scatter', 'hexbin', 'pie'
df.plot(x='X1', y='X2', kind='scatter')
plt.show()
In [12]:
# Note: in a notebook, savefig must run in the same cell as the plot,
# before plt.show(); on its own like this it saves an empty figure
plt.savefig('myfig.png')
In [13]:
# Complex functions in pandas.plotting that take a DataFrame or Series as an argument:
# Scatter Matrix, Andrews Curves, Parallel Coordinates, Lag Plot,
# Autocorrelation Plot, Bootstrap Plot, RadViz
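For instance, a quick sketch of one of these on a single column (lag_plot is really meant for time series, so on this dataset it mostly shows row-ordering structure; assumes the df loaded above):
from pandas.plotting import lag_plot
lag_plot(df['Y1'])   # plots Y1(t) against Y1(t+1)
plt.show()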
In [24]:
from utils import *
from sklearn.model_selection import train_test_split as tts
dataset = load_energy()
In [20]:
print(dataset)
In [22]:
type(dataset)
Out[22]:
What is a bunch??
We'll talk about that soon.
In the meantime ask for help...
In [23]:
help(dataset)
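Short answer: a Bunch is a dictionary whose keys are also attributes. A minimal sketch using scikit-learn's own Bunch class:
from sklearn.utils import Bunch
b = Bunch(data=[1, 2, 3], target=[0, 1, 0])
print(b['data'])   # dict-style access
print(b.data)      # attribute-style access to the same value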
In [25]:
dataset.data.shape
Out[25]:
In [15]:
dataset.target('Y1').shape
Out[15]:
In [26]:
# other ways to explore 'dataset'
print(dataset.DESCR)
In [27]:
# more ways to explore 'dataset'
dir(dataset)
Out[27]:
In [32]:
splits = tts(dataset.data, dataset.target('Y1'), test_size=0.2)
In [31]:
# what is splits?
print(splits)
In [33]:
X_train, X_test, y_train, y_test = splits
X_train.shape
Out[33]:
In [34]:
y_train.shape
Out[34]:
In [36]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
regr = linear_model.LinearRegression()
regr.fit(X_train,y_train)
Out[36]:
In [37]:
print(regr.coef_)
In [38]:
print(regr.intercept_)
In [39]:
print(mean_squared_error(y_test, regr.predict(X_test)))
In [40]:
regr.score(X_test,y_test)
# same as doing r2_score(y_test, regr.predict(X_test))
Out[40]:
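To convince yourself those two really are the same, a quick check (no new information):
# LinearRegression.score computes R^2 on the given data
print(regr.score(X_test, y_test))
print(r2_score(y_test, regr.predict(X_test)))   # should print the same value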
In [41]:
clf = linear_model.Ridge(alpha=0.5)
clf.fit(X_train, y_train)
Out[41]:
In [42]:
print(mean_squared_error(y_test, clf.predict(X_test)))
In [43]:
clf.score(X_test, y_test)
Out[43]:
So we picked a bad alpha; let's pick a better one...
In [44]:
import numpy as np
# try 200 alphas, log-spaced between 1e-10 and 1e-2
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)
clf = linear_model.RidgeCV(alphas=alphas)
clf.fit(X_train, y_train)
#which alpha did it pick?
print(clf.alpha_)
In [45]:
clf.score(X_test, y_test)
Out[45]:
In [54]:
# plot our alphas
clf = linear_model.Ridge(fit_intercept=False)
errors = []
In [55]:
for alpha in alphas:
    splits = tts(dataset.data, dataset.target('Y1'), test_size=0.2)
    X_train, X_test, y_train, y_test = splits
    clf.set_params(alpha=alpha)
    clf.fit(X_train, y_train)
    error = mean_squared_error(y_test, clf.predict(X_test))
    errors.append(error)
In [56]:
axe = plt.gca()
axe.plot(alphas, errors)
plt.show()
In [51]:
clf = linear_model.Lasso(alpha=0.5)
clf.fit(X_train, y_train)
Out[51]:
In [52]:
print(mean_squared_error(y_test, clf.predict(X_test)))
In [53]:
clf.score(X_test, y_test)
Out[53]:
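Worth a look with Lasso (a quick sketch): the L1 penalty can drive coefficients exactly to zero, so it doubles as crude feature selection.
print(clf.coef_)                            # Lasso coefficients
print("nonzero:", (clf.coef_ != 0).sum())   # features the penalty kept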
In [57]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
In [58]:
model = make_pipeline(PolynomialFeatures(2), linear_model.Ridge())
model.fit(X_train, y_train)
Out[58]:
In [59]:
mean_squared_error(y_test, model.predict(X_test))
Out[59]:
In [60]:
model.score(X_test, y_test)
Out[60]:
Now it's time to worry about overfitting.
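A sketch of what that check might look like for the pipeline above: compare train and test scores (a large gap suggests overfitting), and use cross-validation for a less split-dependent estimate.
from sklearn.model_selection import cross_val_score
print("train R^2:", model.score(X_train, y_train))
print("test R^2:", model.score(X_test, y_test))
# 5-fold cross-validated R^2 over the full dataset
scores = cross_val_score(model, dataset.data, dataset.target('Y1'), cv=5)
print(scores.mean(), "+/-", scores.std())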
In [61]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [62]:
x = np.linspace(-15,15,100)
# 100 evenly spaced nums between -15 and 15
y = np.sin(x)/x
# compute values of sin(x) / x
In [66]:
# compose plot
plt.plot(x,y, label="f(x)") # sin(x)/x
plt.plot(x,y, 'co', label="cyan dot f(x)")
plt.plot(x,2*y,x,3*y, label="scaled f(x)")
Out[66]:
In [69]:
# add plot details! Or else Ben will be mad
plt.plot(x,y, label="f(x)")
plt.plot(x,y, 'co', label="cyan dot f(x)")
plt.plot(x,2*y,x,3*y, label="scaled f(x)")
plt.xlabel("x-axis")
plt.ylabel("y-axis")
plt.title("Graph of Functions")
plt.legend()
plt.show()