In [1]:
from __future__ import division
import os
import numpy as np 
import pandas as pd
from helpers import data_provider
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sns

In [3]:
%matplotlib inline
plt.style.use('classic')
plt.rc("figure", facecolor="white")


fig_width_pt = 469.755  # Get this from LaTeX using \showthe\columnwidth
inches_per_pt = 1.0/72.27               # Convert pt to inch
golden_mean = (np.sqrt(5)-1.0)/2.0         # Aesthetic ratio
fig_width = fig_width_pt*inches_per_pt  # width in inches
fig_height = fig_width*golden_mean      # height in inches
fig_size =  [fig_width,fig_height]
params = {'backend': 'ps',
          'axes.labelsize': 10,
          'text.fontsize': 10,
          'legend.fontsize': 10,
          'xtick.labelsize': 8,
          'ytick.labelsize': 8,
          'text.usetex': True,
          'figure.figsize': fig_size}
plt.rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
plt.rcParams.update(params)


def plot_consumption(figure_name, which_house, data):
    plt.figure(1)
    plt.clf()
    plt.axes([0.125,0.2,0.95-0.125,0.95-0.2])
    plt.plot(data, color='b')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.xlabel('Time')
    plt.ylabel('Consumption (kwh)')
    plt.title('House nr. '+str(which_house)+'Electricity Consumption')
    plt.savefig('figures/electricity_consumption/'+figure_name+'_'+str(which_house)+'.pdf')
    plt.savefig('figures/electricity_consumption/'+figure_name+'_'+str(which_house)+'.eps')
    plt.show()

def insta_plot(vals):
    plt.figure(figsize=(12,6))
    plt.plot(vals, color='b')
    plt.xticks(rotation=45)
    plt.title('Unregistered Electricity Consumption')
    plt.xlabel('Time')
    plt.ylabel('Consumption (kwh)')
    plt.grid(True)
    plt.show()


C:\Users\chris\Anaconda3\envs\mlp\lib\site-packages\matplotlib\__init__.py:913: UserWarning: text.fontsize is deprecated and replaced with font.size; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

Plotting and saving data figures


In [3]:
#Plot and save consumption figures for each house
houses=[1,2,4,5,6,7,8,9,10,12,13,15,16,17,18,19,20]
for house_nr in houses:
    data = data_provider.load_aggregate(house_nr)
    plot_consumption(figure_name='electricity_consumption',which_house=house_nr,data=data.Aggregate)



In [9]:
# Save important summary statistics for each house
d = {}
houses=[1,2,4,5,6,7,8,9,10,12,13,15,16,17,18,19,20]
for house_nr in houses:
    data = data_provider.load_aggregate(house_nr)
    key = 'house_'+str(house_nr)
    d[key] = data.describe()
    
write_to_file('summary_statistics.txt',d)

Boxplot for spotting outliers


In [28]:
df = pd.DataFrame()
for house_nr in houses:
    data = data_provider.load_aggregate(house_nr)
    key = 'house_'+str(house_nr)
    df = pd.concat([df,data], ignore_index=True, axis=1)
names= ['House '+str(house) for house in houses]
df.columns = names

In [36]:
plt.figure(1)
plt.clf()
plt.axes([0.125,0.2,0.95-0.125,0.95-0.2])
_=df.boxplot()
plt.xticks(rotation=45)
plt.ylabel('Consumption (kwh)')
plt.title('Electricity Consumption Summary per House')
plt.savefig('figures/houses_box.eps')
plt.savefig('figures/houses_box.pdf')



In [40]:
plt.figure(1)
plt.clf()
plt.axes([0.125,0.2,0.95-0.125,0.95-0.2])
sns.boxplot(data=df)
plt.xticks(rotation=45)
plt.ylabel('Consumption (kwh)')
plt.title('Electricity Consumption Summary per House')
plt.savefig('figures/test_houses_box.eps')
plt.savefig('figures/test_houses_box.pdf')


Cleaning Extreme Values


In [4]:
data = pd.read_csv('houses.csv',parse_dates=['Time'],index_col='Time')

In [5]:
houses=[1,2,4,5,6,7,8,9,10,12,13,15,16,17,18,19,20]
names= ['House '+str(house) for house in houses]
data.columns = names

In [22]:
insta_plot(data.House_1['2015-02-08'])



In [54]:
insta_plot(data.House_16['2014-05-06'])



In [60]:
insta_plot(data.House_18['2015-03-22'])



In [4]:
df = pd.read_csv('houses_clean.csv',parse_dates=['Time'],index_col='Time')
houses=[1,2,4,5,6,7,8,9,10,12,13,15,16,17,18,19,20]
names= ['House '+str(house) for house in houses]
df.columns = names

In [5]:
plt.figure(1)
plt.clf()
plt.axes([0.125,0.2,0.95-0.125,0.95-0.2])
sns.boxplot(data=df)
plt.xticks(rotation=45)
plt.ylabel('Consumption (kWh)')
plt.title('Electricity Consumption Summary per House')
plt.savefig('figures/sns_houses_clean_box.eps')
plt.savefig('figures/sns_houses_clean_box.pdf')



In [6]:
plt.figure(1)
plt.clf()
plt.axes([0.125,0.2,0.95-0.125,0.95-0.2])
_=df.boxplot()
plt.xticks(rotation=45)
plt.ylabel('Consumption (kwh)')
plt.title('Electricity Consumption Summary per House')
plt.savefig('figures/houses_clean_box.eps')
plt.savefig('figures/houses_clean_box.pdf')



In [11]:
df.describe()


Out[11]:
House 1 House 2 House 4 House 5 House 6 House 7 House 8 House 9 House 10 House 12 House 13 House 15 House 16 House 17 House 18 House 19 House 20
count 13520.000000 11354.000000 13584.000000 14535.000000 11829.000000 12929.000000 12003.000000 11581.000000 12949.000000 10985.000000 9436.000000 12130.000000 11155.000000 10632.000000 10042.000000 10644.000000 10412.000000
mean 0.475820 0.458801 0.379930 0.742775 0.480945 0.561838 0.683744 0.576275 0.760164 0.371035 0.560799 0.255822 0.558509 0.407223 0.447657 0.291412 0.376308
std 0.403045 0.579903 0.236769 0.564555 0.260507 0.584454 0.689319 0.652033 0.667145 0.365521 0.577496 0.228375 0.423730 0.450664 0.303978 0.199979 0.258193
min 0.152000 0.052667 0.000000 0.127994 0.177595 0.092454 0.175323 0.107652 0.152500 0.123012 0.119870 0.072198 0.134351 0.115270 0.000000 0.131496 0.117722
25% 0.206211 0.124262 0.242241 0.352525 0.273069 0.169588 0.245537 0.178300 0.292460 0.156293 0.230378 0.163121 0.286835 0.179352 0.281819 0.166761 0.203967
50% 0.302760 0.204322 0.330292 0.565768 0.426965 0.256397 0.441163 0.272867 0.493998 0.226987 0.336059 0.181815 0.419901 0.243439 0.361262 0.218569 0.286349
75% 0.591032 0.537030 0.457598 0.928521 0.609872 0.795434 0.787935 0.752978 0.975398 0.419458 0.593209 0.270725 0.662000 0.391860 0.487658 0.339759 0.445000
max 4.262840 5.103723 4.159200 6.540499 2.960240 5.343128 6.108152 5.818901 5.210000 4.537846 4.460671 4.133278 4.583966 4.629530 4.044538 2.000186 2.425551

In [10]:
write_to_file('clean_summary_statistics.txt',d)

Keep 95% of the data


In [30]:
# load the data set
data_process = pd.read_csv('houses_clean.csv',parse_dates=['Time'], index_col='Time')
data = data_process.copy(deep=True)

In [34]:
data.quantile(0.95)


Out[34]:
House_1     1.382303
House_2     1.681160
House_4     0.756751
House_5     1.857474
House_6     0.921208
House_7     1.783033
House_8     2.392620
House_9     1.966144
House_10    2.169000
House_12    1.086586
House_13    1.822782
House_15    0.517603
House_16    1.417786
House_17    1.375860
House_18    1.123966
House_19    0.654410
House_20    0.939158
Name: 0.95, dtype: float64

In [35]:
data = data[data <= data.quantile(0.95)]

In [38]:
x = data.House_1.dropna(axis=0)
mu = data.House_1.mean()
sigma = data.House_1.std()

# the histogram of the data
n, bins, patches = plt.hist(x, 50, normed=1, facecolor='green', alpha=0.75)

# add a 'best fit' line
y = mlab.normpdf( bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=1)

plt.xlabel('Consumption')
plt.ylabel('Frequency')
plt.grid(True)

plt.show()



In [43]:
houses=[1,2,4,5,6,7,8,9,10,12,13,15,16,17,18,19,20]
names= ['House '+str(house) for house in houses]
data.columns = names

In [85]:
flierprops = dict(marker='.', markerfacecolor='grey', markersize=4, fillstyle='none',
                  linestyle='none')

In [86]:
plt.figure(1)
plt.clf()
plt.axes([0.125,0.2,0.95-0.125,0.95-0.2])
sns.boxplot(data=data,flierprops=flierprops)
plt.xticks(rotation=45)
plt.ylabel('Consumption (kwh)')
plt.title('Electricity Consumption Summary per House')
plt.savefig('figures/95_percent/sns_houses_clean_box_95.eps')
plt.savefig('figures/95_percent/sns_houses_clean_box_95.pdf')



In [ ]: