In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import datetime as dt
from sklearn import datasets, linear_model
%matplotlib inline

In [3]:
# Load the survey CSV into a 2-D array, skipping the first line as a header.
# NOTE(review): with genfromtxt's default float dtype, non-numeric fields
# presumably come back as nan — confirm the columns used below are numeric.
data = np.genfromtxt(
    'recs2009_public.csv',
    delimiter=',',
    skip_header=1,
)

In [4]:
# Column 827 of the table; plotted later under 'Total Residential Area [SF]'.
totalSF = data[:, 827]

In [5]:
# Column 839 of the table; plotted later under
# 'Residential Electric Consumption [kWH]'.
totalKWH = data[:, 839]

In [6]:
# Column 785 — presumably household income, going by the variable name;
# TODO confirm the index against the dataset codebook.
income = data[:, 785]

In [7]:
# Column 760 — presumably an age field, going by the variable name;
# TODO confirm the index against the dataset codebook.
age = data[:, 760]

In [8]:
# Column 759 — presumably the number of household members, going by the
# variable name; TODO confirm the index against the dataset codebook.
members = data[:, 759]

In [9]:
# Column 758 — presumably an education-level field, going by the variable
# name; TODO confirm the index against the dataset codebook.
education = data[:, 758]

In [10]:
# Electricity use against floor area for every row of the dataset.
plt.figure(figsize=(15, 10))
plt.plot(totalSF, totalKWH, 'ro')
# The x axis was previously unlabeled; use the same label as the
# train/test plots of this quantity so the figure stands on its own.
plt.xlabel('Total Residential Area [SF]')
plt.ylabel('Residential Electric Consumption [kWH]')


Out[10]:
<matplotlib.text.Text at 0x7f16a54bc250>

In [11]:
# Training subset: every row except the final 6040.
# The split is positional (by row order); no shuffling is applied.
SFtrain, KWHtrain = totalSF[:-6040], totalKWH[:-6040]

In [12]:
# The parenthesized form prints the same text on Python 2 and also parses
# on Python 3, where the bare print statement is a SyntaxError.
print(SFtrain)


[ 5075.  3136.   528. ...,   644.   834.   952.]

In [13]:
# Test subset: the final 6040 rows, i.e. exactly the rows left out of the
# training slice.
SFtest, KWHtest = totalSF[-6040:], totalKWH[-6040:]

In [14]:
# The parenthesized form prints the same text on Python 2 and also parses
# on Python 3, where the bare print statement is a SyntaxError.
print(SFtest)


[ 3640.  2025.  3068. ...,  4581.  1728.  4920.]

In [15]:
# Training subset only: floor area on x, electricity use on y.
plt.figure(figsize=(15, 10))
plt.plot(SFtrain, KWHtrain, 'ro')
plt.xlabel('Total Residential Area [SF]')
plt.ylabel('Residential Electric Consumption Train Data [kWH]')


Out[15]:
<matplotlib.text.Text at 0x7f16ad163b50>

In [16]:
# Test subset only: floor area on x, electricity use on y.
plt.figure(figsize=(15, 10))
plt.plot(SFtest, KWHtest, 'ro')
# Mark the y label as test data, mirroring the 'Train Data' label on the
# training plot; it was previously identical to the all-rows plot's label.
plt.ylabel('Residential Electric Consumption Test Data [kWH]')
plt.xlabel('Total Residential Area [SF]')


Out[16]:
<matplotlib.text.Text at 0x7f16ace7dc50>

In [17]:
from scipy import stats

In [18]:
# Ordinary least-squares line of electricity use on floor area, fitted to
# the training subset only.
(slope, intercept,
 r_value, p_value, std_err) = stats.linregress(SFtrain, KWHtrain)

In [19]:
# Display the slope of the line fitted to the training subset.
slope


Out[19]:
2.0735207102480704

In [20]:
# Display the intercept of the line fitted to the training subset.
intercept


Out[20]:
6817.2690298966618

In [21]:
# Apply the fitted line to the held-out floor areas.
KWHpredict = intercept + slope * SFtest

In [22]:
# Overlay the fitted line's predictions (red) on the observed test data (blue).
plt.figure(figsize=(15, 10))
plt.plot(SFtest, KWHtest, '.b')
plt.plot(SFtest, KWHpredict, '.r')
# Axis labels were missing; reuse the labels from the earlier plots of the
# same quantities so the figure is readable on its own.
plt.xlabel('Total Residential Area [SF]')
plt.ylabel('Residential Electric Consumption [kWH]')
plt.legend(['true data', 'predicted data'])
plt.title('Prediction on testing data')


Out[22]:
<matplotlib.text.Text at 0x7f16acd4f450>

In [46]:
# Compare column 34 against column 461 row by row.
# NOTE(review): what these two columns represent is not stated in the
# notebook — TODO confirm against the dataset codebook.
plt.scatter(x=data[:, 34], y=data[:, 461])


Out[46]:
<matplotlib.collections.PathCollection at 0x7f16a863bc50>
/opt/anaconda/envs/np18py27-1.9/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [29]:
# Keep only the rows whose column 34 value equals their column 461 value.
# A boolean row mask selects the same rows, in the same order, as indexing
# with np.where on that mask.
full = data[data[:, 34] == data[:, 461]]

In [31]:
# Floor-area column (index 827) restricted to the filtered rows.
SFfull = full[:, 827]

In [32]:
# Display the floor-area values of the filtered rows.
SFfull


Out[32]:
array([ 5075.,  3136.,   528., ...,  4581.,  1728.,  4920.])

In [35]:
# Electricity-use column (index 839) restricted to the filtered rows.
KWHfull = full[:, 839]

In [36]:
# Number of rows that survived the column-34 == column-461 filter.
len(SFfull)


Out[36]:
9472

In [38]:
# Positional split of the filtered floor areas: the final 4718 rows are
# held out for testing, the rest are used for training.
SFfulltrain, SFfulltest = SFfull[:-4718], SFfull[-4718:]

In [39]:
# Same positional split for the filtered electricity-use values, so the
# x and y arrays stay aligned row for row.
KWHfulltrain, KWHfulltest = KWHfull[:-4718], KWHfull[-4718:]

In [40]:
# Training part of the filtered subset: floor area on x, electricity use on y.
plt.figure(figsize=(15, 10))
plt.plot(SFfulltrain, KWHfulltrain, 'ro')
plt.xlabel('Total Residential Area [SF]')
plt.ylabel('Residential Electric Consumption Full[kWH]')


Out[40]:
<matplotlib.text.Text at 0x7f16acbfb190>

In [41]:
# Least-squares line of electricity use on floor area, fitted to the
# training part of the filtered subset.
(slopef, interceptf,
 r_valuef, p_valuef, std_errf) = stats.linregress(SFfulltrain, KWHfulltrain)

In [42]:
# Display the slope of the line fitted to the filtered training rows.
slopef


Out[42]:
2.0986028355898076

In [43]:
# Display the intercept of the line fitted to the filtered training rows.
interceptf


Out[43]:
6939.3870321567492

In [44]:
# Apply the filtered-subset fit to the held-out filtered floor areas.
KWHfullpredict = interceptf + slopef * SFfulltest

In [45]:
# Overlay the filtered-subset fit's predictions (red) on the observed
# held-out filtered data (blue).
plt.figure(figsize=(15, 10))
plt.plot(SFfulltest, KWHfulltest, '.b')
plt.plot(SFfulltest, KWHfullpredict, '.r')
# Axis labels were missing; reuse the labels from the earlier plots of the
# same quantities so the figure is readable on its own.
plt.xlabel('Total Residential Area [SF]')
plt.ylabel('Residential Electric Consumption [kWH]')
plt.legend(['true data', 'predicted data'])
plt.title('Prediction on testing data')


Out[45]:
<matplotlib.text.Text at 0x7f16a8724890>

In [ ]: