In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import itertools
import pickle
import sklearn
%matplotlib inline
In [2]:
# Load the raw CSV export. The handle is opened in a context manager so it is
# closed as soon as pandas has finished reading; previously it was left open.
# NOTE(review): absolute, user-specific path — consider a configurable data dir.
with open('/Users/Raafe/Desktop/DataDriven/Project/Stever.csv') as f:
    # parse_dates=[1] asks pandas to parse the column at position 1 as dates.
    # NOTE(review): the next cell names the columns Time/Power/Temperature, so
    # position 1 would be 'Power' rather than 'Time' — confirm the intent.
    data = pd.read_csv(f, sep=',', header='infer', parse_dates=[1])
In [3]:
# `data1` is a second name for the same DataFrame object (no copy is made),
# so the column rename below is visible through both `data` and `data1`.
data1 = data
data1.columns = ['Time', 'Power', 'Temperature']
In [4]:
# Coerce the measurement columns to numbers and the timestamp column to
# datetimes; entries that cannot be parsed become NaN / NaT (errors='coerce').
for numeric_col in ('Temperature', 'Power'):
    data1[numeric_col] = pd.to_numeric(data1[numeric_col], errors='coerce')
data1['Time'] = pd.to_datetime(data1['Time'], errors='coerce')
In [5]:
# Index the frame by timestamp (keeping 'Time' as a column as well) ...
data1 = data.set_index('Time', drop=False)
# ... then remove the first column by position, leaving only the measurements.
first_column = data1.columns[0]
data2 = data1.drop(first_column, axis=1)
# Hourly means, with gaps filled by interpolation between neighbouring rows.
data3 = data2.resample('H').mean()
df = data4 = data3.interpolate()
In [6]:
# Hourly load on the left axis and temperature (red) on a twin right axis.
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
ax1.plot(df['Power'])
ax1.set_ylabel('Load in W')

ax2 = ax1.twinx()
ax2.plot(df['Temperature'], 'r-')
ax2.set_ylabel('Temperature in F', color='r')
# Colour the right-hand tick labels to match the temperature curve.
for tick_label in ax2.get_yticklabels():
    tick_label.set_color('r')
In [7]:
# Load against temperature, one point per hourly sample.
plt.scatter(df['Temperature'], df['Power'])
plt.xlabel('Temperature in Fahrenheit')
plt.ylabel('Power in Watts')
Out[7]:
In [8]:
# Calendar features read off the index and added to `df` in place.
hourly_index = df.index
df['Weekday'] = hourly_index.dayofweek
df['Hour'] = hourly_index.hour
In [9]:
# Label every hourly sample: 1 when Power exceeds the threshold, otherwise -1
# (a NaN reading compares False and therefore also maps to -1).
# NOTE(review): 98790.240626 is an unexplained constant — confirm its origin.
Value = [1 if power > 98790.240626 else -1 for power in df['Power']]
df['Value'] = Value
In [10]:
# Reshape the hourly frame into a table of mean Power with one row per
# day-of-year and one column per hour.
# Move 'Hour' into the index (next to the existing index) so it can be unstacked.
d5 = df.set_index('Hour',append=True)
d6 = d5.copy()
# Drop the column at position 1.
# NOTE(review): presumably 'Temperature', given the columns added to `df` in
# earlier cells — positional, so confirm against the actual column order.
d7 = d6.drop(d6.columns[1],axis =1)
# Pivot the 'Hour' index level into the columns.
d8 = d7.unstack('Hour')
d8['Time'] = d8.index
d8['day'] = d8['Time'].dt.dayofyear
# Drop the column at position 24.
# NOTE(review): positional and fragile; which column this is depends on the
# unstacked layout — confirm the intent.
d8 = d8.drop(d8.columns[24], axis = 1)
# Average the 'Power' columns over all rows that share a day-of-year.
d9 = d8['Power'].groupby(d8['day']).mean()
# Last expression of the cell: displays the whole table.
d9
Out[10]:
In [11]:
# Heat map of the `d9` table: rows are days of the year, columns are hours.
plt.imshow(d9, cmap='summer', aspect='auto')
plt.xlabel('Hour of the Day')
plt.ylabel('Day of Year')
plt.colorbar()
Out[11]:
In [12]:
# Work on a copy so the extra 'Time' column is not added to `df` itself.
d10 = df.copy()
d10['Time'] = d10.index
d10['Weekday'] = d10['Time'].dt.dayofweek

# Distribution of Power for each hour of the day over the whole data set ...
d10.boxplot(column=['Power'], by="Hour")
plt.ylabel('Power')
# ... and the same plot drawn separately for each weekday group.
d10.groupby('Weekday').boxplot(column=['Power'], by="Hour", figsize=(20, 30), layout=(4, 2))
Out[12]:
In [13]:
# Daily means of every column, plus a running day counter used as a feature.
df1 = df.resample('D').mean()
# Size the counter from the data instead of hard-coding 367, which raises a
# length-mismatch error for any other number of days.
df1['Day'] = np.arange(len(df1))

# Target (daily mean Power) and features (daily mean Temperature, day counter).
Y = df1['Power']
X = df1[['Temperature', 'Day']]
# DataFrame/Series.as_matrix() was deprecated in pandas 0.23 and removed in
# 1.0; .values returns the same ndarray on old and new versions alike.
Y_mat = Y.values
X_mat = X.values
In [14]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree fitted on the daily feature matrix built in the previous cell.
model = DecisionTreeRegressor(
    min_samples_split=20,
    min_samples_leaf=2,
    random_state=99,
)
model.fit(X_mat, Y_mat)
print(model)

expected = Y_mat
predicted = model.predict(X_mat)
# Score on the same data the tree was fitted to (an in-sample figure).
model.score(X_mat, Y_mat)
Out[14]:
In [15]:
# Observed daily Power (blue) with the tree's prediction on top (dashed green).
plt.plot(expected, 'b')
plt.xlabel('Days')
plt.ylabel('Power')
plt.plot(predicted, '--g')
Out[15]:
In [16]:
# Hourly temperature split by the Power label:
# blue '+' markers where Value == 1, red line where Value == -1.
plt.plot(df.loc[df['Value'] == 1, 'Temperature'], 'b+')
plt.plot(df.loc[df['Value'] == -1, 'Temperature'], 'r')
plt.ylabel('Temperature in F')
Out[16]:
In [17]:
# plt.plot(df[df.Value==1].Power,'b+')
# plt.plot(df[df.Value==-1].Power,'r')
In [18]:
from sklearn.cluster import KMeans

# Clean it up: infinities and missing values both end up as 0.
d9 = d9.replace(np.inf, np.nan).fillna(0)
# Make it compatible with sklearn. DataFrame.as_matrix() was removed in
# pandas 1.0; .values yields the same ndarray on old and new versions.
# Note that this rebinds `X`, which an earlier cell used for a DataFrame.
X = d9.values.astype(np.float32)
# Drop rows 297..313, described in the original note as days with a weird
# consumption pattern.
# NOTE(review): the stem plot that note refers to is not in this notebook —
# confirm the row range.
X = np.concatenate([X[:297, :], X[314:, :]])
print(X.shape)

# Since we are interested in weekdays/weekends, subtract the seasonal effect,
# estimated with a naive moving average: for row i, the mean of every value in
# rows [i - lp, i + lp), clipped at the start of the array.
lp = 10
seasonal = []
for i in range(len(X)):
    seasonal.append(np.mean(X[max(i - lp, 0):i + lp, :]))

plt.plot(seasonal, label='Seasonal Effect')
plt.plot(np.mean(X, axis=1), label='Daily Average')
# Transpose so the per-row seasonal value broadcasts across that row's columns.
X = (X.T - seasonal).T
plt.plot(np.mean(X, axis=1), label='Normalized Days')
plt.ylabel('Power')
plt.xlabel('Days')
plt.legend()

# Find the clusters. A fixed random_state makes the centroid initialisation,
# and therefore the cluster numbering used by later cells, repeatable.
clusters = KMeans(n_clusters=3, random_state=99).fit(X)
In [32]:
# Take the panel count from the fitted model rather than repeating a literal,
# so the figure layout always matches the number of clusters.
num_clust = len(clusters.cluster_centers_)
cluster_assignments = clusters.predict(X)

# Top panel: cluster label of the first 150 rows.
plt.subplot(num_clust + 1, 1, 1)
plt.plot(cluster_assignments[:150])
# Labels run 0..num_clust-1; the former fixed limits [0.2, 1.1] pushed every
# label other than 1 outside the visible range.
plt.ylim([-0.5, num_clust - 0.5])

# One panel per cluster: member rows in thin grey, the centroid in black.
for cluster_id in range(num_clust):
    plt.subplot(num_clust + 1, 1, cluster_id + 2)
    cluster_members = X[cluster_assignments == cluster_id, :]
    print(len(cluster_members))
    for i in range(len(cluster_members)):
        # Line widths are passed as numbers, the documented type.
        plt.plot(cluster_members[i, :], color='grey', lw=0.1)
    plt.plot(clusters.cluster_centers_[cluster_id, :], color='k', lw=1)
In [20]:
plot_step = 1
X1 = df1[['Temperature', 'Day']]
y1 = df1['Power']

# Plot extent: one unit of padding around the observed range of each feature.
x_min = X1['Temperature'].min() - 1
x_max = X1['Temperature'].max() + 1
y_min = X1['Day'].min() - 1
y_max = X1['Day'].max() + 1

# Regular grid over (Temperature, Day) with spacing `plot_step` ...
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, plot_step),
    np.arange(y_min, y_max, plot_step),
)
# ... flattened into (n_cells, 2) rows for the fitted tree, one prediction per
# grid cell, then folded back into the grid's shape for contouring.
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Filled contour of the predictions.
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.xlabel('Temperature')
plt.ylabel('Day')
plt.axis("Tight")

# Overlay the daily points, split on the sign of the daily-mean Value.
plt.plot(df1.loc[df1['Value'] > 0, 'Temperature'], df1.loc[df1['Value'] > 0, 'Day'], 'b+')
plt.plot(df1.loc[df1['Value'] <= 0, 'Temperature'], df1.loc[df1['Value'] <= 0, 'Day'], 'ro')
plt.axis("Tight")

# Title and legend, then render.
plt.suptitle("Decision Surface of our Decision Tree")
plt.legend(['Positives', 'Negatives'])
plt.show()
In [21]:
# Daily points in the (Temperature, Day) plane, split on the sign of the
# daily-mean Value: blue '+' for positive, red circles otherwise.
plt.plot(df1.loc[df1['Value'] > 0, 'Temperature'], df1.loc[df1['Value'] > 0, 'Day'], 'b+')
plt.plot(df1.loc[df1['Value'] <= 0, 'Temperature'], df1.loc[df1['Value'] <= 0, 'Day'], 'ro')
plt.show()
In [22]:
# Split the hourly frame by the Power label (1 vs -1).
df_P = df.loc[df['Value'] == 1]
df_N = df.loc[df['Value'] == -1]
In [23]:
import scipy.stats as stats

# Correlation between Power and Temperature inside each Value group,
# printed one per line (Value == 1 first, then Value == -1).
a = df_P['Power'].corr(df_P['Temperature'])
b = df_N['Power'].corr(df_N['Temperature'])
print(a, b, sep='\n')
In [24]:
# Label every hourly sample: 1 when Temperature exceeds the threshold,
# otherwise -1 (a NaN reading compares False and therefore also maps to -1).
# NOTE(review): 58.482674 is an unexplained constant — confirm its origin.
Temp = [1 if temperature > 58.482674 else -1 for temperature in df['Temperature']]
df['Temp'] = Temp
In [25]:
# Split the hourly frame by the Temperature label (1 vs -1).
df_P1 = df.loc[df['Temp'] == 1]
df_N1 = df.loc[df['Temp'] == -1]
In [26]:
# Correlation between Power and Temperature inside each Temp group.
power_hot, temperature_hot = df_P1['Power'], df_P1['Temperature']
power_cold, temperature_cold = df_N1['Power'], df_N1['Temperature']
a1 = power_hot.corr(temperature_hot)
b1 = power_cold.corr(temperature_cold)
In [27]:
# Print the two correlations, one per line (Temp == 1 first, then Temp == -1).
print(a1, b1, sep='\n')
In [28]:
# Two-dimensional histogram of hourly Power (x) against Temperature (y),
# with a colour bar for the bin counts.
plt.hist2d(df.Power, df.Temperature)
plt.colorbar()
Out[28]:
In [29]:
# Violin plot of the hourly Power values.
plt.violinplot(df.Power)
plt.ylabel('Power')
plt.xlabel('Probability')
Out[29]:
In [30]:
# Power spectral density of the hourly Power series (matplotlib defaults).
plt.psd(df.Power)
Out[30]:
In [ ]: