In [30]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline
df=pd.read_table('C:\\Users\\Siddy\\Desktop\\occupancy_data\\datatraining.txt', sep=',')
df.dtypes
df.shape
Out[30]:
In [31]:
# Parse the 'date' column into pandas datetimes so that date arithmetic can be done on it.
df['date'] = pd.to_datetime(df['date'])
In [32]:
# Reference timestamp; no other visible cell reads it.
ts=pd.to_datetime('2/2/2015')

# Swap the current index for a fresh 0..n-1 one and discard the old labels in a
# single step (instead of materialising them as an 'index' column and deleting
# it afterwards). reservoir_sample later treats these index labels as stream
# positions, so they need to start at 0.
df=df.reset_index(drop=True)
df.head(2)
Out[32]:
In [33]:
# Modelling and plotting dependencies for the cells below.
import random

import matplotlib.pyplot as plt
from sklearn import metrics, model_selection, svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Columns fed to the classifier as inputs.
feature_cols = ['Temperature', 'Humidity', 'Light', 'CO2']

# Logistic-regression estimator built with its default arguments; the
# evaluation loop re-fits this same object on every iteration.
logreg = LogisticRegression()
In [34]:
import random

# Shared reservoir: it lives at module level and is NOT reset by
# reservoir_sample, so consecutive calls keep refining the same sample.
reservoir=[]

def reservoir_sample(dfo,size):
    """Feed the rows of ``dfo`` into the module-level ``reservoir``.

    The reservoir is first filled until it holds ``size`` rows. After that each
    incoming row replaces a uniformly chosen slot with probability
    ``size / (index + 1)``, where ``index`` is the row's index label. Because
    the state is kept between calls and the probability is taken from the
    label, passing successive slices of one 0-based-indexed frame behaves like
    sampling a single continuous stream.

    Parameters
    ----------
    dfo : object exposing ``iterrows()`` that yields ``(index, row)`` pairs
        with numeric index labels (e.g. a pandas DataFrame).
    size : int
        Maximum number of rows kept in the reservoir.

    Returns
    -------
    list
        The module-level ``reservoir`` list itself (not a copy).
    """
    for index, row in dfo.iterrows():
        if len(reservoir) < size:
            # Fill phase: stop at exactly `size` rows. Testing the reservoir
            # length rather than `index <= size` avoids storing an extra row
            # that the replacement branch could never evict.
            reservoir.append(row)
        elif random.random() < size/float(index+1):
            # Replacement phase: overwrite one of the `size` slots at random.
            replace = random.randint(0,size-1)
            reservoir[replace] = row
    return reservoir
In [38]:
# Chunked evaluation: df is cut into chunk_count consecutive slices. For each
# slice the reservoir sample is updated, logreg is re-fitted on that sample and
# then scored on the slice that follows.
max_size=len(df)
chunk_count=10
sample_size=5
chunk_size=int(round(max_size/chunk_count))

all_chunks=[]
nxt_chunks=[]
allsamples=[]
AccuScore=[]
PreScore=[]
RScore=[]
f1score=[]

# reservoir_sample stores its state in the module-level `reservoir` list.
# Empty it here so that re-running this cell starts from a clean reservoir
# instead of continuing from the previous run's rows.
del reservoir[:]

for i in range(chunk_count):
    idx_start=chunk_size*i
    idx_end=min(idx_start+chunk_size,max_size)
    nxt_idx_start=chunk_size*(i+1)
    nxt_idx_end=min(nxt_idx_start+chunk_size,max_size)

    train_chunk=df.iloc[idx_start:idx_end]
    df_test=df.iloc[nxt_idx_start:nxt_idx_end]
    if train_chunk.empty or df_test.empty:
        # Rounding chunk_size can leave nothing to train on or nothing after
        # the final slice; there is nothing to fit or score, so stop.
        break
    all_chunks.append(train_chunk)
    nxt_chunks.append(df_test)

    # Training set = current contents of the reservoir after seeing this slice.
    df_sample=pd.DataFrame(reservoir_sample(train_chunk,sample_size))
    allsamples.append(df_sample)
    print(df_sample)

    # NOTE(review): with so few sampled rows the sample may hold a single
    # Occupancy class, which the fit is expected to reject -- confirm and
    # decide how that case should be handled.
    df1_train_x=df_sample[feature_cols]
    df1_train_y=df_sample.Occupancy
    logreg.fit(df1_train_x,df1_train_y)

    # Score the freshly fitted model on the following slice.
    df1_test_x=df_test[feature_cols]
    df1_test_y=df_test.Occupancy
    y_pred=logreg.predict(df1_test_x)
    AccuScore.append(metrics.accuracy_score(df1_test_y,y_pred))
    PreScore.append(metrics.precision_score(df1_test_y,y_pred))
    RScore.append(metrics.recall_score(df1_test_y,y_pred))
    f1score.append(metrics.f1_score(df1_test_y,y_pred))
print('All done')
In [36]:
# Per-chunk scores collected by the evaluation loop.
print(AccuScore)
print(PreScore)
print(RScore)
print(f1score)

# x positions for the plot: the starting row offset of each scored chunk.
# Derived from chunk_size and the number of scores so the list always matches
# the loop's chunking and has one entry per score.
allchunkssizes=[chunk_size*k for k in range(len(AccuScore))]
In [37]:
# One series per metric. Each series is labelled so the legend can tell them
# apart (two of them are drawn in red).
for scores, fmt, name in (
    (AccuScore, 'r--', 'Accuracy'),
    (PreScore, 'g--', 'Precision'),
    (RScore, 'bs', 'Recall'),
    (f1score, 'r', 'F1'),
):
    plt.plot(allchunkssizes, scores, fmt, label=name)
plt.title('Logistic regression Accuracy Measures')
plt.ylabel('Score')
plt.xlabel('Number of events')
plt.legend()
plt.show()
In [ ]: