In [262]:
import pandas as pd
import numpy as np
# FIX: was `import matplotlib as plt` — the conventional alias `plt` refers to
# matplotlib.pyplot; the old import would break any later `plt.plot(...)` call.
import matplotlib.pyplot as plt
# NOTE(review): hard-coded absolute Windows path — consider a configurable data
# directory so the notebook runs on other machines.
# `read_csv` is the canonical reader for comma-separated text (equivalent to the
# previous `read_table(..., sep=',')`).
df = pd.read_csv('C:\\Users\\Siddy\\Desktop\\occupancy_data\\datatraining.txt', sep=',')
df.dtypes
Out[262]:
In [263]:
df.head(2)  # preview the first two rows of the loaded training data
Out[263]:
In [264]:
df.columns  # list the available column names
Out[264]:
In [265]:
# Pre-allocate one empty frame per batch name. (Every one of these is
# reassigned before use further down, so this cell only declares the names.)
(df1, df2, df2_new, df3, df3_new, df4, df4_new, df5, df5_new, df6, df6_new,
 df7, df7_new, df8, df8_new, df9, df9_new, df10, df10_new) = (
    pd.DataFrame() for _ in range(19))
In [266]:
# Split the training frame into 10 contiguous, roughly equal 10% batches.
# FIX: the previous hand-written slices (df[:814], df[815:1628], df[1629:2443], ...)
# silently dropped one row at every boundary — rows 814, 1628, 2443, 3258, 4073,
# 4888, 5703, 6518 and 7333 belonged to no batch. np.array_split keeps every row.
df1, df2, df3, df4, df5, df6, df7, df8, df9, df10 = np.array_split(df, 10)
In [267]:
import random

def reservoir_sample(size, frame=None):
    """Draw a uniform random sample of `size` rows via reservoir sampling (Algorithm R).

    Fixes two defects in the previous version:
    * it appended while `index <= size`, which on df1's 0-based index collected
      size + 1 rows instead of size;
    * it replaced with probability 1/index instead of size/index, heavily
      biasing the sample toward the earliest rows.

    Parameters
    ----------
    size : int
        Reservoir capacity — number of rows to keep.
    frame : pandas.DataFrame, optional
        Frame to sample from; defaults to the module-level `df1` batch
        (the original behaviour).

    Returns
    -------
    list of pandas.Series
        The sampled rows; fewer than `size` if `frame` has fewer rows.
    """
    if frame is None:
        frame = df1  # original behaviour: sample from the first 10% batch
    reservoir = []
    # `seen` counts rows independently of the frame's index labels, so the
    # function works no matter how the frame was shuffled or re-indexed.
    for seen, (_, row) in enumerate(frame.iterrows()):
        if seen < size:
            reservoir.append(row)
        else:
            # Keep the new row with probability size/(seen+1), replacing a
            # uniformly chosen reservoir slot — the standard Algorithm R step.
            j = random.randint(0, seen)
            if j < size:
                reservoir[j] = row
    return reservoir
In [268]:
df1_sample=pd.DataFrame(reservoir_sample(100)) #apply sample function of size 100 on df1
#which is our 10% batch and save in datframe df1_sample
In [269]:
df1_sample.shape #cross check sampled dataframe size
Out[269]:
In [270]:
df1_sample.head()
Out[270]:
In [271]:
feature_cols = ['Temperature', 'Humidity', 'Light', 'CO2']
# Predictors and label drawn from the 100-row reservoir sample of batch 1.
df1_train_x = df1_sample.loc[:, feature_cols]
df1_train_y = df1_sample['Occupancy']
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(df1_train_x, df1_train_y)  # fit the model on the sampled 10% batch
Out[271]:
In [272]:
# Feature matrix and label vector for each of the remaining 10% batches.
later_batches = [df2, df3, df4, df5, df6, df7, df8, df9, df10]
(df2_x, df3_x, df4_x, df5_x, df6_x,
 df7_x, df8_x, df9_x, df10_x) = (b[feature_cols] for b in later_batches)
(df2_y, df3_y, df4_y, df5_y, df6_y,
 df7_y, df8_y, df9_y, df10_y) = (b['Occupancy'] for b in later_batches)
In [273]:
y_pred_class_df2=logreg.predict(df2_x) #test on df2 or next 10%
In [274]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
df2_score=metrics.accuracy_score(df2_y,y_pred_class_df2)
In [275]:
results=pd.DataFrame({'Accuracy':[df2_score]}) # putting results in dataframe
results.head()
Out[275]:
In [276]:
df2_new=pd.concat([df1,df2],axis=0) #concating row waswise so axis =0, df2_new will be our new combined 20%
In [277]:
df2_new.shape
Out[277]:
In [278]:
from sklearn.utils import shuffle
df2_new = shuffle(df2_new)
In [279]:
df2_new.reset_index(drop=True,inplace=True)
df2_new.index = df2_new.index + 1
df2_new.head(2)
Out[279]:
In [280]:
df2_new.shape
Out[280]:
In [281]:
def reservoir_sample(size, frame=None):
    """Reservoir-sample `size` rows (Algorithm R), defaulting to the 20% batch.

    NOTE(review): the notebook redefines this function once per batch purely to
    rebind the frame it reads — the `frame` parameter makes further
    redefinitions unnecessary. Also fixes the previous version's biased
    replacement probability (1/index instead of size/index).

    Parameters
    ----------
    size : int
        Reservoir capacity.
    frame : pandas.DataFrame, optional
        Frame to sample from; defaults to the module-level `df2_new`.

    Returns
    -------
    list of pandas.Series
    """
    if frame is None:
        frame = df2_new  # original behaviour: sample the combined first 20%
    reservoir = []
    # Count rows positionally so the result does not depend on index labels.
    for seen, (_, row) in enumerate(frame.iterrows()):
        if seen < size:
            reservoir.append(row)
        else:
            # Replace with probability size/(seen+1) — standard Algorithm R.
            j = random.randint(0, seen)
            if j < size:
                reservoir[j] = row
    return reservoir
In [282]:
df2_sample=pd.DataFrame(reservoir_sample(100)) #apply sample function of size 100 on df2_new which is
# our 20% batch and save in datframe df2_sample
In [283]:
df2_sample.shape #cross check sampled dataframe size
Out[283]:
In [284]:
df2_sample.head()
Out[284]:
In [285]:
df2_sample_x = df2_sample.loc[:, feature_cols]
df2_sample_y = df2_sample['Occupancy']
logreg.fit(df2_sample_x, df2_sample_y)  # retrain on the 100-row sample of the first 20%
Out[285]:
In [286]:
y_pred_class_df3 = logreg.predict(df3_x)  # evaluate on the third 10% batch
df3_score = accuracy_score(df3_y, y_pred_class_df3)
results.loc[1] = df3_score  # record this round's accuracy
results.head()
Out[286]:
In [287]:
df3_new=pd.concat([df2_new,df3],axis=0) #concating row wise so axis =0, df3_new will be our new combined 30%
In [288]:
df3_new.shape
Out[288]:
In [289]:
df3_new = shuffle(df3_new)
df3_new.reset_index(drop=True,inplace=True)
df3_new.index = df3_new.index + 1
df3_new.head(2)
Out[289]:
In [290]:
def reservoir_sample(size, frame=None):
    """Reservoir-sample `size` rows (Algorithm R), defaulting to the 30% batch.

    NOTE(review): duplicate redefinition — only the default frame differs from
    the earlier versions; the `frame` parameter removes the need to redefine.
    Also fixes the biased 1/index replacement probability of the original.

    Parameters
    ----------
    size : int
        Reservoir capacity.
    frame : pandas.DataFrame, optional
        Frame to sample from; defaults to the module-level `df3_new`.

    Returns
    -------
    list of pandas.Series
    """
    if frame is None:
        frame = df3_new  # original behaviour: sample the combined first 30%
    reservoir = []
    for seen, (_, row) in enumerate(frame.iterrows()):
        if seen < size:
            reservoir.append(row)
        else:
            # Replace with probability size/(seen+1) — standard Algorithm R.
            j = random.randint(0, seen)
            if j < size:
                reservoir[j] = row
    return reservoir
In [291]:
df3_sample=pd.DataFrame(reservoir_sample(100)) #apply sample function of size 100 on df3_new which is
# our 30% batch and save in datframe df3_sample
In [292]:
df3_sample.head(5)
Out[292]:
In [293]:
df3_sample_x = df3_sample.loc[:, feature_cols]
df3_sample_y = df3_sample['Occupancy']
logreg.fit(df3_sample_x, df3_sample_y)  # retrain on the 100-row sample of the first 30%
Out[293]:
In [294]:
y_pred_class_df4 = logreg.predict(df4_x)  # evaluate on the fourth 10% batch
df4_score = accuracy_score(df4_y, y_pred_class_df4)
results.loc[2] = df4_score  # record this round's accuracy
results.head()
Out[294]:
In [295]:
df4_new=pd.concat([df3_new,df4],axis=0) #concating row waswise so axis =0, df4_new will be our new combined 40%
In [296]:
df4_new.shape
Out[296]:
In [297]:
df4_new = shuffle(df4_new)
df4_new.reset_index(drop=True,inplace=True)
df4_new.index = df4_new.index + 1
df4_new.head(2)
Out[297]:
In [298]:
def reservoir_sample(size, frame=None):
    """Reservoir-sample `size` rows (Algorithm R), defaulting to the 40% batch.

    NOTE(review): duplicate redefinition — only the default frame differs from
    the earlier versions; the `frame` parameter removes the need to redefine.
    Also fixes the biased 1/index replacement probability of the original.

    Parameters
    ----------
    size : int
        Reservoir capacity.
    frame : pandas.DataFrame, optional
        Frame to sample from; defaults to the module-level `df4_new`.

    Returns
    -------
    list of pandas.Series
    """
    if frame is None:
        frame = df4_new  # original behaviour: sample the combined first 40%
    reservoir = []
    for seen, (_, row) in enumerate(frame.iterrows()):
        if seen < size:
            reservoir.append(row)
        else:
            # Replace with probability size/(seen+1) — standard Algorithm R.
            j = random.randint(0, seen)
            if j < size:
                reservoir[j] = row
    return reservoir
In [299]:
df4_sample=pd.DataFrame(reservoir_sample(100)) #apply sample function of size 100 on df4_new which is
# our 40% batch and save in datframe df4_sample
In [300]:
df4_sample.head(2)
Out[300]:
In [301]:
df4_sample_x = df4_sample.loc[:, feature_cols]
df4_sample_y = df4_sample['Occupancy']
# Retrain on the 100-row sample of the first 40% (the old comment said 30%).
logreg.fit(df4_sample_x, df4_sample_y)
Out[301]:
In [302]:
y_pred_class_df5 = logreg.predict(df5_x)  # evaluate on the fifth 10% batch
df5_score = accuracy_score(df5_y, y_pred_class_df5)
results.loc[3] = df5_score  # record this round's accuracy
results.head()
Out[302]:
In [ ]:
# Repeat the same cycle — sample 100 rows, retrain, test on the next batch,
# merge and reshuffle — for the remaining batches df6 through df10.