In [262]:
import pandas as pd
import numpy as np
import matplotlib as plt
df=pd.read_table('C:\\Users\\Siddy\\Desktop\\occupancy_data\\datatraining.txt', sep=',')
df.dtypes


Out[262]:
date              object
Temperature      float64
Humidity         float64
Light            float64
CO2              float64
HumidityRatio    float64
Occupancy          int64
dtype: object

In [263]:
df.head(2)


Out[263]:
date Temperature Humidity Light CO2 HumidityRatio Occupancy
1 2015-02-04 17:51:00 23.18 27.2720 426.0 721.25 0.004793 1
2 2015-02-04 17:51:59 23.15 27.2675 429.5 714.00 0.004783 1

In [264]:
df.columns


Out[264]:
Index(['date', 'Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio',
       'Occupancy'],
      dtype='object')

In [265]:
df1 = pd.DataFrame()
df2 = pd.DataFrame()
df2_new = pd.DataFrame()
df3 = pd.DataFrame()
df3_new = pd.DataFrame()
df4 = pd.DataFrame()
df4_new = pd.DataFrame()
df5 = pd.DataFrame()
df5_new = pd.DataFrame()
df6 = pd.DataFrame()
df6_new = pd.DataFrame()
df7 = pd.DataFrame()
df7_new = pd.DataFrame()
df8 = pd.DataFrame()
df8_new = pd.DataFrame()
df9 = pd.DataFrame()
df9_new = pd.DataFrame()
df10 = pd.DataFrame()
df10_new = pd.DataFrame()

In [266]:
df1 = df[:814]
df2= df[815:1628]
df3= df[1629:2443]
df4= df[2444:3258]
df5= df[3259:4073]
df6= df[4074:4888]
df7= df[4889:5703]
df8= df[5704:6518]
df9= df[6519:7333]
df10= df[7334:8148]

In [267]:
import random
def reservoir_sample(size):
    reservoir=[]
    for index, row in df1.iterrows():
            if index<=size:
                  reservoir.append(row)
            elif random.random()<(1 / index):   
                 reservoir[random.randint(0, size-1)] = (row)
    return reservoir

In [268]:
df1_sample=pd.DataFrame(reservoir_sample(100)) #apply sample function of size 100 on df1
                                               #which is our 10% batch and save in datframe df1_sample

In [269]:
df1_sample.shape #cross check sampled dataframe size


Out[269]:
(100, 7)

In [270]:
df1_sample.head()


Out[270]:
date Temperature Humidity Light CO2 HumidityRatio Occupancy
1 2015-02-04 17:51:00 23.18 27.2720 426.0 721.25 0.004793 1
2 2015-02-04 17:51:59 23.15 27.2675 429.5 714.00 0.004783 1
3 2015-02-04 17:53:00 23.15 27.2450 426.0 713.50 0.004779 1
4 2015-02-04 17:54:00 23.15 27.2000 426.0 708.25 0.004772 1
5 2015-02-04 17:55:00 23.10 27.2000 426.0 704.50 0.004757 1

In [271]:
feature_cols=['Temperature', 'Humidity', 'Light', 'CO2']
df1_train_x=df1_sample[feature_cols]                          #create feature and lable of sampled dataframe 
df1_train_y=df1_sample.Occupancy
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()
logreg.fit(df1_train_x,df1_train_y)                            #train model sampled dataframe


Out[271]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [272]:
df2_x=df2[feature_cols]
df2_y=df2.Occupancy
df3_x=df3[feature_cols]
df3_y=df3.Occupancy
df4_x=df4[feature_cols]
df4_y=df4.Occupancy
df5_x=df5[feature_cols]
df5_y=df5.Occupancy
df6_x=df6[feature_cols]
df6_y=df6.Occupancy
df7_x=df7[feature_cols]
df7_y=df7.Occupancy
df8_x=df8[feature_cols]
df8_y=df8.Occupancy
df9_x=df9[feature_cols]
df9_y=df9.Occupancy
df10_x=df10[feature_cols]
df10_y=df10.Occupancy

In [273]:
y_pred_class_df2=logreg.predict(df2_x) #test on df2 or next 10%

In [274]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
df2_score=metrics.accuracy_score(df2_y,y_pred_class_df2)

In [275]:
results=pd.DataFrame({'Accuracy':[df2_score]}) # putting results in dataframe 
results.head()


Out[275]:
Accuracy
0 0.920049

In [276]:
df2_new=pd.concat([df1,df2],axis=0)  #concating row waswise so axis =0, df2_new will be our new combined 20%

In [277]:
df2_new.shape


Out[277]:
(1627, 7)

In [278]:
from sklearn.utils import shuffle
df2_new = shuffle(df2_new)

In [279]:
df2_new.reset_index(drop=True,inplace=True)
df2_new.index = df2_new.index + 1
df2_new.head(2)


Out[279]:
date Temperature Humidity Light CO2 HumidityRatio Occupancy
1 2015-02-05 14:41:00 22.290 28.39 479.0 1021.0 0.004727 1
2 2015-02-05 05:19:59 20.945 23.50 0.0 449.0 0.003597 0

In [280]:
df2_new.shape


Out[280]:
(1627, 7)

In [281]:
def reservoir_sample(size):
    reservoir=[]
    for index, row in df2_new.iterrows():
            if index<=size:
                  reservoir.append(row)
            elif random.random()<(1 / index):   
                 reservoir[random.randint(0, size-1)] = (row)
    return reservoir

In [282]:
df2_sample=pd.DataFrame(reservoir_sample(100)) #apply sample function of size 100 on df2_new which is 
                                               # our 20% batch and save in datframe df2_sample

In [283]:
df2_sample.shape #cross check sampled dataframe size


Out[283]:
(100, 7)

In [284]:
df2_sample.head()


Out[284]:
date Temperature Humidity Light CO2 HumidityRatio Occupancy
1 2015-02-05 14:41:00 22.290 28.39 479.000000 1021.0 0.004727 1
2 2015-02-05 05:19:59 20.945 23.50 0.000000 449.0 0.003597 0
3 2015-02-05 12:01:00 22.200 27.10 465.666667 1110.0 0.004486 1
4 2015-02-04 21:38:00 21.500 25.79 0.000000 487.0 0.004088 0
5 2015-02-05 20:53:00 21.000 19.70 0.000000 476.0 0.003023 0

In [285]:
df2_sample_x=df2_sample[feature_cols]
df2_sample_y=df2_sample.Occupancy   
logreg.fit(df2_sample_x,df2_sample_y) #train model on sampled dataframe from first 20% of size 100


Out[285]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [286]:
y_pred_class_df3=logreg.predict(df3_x) #test on df3 which 3rd chunk or batch 
df3_score=metrics.accuracy_score(df3_y,y_pred_class_df3)
results.loc[1] = df3_score
results.head()


Out[286]:
Accuracy
0 0.920049
1 0.984029

In [287]:
df3_new=pd.concat([df2_new,df3],axis=0)  #concating row wise so axis =0, df3_new will be our new combined 30%

In [288]:
df3_new.shape


Out[288]:
(2441, 7)

In [289]:
df3_new = shuffle(df3_new)
df3_new.reset_index(drop=True,inplace=True)
df3_new.index = df3_new.index + 1
df3_new.head(2)


Out[289]:
date Temperature Humidity Light CO2 HumidityRatio Occupancy
1 2015-02-05 19:24:00 21.1 20.195 0.0 539.5 0.003118 0
2 2015-02-05 09:58:00 22.1 26.390 461.5 1038.5 0.004341 1

In [290]:
def reservoir_sample(size):
    reservoir=[]
    for index, row in df3_new.iterrows():
            if index<=size:
                  reservoir.append(row)
            elif random.random()<(1 / index):   
                 reservoir[random.randint(0, size-1)] = (row)
    return reservoir

In [291]:
df3_sample=pd.DataFrame(reservoir_sample(100)) #apply sample function of size 100 on df3_new which is 
                                               # our 30% batch and save in datframe df3_sample

In [292]:
df3_sample.head(5)


Out[292]:
date Temperature Humidity Light CO2 HumidityRatio Occupancy
1 2015-02-05 19:24:00 21.100000 20.195000 0.0 539.50 0.003118 0
109 2015-02-05 10:14:00 22.100000 26.700000 455.5 1049.00 0.004392 1
3 2015-02-05 10:06:00 22.100000 26.500000 449.5 1036.75 0.004359 1
4 2015-02-06 08:31:00 20.823333 19.666667 433.0 591.00 0.002985 1
5 2015-02-05 20:51:59 21.000000 19.700000 0.0 467.50 0.003023 0

In [293]:
df3_sample_x=df3_sample[feature_cols]
df3_sample_y=df3_sample.Occupancy   
logreg.fit(df3_sample_x,df3_sample_y) #train model on sampled dataframe from first 30% of size 100


Out[293]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [294]:
y_pred_class_df4=logreg.predict(df4_x) #test on df4 which 4th chunk or batch 
df4_score=metrics.accuracy_score(df4_y,y_pred_class_df4)
results.loc[2] = df4_score
results.head()


Out[294]:
Accuracy
0 0.920049
1 0.984029
2 0.986486

In [295]:
df4_new=pd.concat([df3_new,df4],axis=0)  #concating row waswise so axis =0, df4_new will be our new combined 40%

In [296]:
df4_new.shape


Out[296]:
(3255, 7)

In [297]:
df4_new = shuffle(df4_new)
df4_new.reset_index(drop=True,inplace=True)
df4_new.index = df4_new.index + 1
df4_new.head(2)


Out[297]:
date Temperature Humidity Light CO2 HumidityRatio Occupancy
1 2015-02-06 14:09:00 21.79 19.1000 546.5 654.50 0.003076 1
2 2015-02-06 10:24:00 21.34 21.1475 455.5 848.25 0.003315 1

In [298]:
def reservoir_sample(size):
    reservoir=[]
    for index, row in df4_new.iterrows():
            if index<=size:
                  reservoir.append(row)
            elif random.random()<(1 / index):   
                 reservoir[random.randint(0, size-1)] = (row)
    return reservoir

In [299]:
df4_sample=pd.DataFrame(reservoir_sample(100)) #apply sample function of size 100 on df4_new which is 
                                               # our 40% batch and save in datframe df4_sample

In [300]:
df4_sample.head(2)


Out[300]:
date Temperature Humidity Light CO2 HumidityRatio Occupancy
1 2015-02-06 14:09:00 21.79 19.1000 546.5 654.50 0.003076 1
2 2015-02-06 10:24:00 21.34 21.1475 455.5 848.25 0.003315 1

In [301]:
df4_sample_x=df4_sample[feature_cols]
df4_sample_y=df4_sample.Occupancy   
logreg.fit(df4_sample_x,df4_sample_y) #train model on sampled dataframe from first 30% of size 100


Out[301]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [302]:
y_pred_class_df5=logreg.predict(df5_x) #test on df5 which 5th chunk or batch 
df5_score=metrics.accuracy_score(df5_y,y_pred_class_df5)
results.loc[3] = df5_score
results.head()


Out[302]:
Accuracy
0 0.920049
1 0.984029
2 0.986486
3 0.957002

In [ ]:
# And so on till last batch arrives