notebook.community

Edit and run



In [262]:

    
import pandas as pd
import numpy as np
import matplotlib as plt
df=pd.read_table('C:\\Users\\Siddy\\Desktop\\occupancy_data\\datatraining.txt', sep=',')
df.dtypes









    Out[262]:





date              object
Temperature      float64
Humidity         float64
Light            float64
CO2              float64
HumidityRatio    float64
Occupancy          int64
dtype: object



In [263]:

    
df.head(2)









    Out[263]:






  
    
      
      date
      Temperature
      Humidity
      Light
      CO2
      HumidityRatio
      Occupancy
    
  
  
    
      1
      2015-02-04 17:51:00
      23.18
      27.2720
      426.0
      721.25
      0.004793
      1
    
    
      2
      2015-02-04 17:51:59
      23.15
      27.2675
      429.5
      714.00
      0.004783
      1



In [264]:

    
df.columns









    Out[264]:





Index(['date', 'Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio',
       'Occupancy'],
      dtype='object')



In [265]:

    
df1 = pd.DataFrame()
df2 = pd.DataFrame()
df2_new = pd.DataFrame()
df3 = pd.DataFrame()
df3_new = pd.DataFrame()
df4 = pd.DataFrame()
df4_new = pd.DataFrame()
df5 = pd.DataFrame()
df5_new = pd.DataFrame()
df6 = pd.DataFrame()
df6_new = pd.DataFrame()
df7 = pd.DataFrame()
df7_new = pd.DataFrame()
df8 = pd.DataFrame()
df8_new = pd.DataFrame()
df9 = pd.DataFrame()
df9_new = pd.DataFrame()
df10 = pd.DataFrame()
df10_new = pd.DataFrame()



In [266]:

    
df1 = df[:814]
df2= df[815:1628]
df3= df[1629:2443]
df4= df[2444:3258]
df5= df[3259:4073]
df6= df[4074:4888]
df7= df[4889:5703]
df8= df[5704:6518]
df9= df[6519:7333]
df10= df[7334:8148]



In [267]:

    
import random
def reservoir_sample(size):
    reservoir=[]
    for index, row in df1.iterrows():
            if index<=size:
                  reservoir.append(row)
            elif random.random()<(1 / index):   
                 reservoir[random.randint(0, size-1)] = (row)
    return reservoir



In [268]:

    
df1_sample=pd.DataFrame(reservoir_sample(100)) #apply sample function of size 100 on df1
                                               #which is our 10% batch and save in datframe df1_sample



In [269]:

    
df1_sample.shape #cross check sampled dataframe size









    Out[269]:





(100, 7)



In [270]:

    
df1_sample.head()









    Out[270]:






  
    
      
      date
      Temperature
      Humidity
      Light
      CO2
      HumidityRatio
      Occupancy
    
  
  
    
      1
      2015-02-04 17:51:00
      23.18
      27.2720
      426.0
      721.25
      0.004793
      1
    
    
      2
      2015-02-04 17:51:59
      23.15
      27.2675
      429.5
      714.00
      0.004783
      1
    
    
      3
      2015-02-04 17:53:00
      23.15
      27.2450
      426.0
      713.50
      0.004779
      1
    
    
      4
      2015-02-04 17:54:00
      23.15
      27.2000
      426.0
      708.25
      0.004772
      1
    
    
      5
      2015-02-04 17:55:00
      23.10
      27.2000
      426.0
      704.50
      0.004757
      1



In [271]:

    
feature_cols=['Temperature', 'Humidity', 'Light', 'CO2']
df1_train_x=df1_sample[feature_cols]                          #create feature and lable of sampled dataframe 
df1_train_y=df1_sample.Occupancy
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()
logreg.fit(df1_train_x,df1_train_y)                            #train model sampled dataframe









    Out[271]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [272]:

    
df2_x=df2[feature_cols]
df2_y=df2.Occupancy
df3_x=df3[feature_cols]
df3_y=df3.Occupancy
df4_x=df4[feature_cols]
df4_y=df4.Occupancy
df5_x=df5[feature_cols]
df5_y=df5.Occupancy
df6_x=df6[feature_cols]
df6_y=df6.Occupancy
df7_x=df7[feature_cols]
df7_y=df7.Occupancy
df8_x=df8[feature_cols]
df8_y=df8.Occupancy
df9_x=df9[feature_cols]
df9_y=df9.Occupancy
df10_x=df10[feature_cols]
df10_y=df10.Occupancy



In [273]:

    
y_pred_class_df2=logreg.predict(df2_x) #test on df2 or next 10%



In [274]:

    
from sklearn import metrics
from sklearn.metrics import accuracy_score
df2_score=metrics.accuracy_score(df2_y,y_pred_class_df2)



In [275]:

    
results=pd.DataFrame({'Accuracy':[df2_score]}) # putting results in dataframe 
results.head()



In [276]:

    
df2_new=pd.concat([df1,df2],axis=0)  #concating row waswise so axis =0, df2_new will be our new combined 20%



In [277]:

    
df2_new.shape









    Out[277]:





(1627, 7)



In [278]:

    
from sklearn.utils import shuffle
df2_new = shuffle(df2_new)



In [279]:

    
df2_new.reset_index(drop=True,inplace=True)
df2_new.index = df2_new.index + 1
df2_new.head(2)









    Out[279]:






  
    
      
      date
      Temperature
      Humidity
      Light
      CO2
      HumidityRatio
      Occupancy
    
  
  
    
      1
      2015-02-05 14:41:00
      22.290
      28.39
      479.0
      1021.0
      0.004727
      1
    
    
      2
      2015-02-05 05:19:59
      20.945
      23.50
      0.0
      449.0
      0.003597
      0



In [280]:

    
df2_new.shape









    Out[280]:





(1627, 7)



In [281]:

    
def reservoir_sample(size):
    reservoir=[]
    for index, row in df2_new.iterrows():
            if index<=size:
                  reservoir.append(row)
            elif random.random()<(1 / index):   
                 reservoir[random.randint(0, size-1)] = (row)
    return reservoir



In [282]:

    
df2_sample=pd.DataFrame(reservoir_sample(100)) #apply sample function of size 100 on df2_new which is 
                                               # our 20% batch and save in datframe df2_sample



In [283]:

    
df2_sample.shape #cross check sampled dataframe size









    Out[283]:





(100, 7)



In [284]:

    
df2_sample.head()









    Out[284]:






  
    
      
      date
      Temperature
      Humidity
      Light
      CO2
      HumidityRatio
      Occupancy
    
  
  
    
      1
      2015-02-05 14:41:00
      22.290
      28.39
      479.000000
      1021.0
      0.004727
      1
    
    
      2
      2015-02-05 05:19:59
      20.945
      23.50
      0.000000
      449.0
      0.003597
      0
    
    
      3
      2015-02-05 12:01:00
      22.200
      27.10
      465.666667
      1110.0
      0.004486
      1
    
    
      4
      2015-02-04 21:38:00
      21.500
      25.79
      0.000000
      487.0
      0.004088
      0
    
    
      5
      2015-02-05 20:53:00
      21.000
      19.70
      0.000000
      476.0
      0.003023
      0



In [285]:

    
df2_sample_x=df2_sample[feature_cols]
df2_sample_y=df2_sample.Occupancy   
logreg.fit(df2_sample_x,df2_sample_y) #train model on sampled dataframe from first 20% of size 100









    Out[285]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [286]:

    
y_pred_class_df3=logreg.predict(df3_x) #test on df3 which 3rd chunk or batch 
df3_score=metrics.accuracy_score(df3_y,y_pred_class_df3)
results.loc[1] = df3_score
results.head()



In [287]:

    
df3_new=pd.concat([df2_new,df3],axis=0)  #concating row wise so axis =0, df3_new will be our new combined 30%



In [288]:

    
df3_new.shape









    Out[288]:





(2441, 7)



In [289]:

    
df3_new = shuffle(df3_new)
df3_new.reset_index(drop=True,inplace=True)
df3_new.index = df3_new.index + 1
df3_new.head(2)









    Out[289]:






  
    
      
      date
      Temperature
      Humidity
      Light
      CO2
      HumidityRatio
      Occupancy
    
  
  
    
      1
      2015-02-05 19:24:00
      21.1
      20.195
      0.0
      539.5
      0.003118
      0
    
    
      2
      2015-02-05 09:58:00
      22.1
      26.390
      461.5
      1038.5
      0.004341
      1



In [290]:

    
def reservoir_sample(size):
    reservoir=[]
    for index, row in df3_new.iterrows():
            if index<=size:
                  reservoir.append(row)
            elif random.random()<(1 / index):   
                 reservoir[random.randint(0, size-1)] = (row)
    return reservoir



In [291]:

    
df3_sample=pd.DataFrame(reservoir_sample(100)) #apply sample function of size 100 on df3_new which is 
                                               # our 30% batch and save in datframe df3_sample



In [292]:

    
df3_sample.head(5)









    Out[292]:






  
    
      
      date
      Temperature
      Humidity
      Light
      CO2
      HumidityRatio
      Occupancy
    
  
  
    
      1
      2015-02-05 19:24:00
      21.100000
      20.195000
      0.0
      539.50
      0.003118
      0
    
    
      109
      2015-02-05 10:14:00
      22.100000
      26.700000
      455.5
      1049.00
      0.004392
      1
    
    
      3
      2015-02-05 10:06:00
      22.100000
      26.500000
      449.5
      1036.75
      0.004359
      1
    
    
      4
      2015-02-06 08:31:00
      20.823333
      19.666667
      433.0
      591.00
      0.002985
      1
    
    
      5
      2015-02-05 20:51:59
      21.000000
      19.700000
      0.0
      467.50
      0.003023
      0



In [293]:

    
df3_sample_x=df3_sample[feature_cols]
df3_sample_y=df3_sample.Occupancy   
logreg.fit(df3_sample_x,df3_sample_y) #train model on sampled dataframe from first 30% of size 100









    Out[293]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [294]:

    
y_pred_class_df4=logreg.predict(df4_x) #test on df4 which 4th chunk or batch 
df4_score=metrics.accuracy_score(df4_y,y_pred_class_df4)
results.loc[2] = df4_score
results.head()



In [295]:

    
df4_new=pd.concat([df3_new,df4],axis=0)  #concating row waswise so axis =0, df4_new will be our new combined 40%



In [296]:

    
df4_new.shape









    Out[296]:





(3255, 7)



In [297]:

    
df4_new = shuffle(df4_new)
df4_new.reset_index(drop=True,inplace=True)
df4_new.index = df4_new.index + 1
df4_new.head(2)









    Out[297]:






  
    
      
      date
      Temperature
      Humidity
      Light
      CO2
      HumidityRatio
      Occupancy
    
  
  
    
      1
      2015-02-06 14:09:00
      21.79
      19.1000
      546.5
      654.50
      0.003076
      1
    
    
      2
      2015-02-06 10:24:00
      21.34
      21.1475
      455.5
      848.25
      0.003315
      1



In [298]:

    
def reservoir_sample(size):
    reservoir=[]
    for index, row in df4_new.iterrows():
            if index<=size:
                  reservoir.append(row)
            elif random.random()<(1 / index):   
                 reservoir[random.randint(0, size-1)] = (row)
    return reservoir



In [299]:

    
df4_sample=pd.DataFrame(reservoir_sample(100)) #apply sample function of size 100 on df4_new which is 
                                               # our 40% batch and save in datframe df4_sample



In [300]:

    
df4_sample.head(2)









    Out[300]:






  
    
      
      date
      Temperature
      Humidity
      Light
      CO2
      HumidityRatio
      Occupancy
    
  
  
    
      1
      2015-02-06 14:09:00
      21.79
      19.1000
      546.5
      654.50
      0.003076
      1
    
    
      2
      2015-02-06 10:24:00
      21.34
      21.1475
      455.5
      848.25
      0.003315
      1



In [301]:

    
df4_sample_x=df4_sample[feature_cols]
df4_sample_y=df4_sample.Occupancy   
logreg.fit(df4_sample_x,df4_sample_y) #train model on sampled dataframe from first 30% of size 100









    Out[301]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [302]:

    
y_pred_class_df5=logreg.predict(df5_x) #test on df5 which 5th chunk or batch 
df5_score=metrics.accuracy_score(df5_y,y_pred_class_df5)
results.loc[3] = df5_score
results.head()



In [ ]:

    
# And so on till last batch arrives

	Accuracy
0	0.920049
1	0.984029
2	0.986486

	Accuracy
0	0.920049
1	0.984029
2	0.986486
3	0.957002

	date	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
1	2015-02-04 17:51:00	23.18	27.2720	426.0	721.25	0.004793	1
2	2015-02-04 17:51:59	23.15	27.2675	429.5	714.00	0.004783	1

	date	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
1	2015-02-05 14:41:00	22.290	28.39	479.0	1021.0	0.004727	1
2	2015-02-05 05:19:59	20.945	23.50	0.0	449.0	0.003597	0

	date	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
1	2015-02-05 19:24:00	21.1	20.195	0.0	539.5	0.003118	0
2	2015-02-05 09:58:00	22.1	26.390	461.5	1038.5	0.004341	1

	date	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
1	2015-02-05 19:24:00	21.100000	20.195000	0.0	539.50	0.003118	0
109	2015-02-05 10:14:00	22.100000	26.700000	455.5	1049.00	0.004392	1
3	2015-02-05 10:06:00	22.100000	26.500000	449.5	1036.75	0.004359	1
4	2015-02-06 08:31:00	20.823333	19.666667	433.0	591.00	0.002985	1
5	2015-02-05 20:51:59	21.000000	19.700000	0.0	467.50	0.003023	0

	date	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
1	2015-02-06 14:09:00	21.79	19.1000	546.5	654.50	0.003076	1
2	2015-02-06 10:24:00	21.34	21.1475	455.5	848.25	0.003315	1