notebook.community

Edit and run



In [19]:

    
import pandas as pd
import numpy as np
import matplotlib as plt
df=pd.read_table('C:\\Users\\Siddy\\Desktop\\occupancy_data\\datatraining.txt', sep=',')
df.dtypes









    Out[19]:





date              object
Temperature      float64
Humidity         float64
Light            float64
CO2              float64
HumidityRatio    float64
Occupancy          int64
dtype: object



In [20]:

    
df.head(2)









    Out[20]:






  
    
      
      date
      Temperature
      Humidity
      Light
      CO2
      HumidityRatio
      Occupancy
    
  
  
    
      1
      2015-02-04 17:51:00
      23.18
      27.2720
      426.0
      721.25
      0.004793
      1
    
    
      2
      2015-02-04 17:51:59
      23.15
      27.2675
      429.5
      714.00
      0.004783
      1



In [21]:

    
df['date']=pd.to_datetime(df.date) #I have converted to datetime so we can use maths with datetime
df.dtypes









    Out[21]:





date             datetime64[ns]
Temperature             float64
Humidity                float64
Light                   float64
CO2                     float64
HumidityRatio           float64
Occupancy                 int64
dtype: object



In [44]:

    
df.set_index('date',inplace=True)



In [45]:

    
df1 = pd.DataFrame()
df2 = pd.DataFrame()
df2_new = pd.DataFrame()
df3 = pd.DataFrame()
df3_new = pd.DataFrame()
df4 = pd.DataFrame()
df4_new = pd.DataFrame()
df5 = pd.DataFrame()
df5_new = pd.DataFrame()
df6 = pd.DataFrame()
df6_new = pd.DataFrame()
df7 = pd.DataFrame()
df7_new = pd.DataFrame()
df8 = pd.DataFrame()
df8_new = pd.DataFrame()
df9 = pd.DataFrame()
df9_new = pd.DataFrame()
df10 = pd.DataFrame()
df10_new = pd.DataFrame()



In [46]:

    
df1 = df[:814]
df2= df[815:1628]
df3= df[1629:2443]
df4= df[2444:3258]
df5= df[3259:4073]
df6= df[4074:4888]
df7= df[4889:5703]
df8= df[5704:6518]
df9= df[6519:7333]
df10= df[7334:8148]



In [47]:

    
feature_cols=['Temperature', 'Humidity', 'Light', 'CO2']
df1_train_x=df1[feature_cols]
df1_train_y=df1.Occupancy
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()
logreg.fit(df1_train_x,df1_train_y)









    Out[47]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [90]:

    
df2_x=df2[feature_cols]
df2_y=df2.Occupancy
df3_x=df3[feature_cols]
df3_y=df3.Occupancy
df4_x=df4[feature_cols]
df4_y=df4.Occupancy
df5_x=df5[feature_cols]
df5_y=df5.Occupancy
df6_x=df6[feature_cols]
df6_y=df6.Occupancy
df7_x=df7[feature_cols]
df7_y=df7.Occupancy
df8_x=df8[feature_cols]
df8_y=df8.Occupancy
df9_x=df9[feature_cols]
df9_y=df9.Occupancy
df10_x=df10[feature_cols]
df10_y=df10.Occupancy



In [91]:

    
y_pred_class_df2=logreg.predict(df2_x) #test on df2 or next 10%



In [92]:

    
from sklearn import metrics
from sklearn.metrics import accuracy_score
df2_score=metrics.accuracy_score(df2_y,y_pred_class_df2)



In [93]:

    
results=pd.DataFrame({'Accuracy':[df2_score]}) # putting results in dataframe 
results.head()



In [94]:

    
df2_new=pd.concat([df1,df2],axis=0)  #concating row waswise so axis =0, df2_new will be our new combined 20%



In [95]:

    
df2_new.shape #lets check shape









    Out[95]:





(1627, 6)



In [96]:

    
df2_new_x=df2_new[feature_cols]
df2_new_y=df2_new.Occupancy



In [97]:

    
logreg.fit(df2_new_x,df2_new_y) #train model on combined 20%









    Out[97]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [98]:

    
y_pred_class_df3=logreg.predict(df3_x) #test on df3 which 3rd chunk or batch



In [99]:

    
df3_score=metrics.accuracy_score(df3_y,y_pred_class_df3)



In [100]:

    
results.loc[1] = df3_score
results.head()



In [101]:

    
df3_new=pd.concat([df2_new,df3],axis=0)  #concating row waswise so axis =0, df3_new will be our new combined 30%



In [102]:

    
df3_new.shape









    Out[102]:





(2441, 6)



In [103]:

    
df3_new_x=df3_new[feature_cols]
df3_new_y=df3_new.Occupancy



In [104]:

    
logreg.fit(df3_new_x,df3_new_y) #train model on combined 30%









    Out[104]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [105]:

    
y_pred_class_df4=logreg.predict(df4_x) #test on df4 which 4th chunk or batch



In [106]:

    
df4_score=metrics.accuracy_score(df4_y,y_pred_class_df4)



In [107]:

    
results.loc[2] = df4_score
results.head()



In [108]:

    
df4_new=pd.concat([df3_new,df4],axis=0)  #concating row waswise so axis =0, df4_new will be our new combined 40%



In [109]:

    
df4_new.shape









    Out[109]:





(3255, 6)



In [110]:

    
df4_new_x=df4_new[feature_cols]
df4_new_y=df4_new.Occupancy



In [111]:

    
logreg.fit(df4_new_x,df4_new_y) #train model on combined 40%









    Out[111]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [112]:

    
y_pred_class_df5=logreg.predict(df5_x) #test on df5 which 5th chunk or batch



In [113]:

    
df5_score=metrics.accuracy_score(df5_y,y_pred_class_df5)



In [114]:

    
results.loc[3] = df5_score
results.head()



In [115]:

    
df5_new=pd.concat([df4_new,df5],axis=0)  #concating row waswise so axis =0, df5_new will be our new combined 50%



In [116]:

    
df5_new.shape









    Out[116]:





(4069, 6)



In [117]:

    
df5_new_x=df5_new[feature_cols]
df5_new_y=df5_new.Occupancy



In [118]:

    
logreg.fit(df5_new_x,df5_new_y) #train model on combined 50%









    Out[118]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [119]:

    
y_pred_class_df6=logreg.predict(df6_x) #test on df6 which 6th chunk or batch



In [120]:

    
df6_score=metrics.accuracy_score(df6_y,y_pred_class_df6)



In [121]:

    
results.loc[4] = df6_score
results.head()



In [122]:

    
df6_new=pd.concat([df5_new,df6],axis=0)  #concating row waswise so axis =0, df6_new will be our new combined 60%



In [123]:

    
df6_new.shape









    Out[123]:





(4883, 6)



In [124]:

    
df6_new_x=df6_new[feature_cols]
df6_new_y=df6_new.Occupancy
logreg.fit(df6_new_x,df6_new_y) #train model on combined 60%









    Out[124]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [126]:

    
y_pred_class_df7=logreg.predict(df7_x) #test on df7 which 7th chunk or batch 
df7_score=metrics.accuracy_score(df7_y,y_pred_class_df7)
results.loc[5] = df7_score
results.head(10)



In [127]:

    
df7_new=pd.concat([df6_new,df7],axis=0)  #concating row waswise so axis =0, df7_new will be our new combined 70%
df7_new.shape









    Out[127]:





(5697, 6)



In [128]:

    
df7_new_x=df7_new[feature_cols]
df7_new_y=df7_new.Occupancy
logreg.fit(df7_new_x,df7_new_y) #train model on combined 70%









    Out[128]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [129]:

    
y_pred_class_df8=logreg.predict(df8_x) #test on df8 which 8th chunk or batch 
df8_score=metrics.accuracy_score(df8_y,y_pred_class_df8)
results.loc[6] = df8_score
results.head(10)



In [ ]:

    
#And So on

	Accuracy
0	0.906519
1	0.991400
2	0.987715

	Accuracy
0	0.906519
1	0.991400
2	0.987715
3	0.857494

	Accuracy
0	0.906519
1	0.991400
2	0.987715
3	0.857494
4	1.000000

	Accuracy
0	0.906519
1	0.991400
2	0.987715
3	0.857494
4	1.000000
5	1.000000

	Accuracy
0	0.906519
1	0.991400
2	0.987715
3	0.857494
4	1.000000
5	1.000000
6	1.000000

	date	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
1	2015-02-04 17:51:00	23.18	27.2720	426.0	721.25	0.004793	1
2	2015-02-04 17:51:59	23.15	27.2675	429.5	714.00	0.004783	1