notebook.community

Edit and run



In [2]:

    
import pandas as pd
import numpy as np
import matplotlib as plt
df=pd.read_table('C:\\Users\\Siddy\\Desktop\\occupancy_data\\datatraining.txt', sep=',')
df.dtypes









    Out[2]:





date              object
Temperature      float64
Humidity         float64
Light            float64
CO2              float64
HumidityRatio    float64
Occupancy          int64
dtype: object



In [3]:

    
df.shape









    Out[3]:





(8143, 7)



In [4]:

    
df.head(10)









    Out[4]:






  
    
      
      date
      Temperature
      Humidity
      Light
      CO2
      HumidityRatio
      Occupancy
    
  
  
    
      1
      2015-02-04 17:51:00
      23.180
      27.2720
      426.0
      721.250000
      0.004793
      1
    
    
      2
      2015-02-04 17:51:59
      23.150
      27.2675
      429.5
      714.000000
      0.004783
      1
    
    
      3
      2015-02-04 17:53:00
      23.150
      27.2450
      426.0
      713.500000
      0.004779
      1
    
    
      4
      2015-02-04 17:54:00
      23.150
      27.2000
      426.0
      708.250000
      0.004772
      1
    
    
      5
      2015-02-04 17:55:00
      23.100
      27.2000
      426.0
      704.500000
      0.004757
      1
    
    
      6
      2015-02-04 17:55:59
      23.100
      27.2000
      419.0
      701.000000
      0.004757
      1
    
    
      7
      2015-02-04 17:57:00
      23.100
      27.2000
      419.0
      701.666667
      0.004757
      1
    
    
      8
      2015-02-04 17:57:59
      23.100
      27.2000
      419.0
      699.000000
      0.004757
      1
    
    
      9
      2015-02-04 17:58:59
      23.100
      27.2000
      419.0
      689.333333
      0.004757
      1
    
    
      10
      2015-02-04 18:00:00
      23.075
      27.1750
      419.0
      688.000000
      0.004745
      1



In [5]:

    
df.columns









    Out[5]:





Index(['date', 'Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio',
       'Occupancy'],
      dtype='object')



In [6]:

    
df['date']=pd.to_datetime(df.date) #I have converted to datetime so we can use maths with datetime
df.dtypes









    Out[6]:





date             datetime64[ns]
Temperature             float64
Humidity                float64
Light                   float64
CO2                     float64
HumidityRatio           float64
Occupancy                 int64
dtype: object



In [7]:

    
ts=pd.to_datetime('2/4/2015')



In [8]:

    
df.date.max()









    Out[8]:





Timestamp('2015-02-10 09:33:00')



In [9]:

    
df.date.min()









    Out[9]:





Timestamp('2015-02-04 17:51:00')



In [10]:

    
(df.date.max()-df.date.min()).seconds









    Out[10]:





56520



In [11]:

    
df.shape









    Out[11]:





(8143, 7)



In [12]:

    
df.set_index('date',inplace=True)



In [104]:

    
df1 = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
df4 = pd.DataFrame()
df5 = pd.DataFrame()
df6 = pd.DataFrame()
df7 = pd.DataFrame()
df8 = pd.DataFrame()
df9 = pd.DataFrame()
df10 = pd.DataFrame()
df1 = df[:814]
df2= df[815:1628]
df3= df[1629:2443]
df4= df[2444:3258]
df5= df[3259:4073]
df6= df[4074:4888]
df7= df[4889:5703]
df8= df[5704:6518]
df9= df[6519:7333]
df10= df[7334:8148]



In [14]:

    
set(df['Occupancy'])









    Out[14]:





{0, 1}



In [75]:

    
feature_cols=['Temperature', 'Humidity', 'Light', 'CO2']
df1_train_x=df1[feature_cols]
df1_train_y=df1.Occupancy
df2_x=df2[feature_cols]
df2_y=df2.Occupancy
df3_x=df3[feature_cols]
df3_y=df3.Occupancy
df4_x=df4[feature_cols]
df4_y=df4.Occupancy
df5_x=df5[feature_cols]
df5_y=df5.Occupancy
df6_y=df6.Occupancy
df6_y=df6.Occupancy
df7_x=df7[feature_cols]
df7_y=df7.Occupancy
df8_x=df8[feature_cols]
df8_y=df8.Occupancy
df9_x=df9[feature_cols]
df9_y=df9.Occupancy
df10_x=df10[feature_cols]
df10_y=df10.Occupancy



In [76]:

    
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()



In [77]:

    
logreg.fit(df1_train_x,df1_train_y)









    Out[77]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [78]:

    
y_pred_class_df2=logreg.predict(df2_x) #test on df2 or next 10%



In [79]:

    
#Now we will check our classification model accuracy which we have trained on df1 and predicted values of or tested on df2



In [80]:

    
from sklearn import metrics
from sklearn.metrics import accuracy_score
df2_score=metrics.accuracy_score(df2_y,y_pred_class_df2)



In [81]:

    
results=pd.DataFrame({'Accuracy':[df2_score]}) # putting results in dataframe 
results.head()



In [82]:

    
#now we will train classification model on next 10% which is our df2 and test it on df3 and so on



In [83]:

    
logreg.fit(df2_x,df2_y)









    Out[83]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [84]:

    
y_pred_class_df3=logreg.predict(df3_x)
df3_score=metrics.accuracy_score(df3_y,y_pred_class_df3)



In [85]:

    
results.loc[1] = df3_score
results.head()



In [86]:

    
logreg.fit(df3_x,df3_y)









    Out[86]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [87]:

    
y_pred_class_df4=logreg.predict(df4_x)
df4_score=metrics.accuracy_score(df4_y,y_pred_class_df4)
results.loc[2] = df4_score
results.head()



In [88]:

    
logreg.fit(df4_x,df4_y)









    Out[88]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [89]:

    
y_pred_class_df5=logreg.predict(df5_x)
df5_score=metrics.accuracy_score(df5_y,y_pred_class_df5)
results.loc[3] = df5_score
results.head()



In [94]:

    
logreg.fit(df5_x, df5_y)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-94-52e9be94bffe> in <module>()
----> 1 logreg.fit(df5_x, df5_y)

C:\anaconda\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
   1184                 self.class_weight, self.penalty, self.dual, self.verbose,
   1185                 self.max_iter, self.tol, self.random_state,
-> 1186                 sample_weight=sample_weight)
   1187             self.n_iter_ = np.array([n_iter_])
   1188             return self

C:\anaconda\lib\site-packages\sklearn\svm\base.py in _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, penalty, dual, verbose, max_iter, tol, random_state, multi_class, loss, epsilon, sample_weight)
    873             raise ValueError("This solver needs samples of at least 2 classes"
    874                              " in the data, but the data contains only one"
--> 875                              " class: %r" % classes_[0])
    876 
    877         class_weight_ = compute_class_weight(class_weight, classes_, y)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0



In [ ]:

    
couldent find much so just moved on to different classification SVM



In [95]:

    
from sklearn import svm
svm_classifier = svm.SVC()
svm_classifier.fit(df1_train_x,df1_train_y)









    Out[95]:





SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)



In [96]:

    
y_pred_svmclass_df2=svm_classifier.predict(df2_x)
df2_svmscore=metrics.accuracy_score(df2_y,y_pred_svmclass_df2)
results_svm=pd.DataFrame({'Accuracy of SVM':[df2_svmscore]}) # putting results in dataframe 
results_svm.head()









    Out[96]:






  
    
      
      Accuracy of SVM
    
  
  
    
      0
      0.337023



In [97]:

    
svm_classifier.fit(df2_x,df2_y)









    Out[97]:





SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)



In [98]:

    
y_pred_svmclass_df3=svm_classifier.predict(df3_x)
df3_svmscore=metrics.accuracy_score(df3_y,y_pred_svmclass_df3)
results_svm.loc[1] = df3_svmscore
results_svm.head()









    Out[98]:






  
    
      
      Accuracy of SVM
    
  
  
    
      0
      0.337023
    
    
      1
      0.341523



In [99]:

    
svm_classifier.fit(df3_x,df3_y)









    Out[99]:





SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)



In [100]:

    
y_pred_svmclass_df4=svm_classifier.predict(df4_x)
df4_svmscore=metrics.accuracy_score(df4_y,y_pred_svmclass_df4)
results_svm.loc[2] = df4_svmscore
results_svm.head()









    Out[100]:






  
    
      
      Accuracy of SVM
    
  
  
    
      0
      0.337023
    
    
      1
      0.341523
    
    
      2
      0.796069



In [101]:

    
svm_classifier.fit(df4_x,df4_y)









    Out[101]:





SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)



In [102]:

    
y_pred_svmclass_df5=svm_classifier.predict(df5_x)
df5_svmscore=metrics.accuracy_score(df5_y,y_pred_svmclass_df5)
results_svm.loc[3] = df5_svmscore
results_svm.head()









    Out[102]:






  
    
      
      Accuracy of SVM
    
  
  
    
      0
      0.337023
    
    
      1
      0.341523
    
    
      2
      0.796069
    
    
      3
      0.588452



In [103]:

    
svm_classifier.fit(df5_x,df5_y)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-103-2edba68e8a05> in <module>()
----> 1 svm_classifier.fit(df5_x,df5_y)

C:\anaconda\lib\site-packages\sklearn\svm\base.py in fit(self, X, y, sample_weight)
    150 
    151         X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
--> 152         y = self._validate_targets(y)
    153 
    154         sample_weight = np.asarray([]

C:\anaconda\lib\site-packages\sklearn\svm\base.py in _validate_targets(self, y)
    524             raise ValueError(
    525                 "The number of classes has to be greater than one; got %d"
--> 526                 % len(cls))
    527 
    528         self.classes_ = cls

ValueError: The number of classes has to be greater than one; got 1



In [ ]:

    
#got same error as logistic regression

	Accuracy
0	0.916359
1	0.969287
2	0.880835

	Accuracy
0	0.916359
1	0.969287
2	0.880835
3	0.995086

	date	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
1	2015-02-04 17:51:00	23.180	27.2720	426.0	721.250000	0.004793	1
2	2015-02-04 17:51:59	23.150	27.2675	429.5	714.000000	0.004783	1
3	2015-02-04 17:53:00	23.150	27.2450	426.0	713.500000	0.004779	1
4	2015-02-04 17:54:00	23.150	27.2000	426.0	708.250000	0.004772	1
5	2015-02-04 17:55:00	23.100	27.2000	426.0	704.500000	0.004757	1
6	2015-02-04 17:55:59	23.100	27.2000	419.0	701.000000	0.004757	1
7	2015-02-04 17:57:00	23.100	27.2000	419.0	701.666667	0.004757	1
8	2015-02-04 17:57:59	23.100	27.2000	419.0	699.000000	0.004757	1
9	2015-02-04 17:58:59	23.100	27.2000	419.0	689.333333	0.004757	1
10	2015-02-04 18:00:00	23.075	27.1750	419.0	688.000000	0.004745	1