In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
df=pd.read_table('C:\\Users\\Siddy\\Desktop\\occupancy_data\\datatraining.txt', sep=',')
df.dtypes


Out[2]:
date              object
Temperature      float64
Humidity         float64
Light            float64
CO2              float64
HumidityRatio    float64
Occupancy          int64
dtype: object

In [3]:
df.shape


Out[3]:
(8143, 7)

In [4]:
df.head(10)


Out[4]:
date Temperature Humidity Light CO2 HumidityRatio Occupancy
1 2015-02-04 17:51:00 23.180 27.2720 426.0 721.250000 0.004793 1
2 2015-02-04 17:51:59 23.150 27.2675 429.5 714.000000 0.004783 1
3 2015-02-04 17:53:00 23.150 27.2450 426.0 713.500000 0.004779 1
4 2015-02-04 17:54:00 23.150 27.2000 426.0 708.250000 0.004772 1
5 2015-02-04 17:55:00 23.100 27.2000 426.0 704.500000 0.004757 1
6 2015-02-04 17:55:59 23.100 27.2000 419.0 701.000000 0.004757 1
7 2015-02-04 17:57:00 23.100 27.2000 419.0 701.666667 0.004757 1
8 2015-02-04 17:57:59 23.100 27.2000 419.0 699.000000 0.004757 1
9 2015-02-04 17:58:59 23.100 27.2000 419.0 689.333333 0.004757 1
10 2015-02-04 18:00:00 23.075 27.1750 419.0 688.000000 0.004745 1

In [5]:
df.columns


Out[5]:
Index(['date', 'Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio',
       'Occupancy'],
      dtype='object')

In [6]:
df['date']=pd.to_datetime(df.date) #I have converted to datetime so we can use maths with datetime
df.dtypes


Out[6]:
date             datetime64[ns]
Temperature             float64
Humidity                float64
Light                   float64
CO2                     float64
HumidityRatio           float64
Occupancy                 int64
dtype: object

In [7]:
ts=pd.to_datetime('2/4/2015')

In [8]:
df.date.max()


Out[8]:
Timestamp('2015-02-10 09:33:00')

In [9]:
df.date.min()


Out[9]:
Timestamp('2015-02-04 17:51:00')

In [10]:
(df.date.max()-df.date.min()).seconds


Out[10]:
56520

In [11]:
df.shape


Out[11]:
(8143, 7)

In [12]:
df.set_index('date',inplace=True)

In [104]:
df1 = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
df4 = pd.DataFrame()
df5 = pd.DataFrame()
df6 = pd.DataFrame()
df7 = pd.DataFrame()
df8 = pd.DataFrame()
df9 = pd.DataFrame()
df10 = pd.DataFrame()
df1 = df[:814]
df2= df[815:1628]
df3= df[1629:2443]
df4= df[2444:3258]
df5= df[3259:4073]
df6= df[4074:4888]
df7= df[4889:5703]
df8= df[5704:6518]
df9= df[6519:7333]
df10= df[7334:8148]

In [14]:
set(df['Occupancy'])


Out[14]:
{0, 1}

In [75]:
feature_cols=['Temperature', 'Humidity', 'Light', 'CO2']
df1_train_x=df1[feature_cols]
df1_train_y=df1.Occupancy
df2_x=df2[feature_cols]
df2_y=df2.Occupancy
df3_x=df3[feature_cols]
df3_y=df3.Occupancy
df4_x=df4[feature_cols]
df4_y=df4.Occupancy
df5_x=df5[feature_cols]
df5_y=df5.Occupancy
df6_y=df6.Occupancy
df6_y=df6.Occupancy
df7_x=df7[feature_cols]
df7_y=df7.Occupancy
df8_x=df8[feature_cols]
df8_y=df8.Occupancy
df9_x=df9[feature_cols]
df9_y=df9.Occupancy
df10_x=df10[feature_cols]
df10_y=df10.Occupancy

In [76]:
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()

In [77]:
logreg.fit(df1_train_x,df1_train_y)


Out[77]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [78]:
y_pred_class_df2=logreg.predict(df2_x) #test on df2 or next 10%

In [79]:
#Now we will check our classification model accuracy which we have trained on df1 and predicted values of or tested on df2

In [80]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
df2_score=metrics.accuracy_score(df2_y,y_pred_class_df2)

In [81]:
results=pd.DataFrame({'Accuracy':[df2_score]}) # putting results in dataframe 
results.head()


Out[81]:
Accuracy
0 0.916359

In [82]:
#now we will train classification model on next 10% which is our df2 and test it on df3 and so on

In [83]:
logreg.fit(df2_x,df2_y)


Out[83]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [84]:
y_pred_class_df3=logreg.predict(df3_x)
df3_score=metrics.accuracy_score(df3_y,y_pred_class_df3)

In [85]:
results.loc[1] = df3_score
results.head()


Out[85]:
Accuracy
0 0.916359
1 0.969287

In [86]:
logreg.fit(df3_x,df3_y)


Out[86]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [87]:
y_pred_class_df4=logreg.predict(df4_x)
df4_score=metrics.accuracy_score(df4_y,y_pred_class_df4)
results.loc[2] = df4_score
results.head()


Out[87]:
Accuracy
0 0.916359
1 0.969287
2 0.880835

In [88]:
logreg.fit(df4_x,df4_y)


Out[88]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [89]:
y_pred_class_df5=logreg.predict(df5_x)
df5_score=metrics.accuracy_score(df5_y,y_pred_class_df5)
results.loc[3] = df5_score
results.head()


Out[89]:
Accuracy
0 0.916359
1 0.969287
2 0.880835
3 0.995086

In [94]:
logreg.fit(df5_x, df5_y)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-94-52e9be94bffe> in <module>()
----> 1 logreg.fit(df5_x, df5_y)

C:\anaconda\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
   1184                 self.class_weight, self.penalty, self.dual, self.verbose,
   1185                 self.max_iter, self.tol, self.random_state,
-> 1186                 sample_weight=sample_weight)
   1187             self.n_iter_ = np.array([n_iter_])
   1188             return self

C:\anaconda\lib\site-packages\sklearn\svm\base.py in _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, penalty, dual, verbose, max_iter, tol, random_state, multi_class, loss, epsilon, sample_weight)
    873             raise ValueError("This solver needs samples of at least 2 classes"
    874                              " in the data, but the data contains only one"
--> 875                              " class: %r" % classes_[0])
    876 
    877         class_weight_ = compute_class_weight(class_weight, classes_, y)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [ ]:
couldent find much so just moved on to different classification SVM

In [95]:
from sklearn import svm
svm_classifier = svm.SVC()
svm_classifier.fit(df1_train_x,df1_train_y)


Out[95]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [96]:
y_pred_svmclass_df2=svm_classifier.predict(df2_x)
df2_svmscore=metrics.accuracy_score(df2_y,y_pred_svmclass_df2)
results_svm=pd.DataFrame({'Accuracy of SVM':[df2_svmscore]}) # putting results in dataframe 
results_svm.head()


Out[96]:
Accuracy of SVM
0 0.337023

In [97]:
svm_classifier.fit(df2_x,df2_y)


Out[97]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [98]:
y_pred_svmclass_df3=svm_classifier.predict(df3_x)
df3_svmscore=metrics.accuracy_score(df3_y,y_pred_svmclass_df3)
results_svm.loc[1] = df3_svmscore
results_svm.head()


Out[98]:
Accuracy of SVM
0 0.337023
1 0.341523

In [99]:
svm_classifier.fit(df3_x,df3_y)


Out[99]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [100]:
y_pred_svmclass_df4=svm_classifier.predict(df4_x)
df4_svmscore=metrics.accuracy_score(df4_y,y_pred_svmclass_df4)
results_svm.loc[2] = df4_svmscore
results_svm.head()


Out[100]:
Accuracy of SVM
0 0.337023
1 0.341523
2 0.796069

In [101]:
svm_classifier.fit(df4_x,df4_y)


Out[101]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [102]:
y_pred_svmclass_df5=svm_classifier.predict(df5_x)
df5_svmscore=metrics.accuracy_score(df5_y,y_pred_svmclass_df5)
results_svm.loc[3] = df5_svmscore
results_svm.head()


Out[102]:
Accuracy of SVM
0 0.337023
1 0.341523
2 0.796069
3 0.588452

In [103]:
svm_classifier.fit(df5_x,df5_y)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-103-2edba68e8a05> in <module>()
----> 1 svm_classifier.fit(df5_x,df5_y)

C:\anaconda\lib\site-packages\sklearn\svm\base.py in fit(self, X, y, sample_weight)
    150 
    151         X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
--> 152         y = self._validate_targets(y)
    153 
    154         sample_weight = np.asarray([]

C:\anaconda\lib\site-packages\sklearn\svm\base.py in _validate_targets(self, y)
    524             raise ValueError(
    525                 "The number of classes has to be greater than one; got %d"
--> 526                 % len(cls))
    527 
    528         self.classes_ = cls

ValueError: The number of classes has to be greater than one; got 1

In [ ]:
#got same error as logistic regression