Useful libraries for this assignment
In [25]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
Loading the training and test data sets using NumPy
In [26]:
#Data Loading
#Training Data
x_train=np.loadtxt("../data/Train/X_train.txt")
y_train=np.loadtxt("../data/Train/y_train.txt")
subjects_train=np.loadtxt("../data/Train/subject_id_train.txt")
#Test Data
x_test=np.loadtxt("../data/Test/X_test.txt")
y_test=np.loadtxt("../data/Test/y_test.txt")
subjects_test=np.loadtxt("../data/Test/subject_id_test.txt")
x_total=np.concatenate((x_train,x_test))
y_total=np.concatenate((y_train,y_test))
print "x_train:\n"
print x_train
print "y_train:\n"
print y_train
print "x_test:\n"
print x_test
print "y_test:\n"
print y_test
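A quick sanity check right after loading helps catch path or parsing problems. The cell below is an added sketch (not part of the original assignment cells): it prints the array shapes and the number of training samples per activity label.
In [ ]:
#Sketch: verify shapes and class balance of the loaded data
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("Training samples per activity label:")
activity_ids, counts = np.unique(y_train, return_counts=True)
for activity_id, count in zip(activity_ids, counts):
    print(int(activity_id), count)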
In [4]:
%matplotlib inline
#Standing
plt.plot([k*10000 for k in range(len(x_test[0]))],x_test[0])
plt.plot([k*10000 for k in range(len(x_test[1]))],x_test[1])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_test[2]))],x_test[2])
plt.plot([k*10000 for k in range(len(x_test[3]))],x_test[3])
plt.show()
plt.plot([k*10000 for k in range(len(x_train[0]))],x_train[0])
plt.plot([k*10000 for k in range(len(x_train[1]))],x_train[1])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_train[2]))],x_train[2])
plt.plot([k*10000 for k in range(len(x_train[3]))],x_train[3])
plt.show()
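This cell and the ones that follow repeat the same plotting pattern for every activity. A small helper like the hypothetical plot_rows below could express each of them in two lines; the k*10000 x-axis spacing is kept only to match the plots already shown.
In [ ]:
#Sketch of a helper for the repeated per-activity plots (plot_rows is an illustrative name)
def plot_rows(data, rows, title=None):
    xs = [k * 10000 for k in range(data.shape[1])]  # same x spacing as the cells above/below
    for r in rows:
        plt.plot(xs, data[r])
    if title:
        plt.title(title)
    plt.show()

#e.g. the "Standing" plots above would become:
plot_rows(x_test, [0, 1, 2, 3], title='Standing (test)')
plot_rows(x_train, [0, 1, 2, 3], title='Standing (train)')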
In [5]:
#Sitting
plt.plot([k*10000 for k in range(len(x_test[17]))],x_test[17])
plt.plot([k*10000 for k in range(len(x_test[18]))],x_test[18])
plt.plot([k*10000 for k in range(len(x_test[19]))],x_test[19])
plt.plot([k*10000 for k in range(len(x_test[20]))],x_test[20])
plt.show()
plt.plot([k*10000 for k in range(len(x_train[30]))],x_train[30])
plt.plot([k*10000 for k in range(len(x_train[31]))],x_train[31])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_train[32]))],x_train[32])
plt.plot([k*10000 for k in range(len(x_train[33]))],x_train[33])
plt.show()
In [6]:
#STAND_TO_LIE
plt.plot([k*10000 for k in range(len(x_test[47]))],x_test[47])
plt.plot([k*10000 for k in range(len(x_test[48]))],x_test[48])
plt.plot([k*10000 for k in range(len(x_test[49]))],x_test[49])
plt.plot([k*10000 for k in range(len(x_test[50]))],x_test[50])
plt.show()
plt.plot([k*10000 for k in range(len(x_train[3882]))],x_train[3882])
plt.plot([k*10000 for k in range(len(x_train[3883]))],x_train[3883])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_train[3884]))],x_train[3884])
plt.plot([k*10000 for k in range(len(x_train[4068]))],x_train[4068])
plt.show()
In [7]:
#LAYING
plt.plot([k*10000 for k in range(len(x_test[52]))],x_test[52])
plt.plot([k*10000 for k in range(len(x_test[53]))],x_test[53])
plt.plot([k*10000 for k in range(len(x_test[54]))],x_test[54])
plt.plot([k*10000 for k in range(len(x_test[55]))],x_test[55])
plt.show()
plt.plot([k*10000 for k in range(len(x_train[60]))],x_train[60])
plt.plot([k*10000 for k in range(len(x_train[61]))],x_train[61])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_train[62]))],x_train[62])
plt.plot([k*10000 for k in range(len(x_train[63]))],x_train[63])
plt.show()
In [8]:
#LIE_TO_SIT
plt.plot([k*10000 for k in range(len(x_test[63]))],x_test[63])
plt.plot([k*10000 for k in range(len(x_test[64]))],x_test[64])
plt.plot([k*10000 for k in range(len(x_test[65]))],x_test[65])
plt.plot([k*10000 for k in range(len(x_test[221]))],x_test[221])
plt.show()
plt.plot([k*10000 for k in range(len(x_train[3112]))],x_train[3112])
plt.plot([k*10000 for k in range(len(x_train[3500]))],x_train[3500])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_train[3501]))],x_train[3501])
plt.plot([k*10000 for k in range(len(x_train[3700]))],x_train[3700])
plt.show()
In [9]:
#SIT_TO_LIE
plt.plot([k*10000 for k in range(len(x_test[76]))],x_test[76])
plt.plot([k*10000 for k in range(len(x_test[77]))],x_test[77])
plt.plot([k*10000 for k in range(len(x_test[233]))],x_test[233])
plt.plot([k*10000 for k in range(len(x_test[401]))],x_test[401])
plt.show()
plt.plot([k*10000 for k in range(len(x_train[2595]))],x_train[2595])
plt.plot([k*10000 for k in range(len(x_train[2762]))],x_train[2762])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_train[2763]))],x_train[2763])
plt.plot([k*10000 for k in range(len(x_train[2945]))],x_train[2945])
plt.show()
In [10]:
#LIE_TO_STAND
plt.plot([k*10000 for k in range(len(x_test[92]))],x_test[92])
plt.plot([k*10000 for k in range(len(x_test[248]))],x_test[248])
plt.plot([k*10000 for k in range(len(x_test[417]))],x_test[417])
plt.plot([k*10000 for k in range(len(x_test[584]))],x_test[584])
plt.show()
plt.plot([k*10000 for k in range(len(x_train[4350]))],x_train[4350])
plt.plot([k*10000 for k in range(len(x_train[4351]))],x_train[4351])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_train[4742]))],x_train[4742])
plt.plot([k*10000 for k in range(len(x_train[4743]))],x_train[4743])
plt.show()
In [11]:
#WALKING
plt.plot([k*10000 for k in range(len(x_test[93]))],x_test[93])
plt.plot([k*10000 for k in range(len(x_test[94]))],x_test[94])
plt.plot([k*10000 for k in range(len(x_test[95]))],x_test[95])
plt.plot([k*10000 for k in range(len(x_test[96]))],x_test[96])
plt.show()
plt.plot([k*10000 for k in range(len(x_train[80]))],x_train[80])
plt.plot([k*10000 for k in range(len(x_train[81]))],x_train[81])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_train[82]))],x_train[82])
plt.plot([k*10000 for k in range(len(x_train[83]))],x_train[83])
plt.show()
In [12]:
#WALKING_UPSTAIRS
plt.plot([k*10000 for k in range(len(x_test[132]))],x_test[132])
plt.plot([k*10000 for k in range(len(x_test[133]))],x_test[133])
plt.plot([k*10000 for k in range(len(x_test[134]))],x_test[134])
plt.plot([k*10000 for k in range(len(x_test[135]))],x_test[135])
plt.show()
plt.plot([k*10000 for k in range(len(x_train[160]))],x_train[160])
plt.plot([k*10000 for k in range(len(x_train[161]))],x_train[161])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_train[162]))],x_train[162])
plt.plot([k*10000 for k in range(len(x_train[163]))],x_train[163])
plt.show()
In [13]:
#WALKING_DOWNSTAIRS
plt.plot([k*10000 for k in range(len(x_test[123]))],x_test[123])
plt.plot([k*10000 for k in range(len(x_test[124]))],x_test[124])
plt.plot([k*10000 for k in range(len(x_test[125]))],x_test[125])
plt.plot([k*10000 for k in range(len(x_test[126]))],x_test[126])
plt.show()
plt.plot([k*10000 for k in range(len(x_train[130]))],x_train[130])
plt.plot([k*10000 for k in range(len(x_train[131]))],x_train[131])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_train[132]))],x_train[132])
plt.plot([k*10000 for k in range(len(x_train[133]))],x_train[133])
plt.show()
In [14]:
#STAND_TO_SIT
plt.plot([k*10000 for k in range(len(x_test[183]))],x_test[183])
plt.plot([k*10000 for k in range(len(x_test[340]))],x_test[340])
plt.plot([k*10000 for k in range(len(x_test[341]))],x_test[341])
plt.plot([k*10000 for k in range(len(x_test[515]))],x_test[515])
plt.show()
plt.plot([k*10000 for k in range(len(x_train[2360]))],x_train[2360])
plt.plot([k*10000 for k in range(len(x_train[2361]))],x_train[2361])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_train[2362]))],x_train[2362])
plt.plot([k*10000 for k in range(len(x_train[2534]))],x_train[2534])
plt.show()
In [15]:
#SIT_TO_STAND
plt.plot([k*10000 for k in range(len(x_test[30]))],x_test[30])
plt.plot([k*10000 for k in range(len(x_test[195]))],x_test[195])
plt.plot([k*10000 for k in range(len(x_test[354]))],x_test[354])
plt.plot([k*10000 for k in range(len(x_test[530]))],x_test[530])
plt.show()
plt.plot([k*10000 for k in range(len(x_train[2595]))],x_train[2595])
plt.plot([k*10000 for k in range(len(x_train[2762]))],x_train[2762])
#plt.plot([k*10000 for k in range(len(x_test[329]))],x_test[329])
plt.plot([k*10000 for k in range(len(x_train[2763]))],x_train[2763])
plt.plot([k*10000 for k in range(len(x_train[2945]))],x_train[2945])
plt.show()
Precision (also called positive predictive value) is the fraction of retrieved instances that are relevant.
Recall (also known as sensitivity) is the fraction of relevant instances that are retrieved.
In statistical analysis of binary classification, the F1 score (also F-score or F-measure) is a measure of a test's accuracy.
The classification reports and confusion matrices below visualize these metrics for each class in the test data.
### Why the Confusion Matrix?
Accuracy is not a reliable metric for the real performance of a classifier, because it will yield misleading results if the data set is unbalanced (that is, when the number of samples in the different classes varies greatly).
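A toy illustration of why accuracy alone can mislead on unbalanced data (synthetic labels, not the HAPT test set): a classifier that always predicts the majority class reaches 95% accuracy here, yet its precision, recall and F1 score on the minority class are all zero.
In [ ]:
#Toy example: accuracy vs. precision/recall/F1 on an unbalanced label set
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
y_true = [0]*95 + [1]*5   # 95 majority-class samples, 5 minority-class samples
y_pred = [0]*100          # always predict the majority class
print("accuracy :", accuracy_score(y_true, y_pred))   # 0.95
print("precision:", precision_score(y_true, y_pred))  # 0.0 (sklearn warns: no positive predictions)
print("recall   :", recall_score(y_true, y_pred))     # 0.0
print("f1       :", f1_score(y_true, y_pred))         # 0.0 (F1 = 2*P*R/(P+R), taken as 0 when P = R = 0)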
In [16]:
#plt.cm.Blues
def plot_confusion_matrix(cm, title='Confusion matrix', cmap="Set3"):
    # Draw the confusion matrix as a heat map; uses the global target_names
    # (defined in the classifier cells below) for the axis tick labels.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=70)
    plt.yticks(tick_marks, target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [17]:
#Logistic Regression
from sklearn.metrics import confusion_matrix, classification_report,f1_score,accuracy_score
from sklearn.linear_model import LogisticRegression
labels=[1,2,3,4,5,6,7,8,9,10,11,12]
target_names=['WALKING','WALKING_UPSTAIRS','WALKING_DOWNSTAIRS','SITTING','STANDING','LAYING','STAND_TO_SIT','SIT_TO_STAND','SIT_TO_LIE','LIE_TO_SIT','STAND_TO_LIE','LIE_TO_STAND']
logReg=LogisticRegression()
logReg.fit(x_train,y_train)
yLogRegPredict=logReg.predict(x_test)
clf_rpt=classification_report(y_test,yLogRegPredict,target_names=target_names)
print(clf_rpt)
fLogReg=f1_score(y_test,yLogRegPredict,labels=labels,average=None)
print(fLogReg)
accuracyLogReg=accuracy_score(y_test,yLogRegPredict)
print(accuracyLogReg)
# Compute confusion matrix for Logistic Regression
cm = confusion_matrix(y_test,yLogRegPredict)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cm,title='Confusion matrix for Logistic Regression')
# Normalize the confusion matrix by row (i.e. by the number of samples
# in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix for Logistic Regression')
plt.show()
print('Confusion matrix, without normalization')
print(cm)
print('Normalized confusion matrix')
print(cm_normalized)
Random forests (or random decision forests) are an ensemble learning method for classification, regression and other tasks. They operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or the mean prediction (regression) of the individual trees. Random decision forests correct for decision trees' habit of overfitting to their training set.
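To see the "mode of the classes" idea directly, the following sketch (synthetic data, not the HAPT features) compares a forest's prediction with the majority vote of its individual trees. Note that scikit-learn actually averages the trees' class probabilities, which usually, but not always, agrees with the hard majority vote.
In [ ]:
#Toy sketch: a random forest prediction vs. the majority vote of its trees
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
X_toy, y_toy = make_classification(n_samples=200, n_features=10, random_state=0)
forest = RandomForestClassifier(n_estimators=25, random_state=0).fit(X_toy, y_toy)
sample = X_toy[:1]
tree_votes = np.array([tree.predict(sample)[0] for tree in forest.estimators_])
print("individual tree votes:", tree_votes)
print("majority vote        :", np.bincount(tree_votes.astype(int)).argmax())
print("forest prediction    :", forest.predict(sample)[0])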
In [18]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
labels=[1,2,3,4,5,6,7,8,9,10,11,12]
target_names=['WALKING','WALKING_UPSTAIRS','WALKING_DOWNSTAIRS','SITTING','STANDING','LAYING','STAND_TO_SIT','SIT_TO_STAND','SIT_TO_LIE','LIE_TO_SIT','STAND_TO_LIE','LIE_TO_STAND']
randForest=RandomForestClassifier()
randForest.fit(x_train,y_train)
yRandForestPredict=randForest.predict(x_test)
clf_rpt=classification_report(y_test,yRandForestPredict,target_names=target_names)
print(clf_rpt)
fRandForest=f1_score(y_test,yRandForestPredict,labels=labels,average=None)
print(fRandForest)
accuracyRandForest=accuracy_score(y_test,yRandForestPredict)
print(accuracyRandForest)
# Compute confusion matrix for Random Forest
cm = confusion_matrix(y_test,yRandForestPredict)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cm,title='Confusion matrix for Random Forest')
# Normalize the confusion matrix by row (i.e. by the number of samples
# in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix for Random Forest')
plt.show()
print('Confusion matrix, without normalization')
print(cm)
print('Normalized confusion matrix')
print(cm_normalized)
In [19]:
#KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
labels=[1,2,3,4,5,6,7,8,9,10,11,12]
target_names=['WALKING','WALKING_UPSTAIRS','WALKING_DOWNSTAIRS','SITTING','STANDING','LAYING','STAND_TO_SIT','SIT_TO_STAND','SIT_TO_LIE','LIE_TO_SIT','STAND_TO_LIE','LIE_TO_STAND']
knn=KNeighborsClassifier(n_neighbors=11)
knn.fit(x_train,y_train)
yKnnPredict=knn.predict(x_test)
clf_rpt=classification_report(y_test,yKnnPredict,target_names=target_names)
print(clf_rpt)
fKnn=f1_score(y_test,yKnnPredict,labels=labels,average=None)
print(fKnn)
accuracyKnn=accuracy_score(y_test,yKnnPredict)
print(accuracyKnn)
# Compute confusion matrix for K Nearest Neighbours
cm = confusion_matrix(y_test,yKnnPredict)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cm,title='Confusion matrix for K Nearest Neighbours')
# Normalize the confusion matrix by row (i.e. by the number of samples
# in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix for K Nearest Neighbours')
plt.show()
print('Confusion matrix, without normalization')
print(cm)
print('Normalized confusion matrix')
print(cm_normalized)
AdaBoost, short for "Adaptive Boosting", is a machine learning meta-algorithm. It can be used in conjunction with many other types of learning algorithms to improve their performance. The output of the other learning algorithms ('weak learners') is combined into a weighted sum that represents the final output of the boosted classifier. AdaBoost is adaptive in the sense that subsequent weak learners are tweaked in favor of those instances misclassified by previous classifiers. AdaBoost is sensitive to noisy data and outliers. In some problems, however, it can be less susceptible to overfitting than other learning algorithms. The individual learners can be weak, but as long as each one performs slightly better than random guessing (e.g., its error rate is below 0.5 for binary classification), the final model can be proven to converge to a strong learner.
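The "weighted sum of weak learners" is visible directly in scikit-learn's API: each weak learner receives a weight (estimator_weights_), and staged_predict shows the ensemble improving as more weighted stumps are added. The sketch below uses synthetic data, not the HAPT features.
In [ ]:
#Toy sketch: AdaBoost as a weighted combination of weak (depth-1) learners
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
X_toy, y_toy = make_classification(n_samples=500, n_features=10, random_state=0)
boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                           n_estimators=50, learning_rate=1).fit(X_toy, y_toy)
print("weights of the first 5 weak learners:", boost.estimator_weights_[:5])
#Training accuracy after 1, 10, 25 and 50 weighted weak learners:
for i, y_stage in enumerate(boost.staged_predict(X_toy), start=1):
    if i in (1, 10, 25, 50):
        print(i, "learners -> train accuracy:", accuracy_score(y_toy, y_stage))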
In [21]:
#AdaBoostClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
labels=[1,2,3,4,5,6,7,8,9,10,11,12]
target_names=['WALKING','WALKING_UPSTAIRS','WALKING_DOWNSTAIRS','SITTING','STANDING','LAYING','STAND_TO_SIT','SIT_TO_STAND','SIT_TO_LIE','LIE_TO_SIT','STAND_TO_LIE','LIE_TO_STAND']
adaBoost=AdaBoostClassifier(
DecisionTreeClassifier(max_depth=2),
n_estimators=100,
learning_rate=1)
adaBoost.fit(x_train,y_train)
yAdaBoostPredict=adaBoost.predict(x_test)
clf_rpt=classification_report(y_test,yAdaBoostPredict,target_names=target_names)
print(clf_rpt)
fAdaBoost=f1_score(y_test,yAdaBoostPredict,labels=labels,average=None)
print(fAdaBoost)
accuracyAdaBoost=accuracy_score(y_test,yAdaBoostPredict)
print(accuracyAdaBoost)
# Compute confusion matrix for Adaptive Boosting
cm = confusion_matrix(y_test,yAdaBoostPredict)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cm,title='Confusion matrix for Adaptive Boosting')
# Normalize the confusion matrix by row (i.e. by the number of samples
# in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix for Adaptive Boosting')
plt.show()
print('Confusion matrix, without normalization')
print(cm)
print('Normalized confusion matrix')
print(cm_normalized)
In [46]:
#Grouped bar chart comparing per-class F-scores of the four classifiers
n_activities=12
fig, ax = plt.subplots()
index = np.arange(n_activities)
bar_width = 0.15
opacity = 0.4
error_config = {'ecolor': '0.3'}
rects1 = plt.bar(index + bar_width, fLogReg, bar_width,
                 alpha=opacity,
                 color='b',
                 label='Logistic Regression')
rects2 = plt.bar(index + 2*bar_width, fRandForest, bar_width,
                 alpha=opacity,
                 color='r',
                 label='Random Forest')
rects3 = plt.bar(index + 3*bar_width, fAdaBoost, bar_width,
                 alpha=opacity,
                 color='g',
                 label='Adaptive Boosting')
rects4 = plt.bar(index + 4*bar_width, fKnn, bar_width,
                 alpha=opacity,
                 color='y',
                 label='K Nearest Neighbours')
plt.xlabel('Activity')
plt.ylabel('F-Measure')
plt.title('F-Scores for Classifiers')
plt.xticks(index + 2.5*bar_width, ('WALKING','WALKING_UPSTAIRS','WALKING_DOWNSTAIRS','SITTING','STANDING','LAYING','STAND_TO_SIT','SIT_TO_STAND','SIT_TO_LIE','LIE_TO_SIT','STAND_TO_LIE','LIE_TO_STAND'), rotation=70)  # center activity labels under each group of bars
plt.tight_layout()
plt.legend(bbox_to_anchor=(1.8,1.2))
plt.show()