https://www.biotrakhealth.com/
In [1]:
from docx import Document
from docx.shared import Inches
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid.parasite_axes import SubplotHost
from mpl_toolkits.axes_grid1 import host_subplot
import mpl_toolkits.axisartist as AA
import seaborn as sns
import requests
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_curve
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
In [2]:
# Read in data
df_users = pd.read_json('users_02_26_2017.txt')
df_sessions = pd.read_json('sessions_02_26_2017.txt')
print df_users.columns
print df_sessions.columns
In [3]:
print len(df_users), " users are set up"
print len(df_sessions), "sessions are uploaded"
In [4]:
# Read Ratings File
df_ratings = pd.read_csv('data/ratings.csv')
print "Number of ratings is ",len(df_ratings)
In [5]:
print "Rated Bad:", df_ratings[df_ratings['rating']=='Bad'].count()
In [6]:
print "Rated Good:",df_ratings[df_ratings['rating']=='Good'].count()
In [7]:
df_sessions = df_sessions.ix[:356,:]
After a review session the project firmware engineer and I graded 356 sessions. There were 237 bad and 119 good ratings. a
In addition to coming up with the ratings we jointly decided on some heuristics for determing a good and bad sessions
If the session was less than 60 seconds the session is bad. It is possible that a user may have an properly operating device over the 60 seconds, but we consider that not an effective use as there is not enough time to use the sessions as designed.
If the session, after 60 seconds, goes to 0 (Min) or 4095 (Max), the session is bad. Some of the sessions in this category show a significant amount of good data, but the fact that the user had problems idnicate someting that needs to be addressed.
Low signal values are not a problem.
Signal may be smooth with gradual peaks or there may be sharp peaks. The peaks come from activities such as jaw clenching that may be part of a session or may simply be the user exploring the operation of the device.
In [8]:
# add ratings to session data
df_sessions['rating'] = df_ratings['rating']
In [9]:
print df_sessions.columns
A litte data exploration
In [10]:
df_sessions.Session_name.unique()
Out[10]:
In [11]:
df_sessions.Session_type.unique()
Out[11]:
In [12]:
df_groups = df_sessions.groupby('Session_name')
In [13]:
df_groups['Device_info_id'].count()
Out[13]:
In [14]:
df_results = df_sessions[['Session_name','userid']]
In [15]:
# Score is True if session is bad
# we are looking for bad sessions
df_results['score'] = np.where(df_sessions['rating']=='Good', False, True)
We have to change format for the time series data. It starts as a string of comma delimted values. The last value is uploaded incorrectly in some sessions so we discard it.
In [16]:
# Pull session data dictionary info into it's own clumns
dfNew = df_sessions.join(pd.DataFrame(df_sessions["session_data"].to_dict()).T)
In [17]:
dfNew.columns
Out[17]:
In [18]:
# Start calculation of features
# Convert session data from string to array of ints
# -- Drop the last entry as there was a bug that put in a bad last entry
temp = dfNew['average_data'].str.replace(" ","").str.split(',').str[:-1].map(lambda xx: np.array([int(yy) for yy in xx]))
In [19]:
# Some of the time series are empty, if so insert an entry of one sample at 0
# Signal is inverted -- need to change it
temp = temp.map(lambda x: np.array( [0] if not len(x) else x ))
temp = temp.map(lambda x: 4095 - x)
In [20]:
# Get the mean
temp2 = temp.map(lambda x: x.mean())
In [21]:
temp2.fillna(0,inplace=True)
df_results['avg_data_mean'] = temp2
In [22]:
# Get the max
t_max = temp.map(lambda x: x.max())
In [23]:
df_results['avg_data_max'] = t_max
In [24]:
# Get the min
df_results['avg_data_min'] = temp.map(lambda x: x.min())
In [25]:
# Get the length
df_results['avg_data_len'] = temp.map(lambda x: len(x))
In [26]:
# Get the standard deviation
df_results['avg_data_std'] = temp.map(lambda x: x.std())
In [27]:
df_results.columns
Out[27]:
In [28]:
# Load in session num
df_results['session_num'] = df_results.index
In [29]:
# Add heuristics
df_results['min_len'] = df_results['avg_data_len']>3600
df_results['pegged_low'] = [False if (len(x)<3600 or min(x[3600:])>0) else True for x in temp]
df_results['pegged_high'] = [False if (len(x)<3600 or max(x[3600:])<4095) else True for x in temp]
In [30]:
df_results.columns
Out[30]:
In [31]:
df_key_columns = [x for x in list(df_results.columns) if x not in ['Session_name','userid','score']]
print df_key_columns
In [32]:
df_res_save = df_results.copy()
In [33]:
y = df_results['score'].as_matrix()
X_df = df_results[df_key_columns]
X = X_df.as_matrix()
In [34]:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)
In [35]:
df_results.describe()
Out[35]:
In [36]:
clf = RandomForestClassifier(n_estimators=10, random_state=42).fit(X_train,y_train)
In [37]:
scores = cross_val_score(clf, X_train, y_train, cv=10)
In [38]:
print scores
print scores.mean()
In [39]:
pred = clf.predict(X_train)
In [40]:
print clf.feature_importances_
for i in range(len(clf.feature_importances_)):
print X_df.columns[i], clf.feature_importances_[i]
clf_save = clf
clf_name_save = X_df.columns
X_save = X_df.as_matrix()
In [41]:
pred_test = clf.predict(X_test)
In [42]:
clf.score(X_test,y_test)
Out[42]:
In [43]:
clf.get_params
Out[43]:
In [44]:
y_prob = clf.predict_proba(X_test)[:,1:]
In [45]:
missed = np.where([pred_test != y_test])[1]
In [46]:
for idx in missed:
print df_results.iloc[idx]['session_num']
In [47]:
for i in range(len(missed)):
print i, clf.predict_proba(X_test)[i]
In [ ]:
In [48]:
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
In [49]:
print precision, recall, thresholds
In [50]:
df = pd.DataFrame()
In [51]:
df['precision'] = precision
In [52]:
print len(precision), len(recall), len(thresholds)
thresholds = np.append(thresholds,1.0)
print len(precision), len(recall), len(thresholds)
In [53]:
df['recall'] = recall
df['thresholds'] = thresholds
In [54]:
%matplotlib inline
df.plot()
Out[54]:
In [55]:
byScore = df_results.groupby('score').count()
In [56]:
print byScore['avg_data_min']
In [57]:
## Add in the cycles and largest swing features
def count_crosses(s, pts=10, intv=600):
'''
Looks at series
for each 'intv' points
- count crossings of 'pts' lookback moving average
INPUTS:
s = list of measurements
pts = how many pts to collect for moving average
intv = interval to measure crossing (600 = 10 sec)
OUTPUTS
crossings / intv = average crossing per interval
biggest_move = on a crossing biggest move
'''
if len(s) < intv:
return 0, 0
if pts >= intv:
return 0,0
s = np.array(s)
crossing_counts = [0]
index = pts
max_swing = 0
up = True # True for last cross up, False for down
while (index + intv < len(s)):
#print index
interval_crossings = 0
for i in range(intv):
last_n = s[index-pts:index]
avg = last_n.mean()
if up and s[index] < avg:
up = False
interval_crossings += 1
swing = abs(s[index]-s[index-1])
if swing > max_swing:
max_swing = swing
if not up and s[index] > avg:
up = True
interval_crossings += 1
swing = abs(s[index]-s[index-1])
if swing > max_swing:
max_swing = swing
index += 1
crossing_counts.append(interval_crossings)
time_sec = intv/60.0
return np.array(crossing_counts).mean()/(2*time_sec), max_swing
In [58]:
df_results3 = df_res_save.copy()
cycles = []
swings = []
look_back = [1]
#look_back = [3]
for look in look_back:
c = []
s = []
for idx, elem in enumerate(temp):
a,b = count_crosses(elem, pts=look)
c.append(a)
s.append(b)
cycles.append(c)
swings.append(s)
df_results3['cycles_{0}'.format(look)] = c
df_results3['swings_{0}'.format(look)] = s
#plt.hist(cycles)
#plt.hist(swings)
In [59]:
#df_results3
In [60]:
df4 = df_results.copy()
df4['cycles_1'] = df_results3['cycles_1']
df4['swings_1'] = df_results3['swings_1']
In [ ]:
In [61]:
sns_plot = sns.pairplot(df4)
sns_plot.savefig("sns-pairplot.png")
In [62]:
y2 = df4['score'].as_matrix()
X_df2 = df4[['avg_data_mean',
'avg_data_max', 'avg_data_min', 'avg_data_std', 'avg_data_len',
'cycles_1','swings_1','min_len','pegged_low','pegged_high']]
X2 = X_df2.as_matrix()
In [63]:
X_train, X_test, y_train, y_test = train_test_split(
X2, y2, test_size=0.33, random_state=42)
In [64]:
clf = RandomForestClassifier(n_estimators=10, random_state=42).fit(X_train,y_train)
clf = clf.fit(X_train, y_train)
In [65]:
clf_save = clf
clf_name_save = X_df2.columns
X_save = X_df2.as_matrix()
In [66]:
scores = cross_val_score(clf, X_train, y_train, cv=10)
In [67]:
print scores
print np.average(scores)
In [68]:
print clf.feature_importances_
for i in range(len(clf.feature_importances_)):
print X_df2.columns[i], clf.feature_importances_[i]
In [ ]:
In [69]:
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
print precision, recall, thresholds
In [70]:
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.set_ylabel('Precision and Recall',fontsize=20)
x1 = np.arange(0,10)/5.0
#ax.plot(x1,df['precision'])
labels = [item.get_text() for item in ax.get_xticklabels()]
labels[1] = 'Testing'
ax.plot(df['precision'])
ax.plot(df['recall'])
#ax.plot(df['thresholds'])
ax.set_xlim(0,10)
#labels =
ax.set_xticklabels(x1)
ax.set_title('Precision and Recall Versus Probability Threshold',fontsize=24,y=1.08)
ax.set_xlabel('Probability Threshold',fontsize=20)
#plt.ylabel('Density',fontsize=18)
#plt.title('Average Frequency in Tension Measurement', fontsize=24)
ax.tick_params(axis='both',labelsize=16)
plt.legend(fontsize=20,loc=0)
plt.show()
plt.savefig('prec-recall.png',bbox_inches='tight')
In [71]:
df4_t = df4[df4['score']==True]
In [72]:
df4_f = df4[df4['score']==False]
In [73]:
# Set up the matplotlib figure
sns.distplot( df4_t['cycles_1'], label = 'Bad')
sns.distplot( df4_f['cycles_1'], label = 'Good')
plt.xlabel('Cycles Per Second',fontsize=20)
plt.ylabel('Density',fontsize=20)
plt.title('Average Frequency in Tension Measurement', fontsize=24, y=1.10)
plt.tick_params(axis='both',labelsize=16)
plt.legend(fontsize=20)
plt.savefig('feat-freq.png',bbox_inches='tight')
In [74]:
sns.distplot( df4_t['avg_data_mean'],label = 'True')
sns.distplot( df4_f['avg_data_mean'], label = 'False')
plt.legend()
Out[74]:
In [75]:
sns.distplot( df4_t['avg_data_len']/60.0,label = 'Bad')
sns.distplot( df4_f['avg_data_len']/60.0, label = 'Good')
plt.xlabel('Seconds',fontsize=20)
plt.ylabel('Density',fontsize=20)
plt.title('Length of Session', fontsize=24, y=1.08)
plt.tick_params(axis='both',labelsize=16)
plt.legend(fontsize=20)
plt.savefig('feat-session-length.png',bbox_inches='tight')
In [76]:
sns.distplot( df4_t['avg_data_max'],label = 'Bad')
sns.distplot( df4_f['avg_data_max'], label = 'Good')
plt.legend(loc=0)
Out[76]:
In [77]:
sns.distplot( df4_t['avg_data_min'],label = 'Bad')
sns.distplot( df4_f['avg_data_min'], label = 'Good')
plt.legend()
Out[77]:
In [78]:
sns.distplot( df4_t['avg_data_std']/4095.0,label = 'Bad')
sns.distplot( df4_f['avg_data_std']/4095.0, label = 'Good')
plt.xlabel('Std in 0-10 Tension Scale',fontsize=20)
plt.ylabel('Density',fontsize=20)
plt.title('Standard Deviation in Tension Measurement', fontsize=24,y=1.08)
plt.tick_params(axis='both',labelsize=16)
plt.legend(fontsize=20)
plt.savefig('feat-std.png',bbox_inches='tight')
In [79]:
good = [len(np.where(df4_f['avg_data_std']<=60)[0]),
len(np.where(df4_f['avg_data_std']>60)[0]) ]
bad = [len(np.where(df4_t['avg_data_std']<=60)[0]),
len(np.where(df4_t['avg_data_std']>60)[0]) ]
In [80]:
fig, ax = plt.subplots()
index = np.arange(2)
bar_width = 0.35
opacity = 0.4
error_config = {'ecolor': '0.3'}
rects1 = plt.bar(index,
bad,
bar_width,
alpha=opacity,
color='b',
label='Bad')
rects2 = plt.bar(index + bar_width,
good,
bar_width,
alpha=opacity,
color='g',
label='Good')
plt.xlabel('Session Length',fontsize=20)
plt.ylabel('Number of Sessions',fontsize=20)
plt.title('60 Second Minimum Length',fontsize=24,y=1.08)
plt.xticks(index + bar_width , ('Less than 60 seconds', 'At least 60 seconds'))
plt.legend(fontsize=16, loc=0)
plt.tick_params(axis='both',labelsize=16)
plt.tight_layout()
plt.show()
plt.savefig('feat-60-sec_min.png',bbox_inches='tight')
In [81]:
sns.distplot( df4_t['pegged_low'],kde=False, label = 'Bad')
sns.distplot( df4_f['pegged_low'],kde=False, label = 'Good')
plt.legend()
Out[81]:
In [82]:
sns.distplot( df4_t['pegged_high'],kde=False, label = 'Bad')
sns.distplot( df4_f['pegged_high'], kde=False, label = 'Good')
plt.legend()
Out[82]:
In [83]:
# Set up the matplotlib figure
df4.columns
Out[83]:
In [84]:
clf = RandomForestClassifier(n_estimators=10, random_state=42).fit(X_train,y_train)
scores = cross_val_score(clf, X_train, y_train, cv=10)
print scores
print np.average(scores)
In [85]:
joblib.dump(clf, 'brtakrf_class.pkl')
Out[85]:
In [ ]:
In [86]:
clfab = AdaBoostClassifier(base_estimator=None,
n_estimators=50, learning_rate=1.0, algorithm='SAMME.R',
random_state=42).fit(X_train,y_train)
In [87]:
scores_ab = cross_val_score(clfab, X_train, y_train, cv=10)
print scores_ab
print np.average(scores_ab)
In [88]:
clfgb = GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
n_estimators=100, subsample=1.0,
criterion='friedman_mse', min_samples_split=2,
min_samples_leaf=1, min_weight_fraction_leaf=0.0,
max_depth=3, min_impurity_split=1e-07, init=None,
random_state=42, max_features=None,
verbose=0, max_leaf_nodes=None,
warm_start=False, presort='auto').fit(X_train,y_train)
In [89]:
scores_gb = cross_val_score(clfgb, X_train, y_train, cv=10)
print scores_gb
print np.average(scores_gb)
In [90]:
scores_rf = cross_val_score(clf, X_train, y_train, cv=10)
print scores_rf
In [91]:
RF_pred = clf.predict(X2)
In [92]:
df4['RF_pred'] = RF_pred
In [93]:
y2 = df4['score'].as_matrix()
X_df2 = df4[['avg_data_mean',
'avg_data_max', 'avg_data_min', 'avg_data_std', 'avg_data_len',
'cycles_1','swings_1','min_len','pegged_low','pegged_high','RF_pred']]
X2 = X_df2.as_matrix()
In [94]:
X_train, X_test, y_train, y_test = train_test_split(
X2, y2, test_size=0.33, random_state=42)
In [95]:
clfab = AdaBoostClassifier(base_estimator=None, n_estimators=50,
learning_rate=1.0, algorithm='SAMME.R', random_state=42).fit(X_train,y_train)
scores_ab = cross_val_score(clfab, X_train, y_train, cv=10)
print scores_ab
print np.average(scores_ab)
In [96]:
scores_ab = cross_val_score(clfab, X_test, y_test, cv=10)
print scores_ab
print np.average(scores_ab)
In [97]:
clfgb = GradientBoostingClassifier(loss='deviance',
learning_rate=0.1, n_estimators=100, subsample=1.0,
criterion='friedman_mse', min_samples_split=2,
min_samples_leaf=1, min_weight_fraction_leaf=0.0,
max_depth=3, min_impurity_split=1e-07, init=None,
random_state=42, max_features=None, verbose=0,
max_leaf_nodes=None, warm_start=False, presort='auto').fit(X_train,y_train)
scores_gb = cross_val_score(clfgb, X_train, y_train, cv=10)
print scores_gb
print np.average(scores_gb)
In [98]:
scores_gb = cross_val_score(clfgb, X_test, y_test, cv=10)
print scores_gb
print np.average(scores_gb)
In [99]:
joblib.dump(clf, 'brtakgb_class.pkl')
Out[99]:
In [100]:
trial_users = pd.read_csv('TrialUsers.csv', header=None)
In [101]:
df_trial = df4[df_results['userid'].isin(trial_users[0]) |df_results['userid'].str.contains('pilot') ]
In [102]:
y_trial = df_trial['score'].as_matrix()
X_trial_df = df_trial[['avg_data_mean',
'avg_data_max', 'avg_data_min', 'avg_data_std', 'avg_data_len',
'cycles_1','swings_1','min_len','pegged_low','pegged_high']]
In [103]:
X_trial = X_trial_df.as_matrix()
In [104]:
trial_score = clf.score(X_trial, y_trial)
print trial_score
In [105]:
X_trial_df2= X_trial_df.copy()
X_trial_df2['RF_pred'] = clf.predict(X_trial)
X_trial2 = X_trial_df2.as_matrix()
X_train, X_test, y_train, y_test = train_test_split(
X_trial2, y_trial, test_size=0.33, random_state=42)
In [106]:
trial_score_gb = clfgb.score(X_trial2, y_trial)
print trial_score_gb
In [107]:
trial_score_ab = clfab.score(X_trial2, y_trial)
print trial_score_ab
In [108]:
clfab.feature_importances_
Out[108]:
In [109]:
len(X_trial[0])
Out[109]:
In [110]:
print df_trial.columns
len(df_trial.columns)
Out[110]:
In [111]:
print X_df.columns
print X_df2.columns
print X_trial_df.columns
print X_trial_df2.columns
In [112]:
print 'Bad=',len([x for x in y_trial if x==True])
print 'Good=',len([x for x in y_trial if x==False])
In [113]:
y_pred2 = clfgb.predict(X_trial2)
In [114]:
print 'Bad=',len([x for x in y_pred2 if x==True])
print 'Good=',len([x for x in y_pred2 if x==False])
In [115]:
right = y_trial == y_pred2
In [116]:
print 'Correct=',len([x for x in right if x==True])
print 'Incorrect=',len([x for x in right if x==False])
In [117]:
y_pred2 = clf.predict(X_trial)
right = y_trial == y_pred2
print 'Correct=',len([x for x in right if x==True])
print 'Incorrect=',len([x for x in right if x==False])
In [118]:
#df_trial[df_trial['score']!=df_trial['RF_pred']]
In [119]:
# Compute the correlation matrix
corr = df_results.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
square=True,
linewidths=.5, cbar_kws={"shrink": .5}, ax=ax, annot=True, annot_kws={"size": 9})
plt.xticks(rotation=45)
plt.yticks(rotation=0)
Out[119]:
In [120]:
# Compute the correlation matrix
corr = df4.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
square=True,
linewidths=.5, cbar_kws={"shrink": .5}, ax=ax, annot=True, annot_kws={"size": 9})
plt.xticks(rotation=45)
plt.yticks(rotation=0)
Out[120]:
In [121]:
# Compute the correlation matrix
df9=df4.drop('RF_pred',axis=1)
corr = df9.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
square=True,
linewidths=.5, cbar_kws={"shrink": .5}, ax=ax, annot=True, annot_kws={"size": 9})
plt.xticks(rotation=45)
plt.yticks(rotation=0)
Out[121]:
In [122]:
forest = clf_save
names = clf_name_save
X = X_save
importances = forest.feature_importances_
print len(importances), len(forest.feature_importances_)
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
print("%d. feature %s (%f)" % (f + 1, names[indices[f]], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), names[indices], rotation=45)
plt.xlim([-1, X.shape[1]])
plt.show()
plt.savefig('feat-imp.png',bbox_inches='tight')