In [1]:
import pandas as pd
In [2]:
# Read the CSV at ../3-data/phmrc_cleaned.csv (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv('../3-data/phmrc_cleaned.csv')
# Preview the first rows as the cell output.
df.head()
Out[2]:
In [3]:
import numpy as np
In [4]:
# Feature matrix: every column whose name starts with "s<digits>", "age" or
# "sex" (the regex is anchored at the start only), with missing values
# replaced by 0.
X = np.array(
    df.filter(regex='^(s[0-9]+|age|sex)').fillna(0)
)

# Target vector: the gs_text34 column, one label per row of X.
y = np.array(df['gs_text34'])
In [5]:
import sklearn.naive_bayes
In [6]:
# Bernoulli naive Bayes classifier constructed with its default arguments.
# This single instance is re-fit on each training fold by the
# cross-validation code in the following cells.
clf = sklearn.naive_bayes.BernoulliNB()
In [7]:
import sklearn.model_selection
In [8]:
# Shuffled 10-fold cross-validation splitter; the fixed random_state makes
# the fold assignment reproducible across runs.
# NOTE: sklearn.model_selection.KFold takes `n_splits`. The `n_folds`
# keyword belonged to the old sklearn.cross_validation.KFold and raises
# TypeError when passed here.
cv = sklearn.model_selection.KFold(n_splits=10, shuffle=True, random_state=123456)
In [9]:
# One pass of cross-validation: `train` / `test` are the row-index arrays
# yielded by the splitter for each fold.
for train, test in cv.split(X, y):
    # Re-fit the classifier on the training rows, then predict the held-out rows.
    y_pred = clf.fit(X[train], y[train]).predict(X[test])
    # Accuracy = fraction of held-out rows whose predicted label matches.
    acc = (y_pred == y[test]).mean()
    print(acc)
In [10]:
# refactor this into a function
def measure_acc(rep, n_splits=10, base_seed=123456):
    """Return the per-fold accuracies of one shuffled K-fold cross-validation.

    Reads the notebook-level ``X``, ``y`` and ``clf``; ``clf`` is re-fit in
    place on every fold, so after the call it holds the model from the last
    fold.

    Parameters
    ----------
    rep : int
        Repetition index. It is added to ``base_seed`` so that each
        repetition uses a different, but reproducible, shuffle.
    n_splits : int, default 10
        Number of folds.
    base_seed : int, default 123456
        Seed offset for the splitter's ``random_state``.

    Returns
    -------
    list
        One accuracy value (fraction of correct predictions) per fold.
    """
    # KFold in sklearn.model_selection takes `n_splits`; the former
    # `n_folds` keyword is rejected with a TypeError.
    cv = sklearn.model_selection.KFold(
        n_splits=n_splits, shuffle=True, random_state=base_seed + rep
    )
    acc_list = []
    for train, test in cv.split(X, y):
        clf.fit(X[train], y[train])
        y_pred = clf.predict(X[test])
        acc_list.append(np.mean(y_pred == y[test]))
    return acc_list


# Show the fold accuracies for the first repetition as the cell output.
measure_acc(rep=0)
Out[10]:
In [11]:
%%time
# repeat it 10 times
acc_list = []
for rep in range(10):
acc_list += measure_acc(rep)
In [12]:
# Summary statistics of the pooled fold accuracies, reporting the 2.5th and
# 97.5th percentiles (describe() also includes the median).
(
    pd.Series(acc_list)
    .describe(percentiles=[0.025, 0.975])
)
Out[12]: