# We won't work through this notebook

# We won't have time, but I've included it in case you want to see exactly how I implement my population-level quality metric.

import numpy as np, pandas as pd

# Let's put the CSMF Accuracy calculation right at the top

def measure_prediction_quality(csmf_pred, y_test):
    """Calculate population-level prediction quality (CSMF Accuracy)

    CSMF accuracy rescales the total absolute error between the true and
    predicted cause-specific mortality fractions by the largest total
    error possible for the true distribution::

        1 - sum_j |csmf_true[j] - csmf_pred[j]| / (2 * (1 - min_j csmf_true[j]))

    A prediction equal to the true distribution scores 1; a prediction
    that puts all mass on the rarest true cause scores 0.

    Parameters
    ----------
    csmf_pred : pd.Series, predicted distribution of causes
        (indexed by cause; anything ``pd.Series`` accepts, e.g. a dict,
        also works)
    y_test : array-like, labels for test dataset

    Results
    -------
    csmf_acc : float
        Not finite when only a single cause appears in both ``y_test``
        and ``csmf_pred``, because the normalizing term is then zero.

    Raises
    ------
    ValueError
        If ``y_test`` is empty (the true distribution is undefined).
    """
    if len(y_test) == 0:
        raise ValueError('y_test must contain at least one label')

    csmf_pred = pd.Series(csmf_pred, dtype=float)
    csmf_true = pd.Series(y_test).value_counts() / float(len(y_test))

    # Align both distributions on the union of causes so that a cause
    # missing from one side counts as a fraction of 0 instead of turning
    # into NaN and silently dropping out of the error sum.
    causes = csmf_true.index.union(csmf_pred.index)
    csmf_true = csmf_true.reindex(causes, fill_value=0.0)
    csmf_pred = csmf_pred.reindex(causes, fill_value=0.0)

    total_abs_err = np.abs(csmf_true - csmf_pred).sum()
    # largest possible total error: all mass on the rarest true cause
    max_abs_err = 2 * (1 - csmf_true.min())
    csmf_acc = 1 - total_abs_err / max_abs_err

    return float(csmf_acc)

# How can I test this?

# Best case: the predicted fractions equal the empirical fractions of y_test
csmf_pred = pd.Series({'cause_1': .5, 'cause_2': .5})
y_test = ['cause_1', 'cause_2']
measure_prediction_quality(csmf_pred, y_test)

# Badly wrong case: all predicted mass is on cause_2, but 1000 of the 1001
# test labels are cause_1
csmf_pred = pd.Series({'cause_1': 0., 'cause_2': 1.})
y_test = ['cause_1']*1000 + ['cause_2']
measure_prediction_quality(csmf_pred, y_test)

# Things we don't have time for

# An approach to doing the cross-validation truly out of sample:

# Maps a module key to its table of validation data (expected to be
# DataFrame-like); empty here -- presumably populated elsewhere, TODO confirm.
val = {}


def get_data(module):
    """Extract feature matrix, cause labels, and sites for one module

    Parameters
    ----------
    module : key into the module-level ``val`` dict

    Results
    -------
    X : np.array, the columns of ``val[module]`` whose names match the
        regex ``(^s[0-9]+|age|sex)``, with missing values replaced by 0
    y : np.array, the ``gs_text34`` column
    site : np.array, the ``site`` column
    """
    df = val[module]

    X = np.array(df.filter(regex='(^s[0-9]+|age|sex)').fillna(0))
    y = np.array(df.gs_text34)
    site = np.array(df.site)

    return X, y, site

# NOTE(review): `module` is not defined in the visible part of this file --
# presumably a key of `val` set in an earlier cell; confirm before running.
X, y, site = get_data(module)
# bare expression: inspects the feature matrix dimensions when run as a
# notebook cell; the value is discarded when run as a plain script
X.shape

def my_resample(X, y, N2, csmf_new):
    """Randomly resample X and y so that resampled cause distribution follows
    csmf_new and there are N2 samples total

    Draws from the global ``np.random`` state, so seed it beforehand for
    reproducible results.

    Parameters
    ----------
    X : array-like, feature vectors
    y : array-like, corresponding labels
    N2 : int, number of samples in resampled results
    csmf_new : pd.Series, distribution of resampled data

    Results
    -------
    X_new : array-like, resampled feature vectors
    y_new : array-like, corresponding resampled labels

    Raises
    ------
    AssertionError
        If X and y differ in length, or if a cause in ``csmf_new`` has
        no examples in ``y``.
    """
    N, I = X.shape
    assert len(y) == N, 'X and y must have same length'

    causes = csmf_new.index

    # generate count of examples for each cause according to csmf_new
    cnt_new = np.random.multinomial(N2, csmf_new)

    # build the resampled labels: each cause repeated by its drawn count
    y_new = np.array([cause
                      for cnt, cause in zip(cnt_new, causes)
                      for _ in range(cnt)])

    # resample rows of X appropriately
    X_new = np.zeros((len(y_new), I))
    for j in causes:
        # np.where returns a 1-tuple of index arrays; the trailing comma
        # on the left-hand side unpacks it
        new_rows, = np.where(y_new == j)
        candidate_rows, = np.where(y == j)

        assert len(candidate_rows) > 0, 'must have examples of each resampled cause'
        # sample with replacement from the original rows of this cause
        old_rows = np.random.choice(candidate_rows, size=len(new_rows), replace=True)
        X_new[new_rows] = X[old_rows]
    return X_new, y_new

def random_allocation(X_train, y_train):
    """Make predictions by random allocation

    Baseline predictor: ignores the features and assigns every test
    example equal probability for each distinct label seen in y_train.

    Parameters
    ----------
    X_train : array-like, feature vectors (not used)
    y_train : array-like, labels; only the distinct values are used

    Results
    -------
    clf : sklearn.base.BaseEstimator with an added ``my_predict``
        attribute; ``clf.my_predict(X_test)`` returns a pd.Series of
        predicted cause fractions indexed by the distinct labels
    """
    # imported here so the function does not rely on another module
    # having pulled in sklearn.base as a side effect
    import sklearn.base

    causes = np.unique(y_train)
    J = len(causes)  # must be an int: it is used as an array dimension

    clf = sklearn.base.BaseEstimator()

    def my_predict(X_test):
        N = len(X_test)

        # every example gets probability 1/J for each cause
        y_pred = np.ones((N, J)) / J
        csmf_pred = pd.Series(y_pred.sum(axis=0), index=causes) / N
        return csmf_pred
    clf.my_predict = my_predict
    return clf

def my_key(module, clf):
    """Build the string label '<module>-<clf>' used to group results

    Both arguments are rendered with ``str.format``, so ``clf`` appears
    as its string representation.
    """
    return '{}-{}'.format(module, clf)

import sklearn.model_selection

# One dict per replicate, {'csmf_acc': ..., 'key': ...}; shared by every
# call to measure_csmf_acc so the summary table covers all predictors so far.
results = []


def measure_csmf_acc(my_fit_predictor, replicates=10):
    """Measure out-of-sample CSMF accuracy of a predictor

    For each stratified train/test split, the training set is resampled
    to a flat cause distribution, the predictor is fit, and the test set
    is resampled to a random (Dirichlet) cause distribution before the
    predicted cause fractions are scored.

    Reads the module-level names ``module`` (passed to ``get_data``) and
    ``results``, and reseeds the global ``np.random`` state.

    Parameters
    ----------
    my_fit_predictor : function that takes X,y returns clf object with my_predict method
        clf.my_predict takes X_test, return csmf_pred
    replicates : int, number of train/test splits to evaluate

    Results
    -------
    appends one entry per replicate to the module-level ``results`` list,
    returns the unstacked ``describe()`` summary of csmf_acc grouped by
    key, computed over everything accumulated in ``results``
    """
    X, y, _ = get_data(module)

    # the cause list and the flat training distribution do not change
    # between replicates
    causes = np.unique(y)
    J = len(causes)
    csmf_flat = pd.Series(np.ones(J)/J, index=causes)

    np.random.seed(12345)  # set seed for reproducibility
    # sklearn.model_selection splitters take n_splits (n_iter belonged to
    # the old sklearn.cross_validation API)
    cv = sklearn.model_selection.StratifiedShuffleSplit(n_splits=replicates, test_size=0.25)
    for train_index, test_index in cv.split(X, y):
        # make train test split
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # resample train set for equal class weights
        X_train, y_train = my_resample(X_train, y_train, J*100, csmf_flat)

        clf = my_fit_predictor(X_train, y_train)

        # resample test set to have uninformative cause distribution
        csmf_rand = pd.Series(np.random.dirichlet(np.ones(J)), index=causes)
        X_test_resamp, y_test_resamp = my_resample(X_test, y_test, J*100, csmf_rand)

        # make predictions
        csmf_pred = clf.my_predict(X_test_resamp)

        # test predictions
        csmf_acc = measure_prediction_quality(csmf_pred, y_test_resamp)

        results.append({'csmf_acc': csmf_acc, 'key': my_key(module, clf)})

    df = pd.DataFrame(results)
    g = df.groupby('key')
    return g.csmf_acc.describe().unstack()

# Baseline: score the random-allocation predictor (default replicates=10)
baseline_csmf_acc = measure_csmf_acc(random_allocation)
# bare expression: displays the table when run as a notebook cell; the value
# is discarded when run as a plain script
baseline_csmf_acc

import sklearn.naive_bayes

def nb_pr_allocation(X_train, y_train):
    """Make predictions by averaging naive Bayes class probabilities

    Fits a ``sklearn.naive_bayes.BernoulliNB`` classifier and attaches a
    ``my_predict`` function that turns its per-example class
    probabilities into population-level cause fractions.

    Parameters
    ----------
    X_train : array-like, feature vectors
    y_train : array-like, corresponding labels

    Results
    -------
    clf : fitted BernoulliNB with an added ``my_predict`` attribute;
        ``clf.my_predict(X_test)`` returns a pd.Series indexed by
        ``clf.classes_`` holding the mean predicted probability of each
        class over the rows of X_test
    """
    clf = sklearn.naive_bayes.BernoulliNB()
    clf.fit(X_train, y_train)

    def my_predict(X_test):
        y_pred = clf.predict_proba(X_test)
        # column sums divided by the number of rows: mean probability per class
        csmf_pred = pd.Series(y_pred.sum(axis=0), index=clf.classes_) / float(len(y_pred))
        return csmf_pred
    clf.my_predict = my_predict
    return clf

# Score the naive Bayes predictor; `results` is shared across calls, so the
# returned table also includes rows from any earlier measure_csmf_acc calls
measure_csmf_acc(nb_pr_allocation)

