In [ ]:
import numpy as np, pandas as pd
In [ ]:
def measure_prediction_quality(csmf_pred, y_test):
    """Calculate population-level prediction quality (CSMF Accuracy)
    Parameters
    ----------
    csmf_pred : pd.Series, predicted distribution of causes
    y_test : array-like, labels for test dataset
    Results
    -------
    csmf_acc : float
    """
    csmf_true = pd.Series(y_test).value_counts() / float(len(y_test))
    # CSMF accuracy = 1 - sum_j |CSMF_true_j - CSMF_pred_j| / (2 * (1 - min_j CSMF_true_j))
    csmf_acc = 1 - (csmf_true - csmf_pred).abs().sum() / (2 * (1 - csmf_true.min()))
    return csmf_acc
In [ ]:
csmf_pred = pd.Series({'cause_1': .5, 'cause_2': .5})
y_test = ['cause_1', 'cause_2']
measure_prediction_quality(csmf_pred, y_test)
In [ ]:
csmf_pred = pd.Series({'cause_1': 0., 'cause_2': 1.})
y_test = ['cause_1']*1000 + ['cause_2']
measure_prediction_quality(csmf_pred, y_test)
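In [ ]:
# Hand check of the examples above (a sketch, assuming the standard CSMF accuracy
# formula used in measure_prediction_quality): the first example should score 1.0
# (perfect), and this second, worst-case example should score 0.0.
csmf_true = pd.Series({'cause_1': 1000/1001., 'cause_2': 1/1001.})
csmf_pred = pd.Series({'cause_1': 0., 'cause_2': 1.})
1 - (csmf_true - csmf_pred).abs().sum() / (2 * (1 - csmf_true.min()))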
In [ ]:
val = {}
module = 'Adult'
val[module] = pd.read_csv('../3-data/phmrc_cleaned.csv')
In [ ]:
def get_data(module):
    # features: symptom/sign indicator columns (s1, s2, ...) plus age and sex
    X = np.array(val[module].filter(regex='(^s[0-9]+|age|sex)').fillna(0))
    # gold-standard underlying cause of death (34-cause grouping)
    y = np.array(val[module].gs_text34)
    site = np.array(val[module].site)
    return X, y, site
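In [ ]:
# Peek at which columns the feature regex selects (a sketch; exact column names
# depend on the PHMRC export, assumed here to use s1, s2, ... for symptom indicators).
val[module].filter(regex='(^s[0-9]+|age|sex)').columns[:10]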
In [ ]:
X, y, site = get_data(module)
X.shape
In [ ]:
def my_resample(X, y, N2, csmf_new):
    """Randomly resample X and y so that resampled cause distribution follows
    csmf_new and there are N2 samples total
    Parameters
    ----------
    X : array-like, feature vectors
    y : array-like, corresponding labels
    N2 : int, number of samples in resampled results
    csmf_new : pd.Series, distribution of resampled data
    Results
    -------
    X_new : array-like, resampled feature vectors
    y_new : array-like, corresponding resampled labels
    """
    N, I = X.shape
    assert len(y) == N, 'X and y must have same length'
    causes = csmf_new.index
    J, = causes.shape  # trailing comma for sneaky numpy reasons
    # generate count of examples for each cause according to csmf_new
    cnt_new = np.random.multinomial(N2, csmf_new)
    # build y_new with cnt copies of each cause
    y_new = []
    for cnt, cause in zip(cnt_new, causes):
        for n_j in range(cnt):
            y_new.append(cause)
    y_new = np.array(y_new)
    # resample rows of X appropriately
    X_new = np.zeros((len(y_new), I))
    for j in causes:
        new_rows, = np.where(y_new == j)  # trailing comma for sneaky numpy reasons
        candidate_rows, = np.where(y == j)  # trailing comma for sneaky numpy reasons
        assert len(candidate_rows) > 0, 'must have examples of each resampled cause'
        old_rows = np.random.choice(candidate_rows, size=len(new_rows), replace=True)
        X_new[new_rows,] = X[old_rows,]
    return X_new, y_new
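In [ ]:
# Example use of my_resample (a sketch): draw 1000 rows from the adult data with a
# flat cause distribution, then confirm the causes come out roughly balanced.
causes = np.unique(y)
csmf_flat = pd.Series(np.ones(len(causes)) / len(causes), index=causes)
X_flat, y_flat = my_resample(X, y, 1000, csmf_flat)
pd.Series(y_flat).value_counts().head()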
In [ ]:
import sklearn.base

def random_allocation(X_train, y_train):
    """make predictions by random allocation"""
    clf = sklearn.base.BaseEstimator()
    def my_predict(X_test):
        N = len(X_test)
        J = len(np.unique(y_train))
        # every cause gets equal predicted probability 1/J for every example
        y_pred = np.ones((N, J)) / float(J)
        csmf_pred = pd.Series(y_pred.sum(axis=0),
                              index=np.unique(y_train)) / N
        return csmf_pred
    clf.my_predict = my_predict
    return clf
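In [ ]:
# Quick check (a sketch): random allocation should predict a uniform CSMF,
# i.e. 1/J for each of the J observed causes, regardless of the test data.
clf = random_allocation(X, y)
clf.my_predict(X).head()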
In [ ]:
def my_key(module, clf):
    return '{}-{}'.format(module, clf)
In [ ]:
import sklearn.model_selection
In [ ]:
results = []
def measure_csmf_acc(my_fit_predictor, replicates=10):
    """my_fit_predictor : function that takes X, y and returns clf object with my_predict method
    clf.my_predict takes X_test, returns csmf_pred
    Results
    -------
    stores calculation in results list,
    returns calc for adults
    """
    X, y, site = get_data(module)
    acc = []
    np.random.seed(12345)  # set seed for reproducibility
    cv = sklearn.model_selection.StratifiedShuffleSplit(n_splits=replicates, test_size=0.25)
    for train_index, test_index in cv.split(X, y):
        # make train test split
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # resample train set for equal class weights
        J = len(np.unique(y))
        csmf_flat = pd.Series(np.ones(J)/J, index=np.unique(y))
        X_train, y_train = my_resample(X_train, y_train, J*100, csmf_flat)
        clf = my_fit_predictor(X_train, y_train)
        # resample test set to have uninformative cause distribution
        csmf_rand = pd.Series(np.random.dirichlet(np.ones(J)), index=np.unique(y))
        X_test_resamp, y_test_resamp = my_resample(X_test, y_test, J*100, csmf_rand)
        # make predictions
        csmf_pred = clf.my_predict(X_test_resamp)
        # test predictions
        csmf_acc = measure_prediction_quality(csmf_pred, y_test_resamp)
        results.append({'csmf_acc': csmf_acc, 'key': my_key(module, clf)})
    df = pd.DataFrame(results)
    g = df.groupby('key')
    return g.csmf_acc.describe().unstack()
baseline_csmf_acc = measure_csmf_acc(random_allocation)
baseline_csmf_acc
In [ ]:
import sklearn.naive_bayes

def nb_pr_allocation(X_train, y_train):
    clf = sklearn.naive_bayes.BernoulliNB()
    clf.fit(X_train, y_train)
    def my_predict(X_test):
        # average predicted probabilities across examples to get a CSMF estimate
        y_pred = clf.predict_proba(X_test)
        csmf_pred = pd.Series(y_pred.sum(axis=0), index=clf.classes_) / float(len(y_pred))
        return csmf_pred
    clf.my_predict = my_predict
    return clf
measure_csmf_acc(nb_pr_allocation)
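In [ ]:
# Side-by-side summary of everything accumulated in `results` so far (a sketch),
# comparing the naive Bayes predictor with the random-allocation baseline.
pd.DataFrame(results).groupby('key').csmf_acc.describe()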
In [ ]: