We won't work through this notebook

We won't have time. But I thought I'd include it, in case you want to see exactly how I implement my population-level quality metric.


In [ ]:
import numpy as np, pandas as pd

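Let's put the CSMF Accuracy calculation right at the top. For true cause fractions $\text{CSMF}^{\text{true}}_j$ and predicted cause fractions $\text{CSMF}^{\text{pred}}_j$, it is

$$\text{CSMF Accuracy} = 1 - \frac{\sum_j \left|\text{CSMF}^{\text{true}}_j - \text{CSMF}^{\text{pred}}_j\right|}{2\left(1 - \min_j \text{CSMF}^{\text{true}}_j\right)}$$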


In [ ]:
def measure_prediction_quality(csmf_pred, y_test):
    """Calculate population-level prediction quality (CSMF Accuracy)

    CSMF Accuracy is one minus the total absolute error between the true
    and the predicted cause fractions, divided by the largest total error
    that is possible, ``2 * (1 - min(csmf_true))``.  A perfect prediction
    scores 1 and the worst possible prediction scores 0.

    Parameters
    ----------
    csmf_pred : pd.Series, predicted distribution of causes
    y_test : array-like, labels for test dataset

    Results
    -------
    csmf_acc : float

    Raises
    ------
    ValueError
        If y_test is empty, since no true distribution can be computed.
    """
    n_test = len(y_test)
    if n_test == 0:
        raise ValueError('y_test must contain at least one label')

    csmf_true = pd.Series(y_test).value_counts() / float(n_test)

    # Align both distributions on the union of causes; a cause missing from
    # one side has fraction zero there (rather than silently becoming NaN).
    causes = csmf_true.index.union(csmf_pred.index)
    csmf_true = csmf_true.reindex(causes, fill_value=0.)
    csmf_pred = csmf_pred.reindex(causes, fill_value=0.)

    total_abs_err = float(np.abs(csmf_true - csmf_pred).sum())
    max_abs_err = 2. * (1. - float(csmf_true.min()))

    if max_abs_err == 0:
        # Only one cause exists at all, so the normalizer vanishes; score the
        # prediction as perfect when it is exact and as worst-case otherwise.
        return 1.0 if np.isclose(total_abs_err, 0.) else 0.0

    csmf_acc = 1. - total_abs_err / max_abs_err

    return csmf_acc

How can I test this?


In [ ]:
# one test label per cause, with predicted fractions that match them exactly
y_test = ['cause_1', 'cause_2']
csmf_pred = pd.Series([.5, .5], index=['cause_1', 'cause_2'])
measure_prediction_quality(csmf_pred, y_test)

In [ ]:
# nearly every test label is cause_1, yet all predicted mass is on cause_2
y_test = 1000 * ['cause_1'] + ['cause_2']
csmf_pred = pd.Series([0., 1.], index=['cause_1', 'cause_2'])
measure_prediction_quality(csmf_pred, y_test)

Things we don't have time for

An approach that does the cross-validation truly out of sample:


In [ ]:
# read the cleaned data file once and keep it in a dict keyed by module name
module = 'Adult'
val = {module: pd.read_csv('../3-data/phmrc_cleaned.csv')}

In [ ]:
def get_data(module):
    """Extract feature matrix, labels, and sites for one module

    Reads the DataFrame stored under ``module`` in the notebook-level
    ``val`` dict.

    Parameters
    ----------
    module : key into ``val``

    Results
    -------
    X : np.array, columns whose names match the regex below, with missing
        values replaced by 0
    y : np.array, values of the ``gs_text34`` column
    site : np.array, values of the ``site`` column
    """
    df = val[module]
    features = df.filter(regex='(^s[0-9]+|age|sex)').fillna(0)

    return np.array(features), np.array(df.gs_text34), np.array(df.site)

In [ ]:
# pull the feature matrix, labels, and sites for the selected module
X, y, site = get_data(module)
# display the dimensions of the feature matrix
X.shape

In [ ]:
def my_resample(X, y, N2, csmf_new):
    """Randomly resample X and y so that resampled cause distribution follows
    csmf_new and there are N2 samples total

    The number of resampled rows for each cause is a multinomial draw with
    probabilities csmf_new; the rows for a cause are then drawn with
    replacement from the rows of X that carry that label in y.

    Parameters
    ----------
    X : array-like, feature vectors
    y : array-like, corresponding labels
    N2 : int, number of samples in resampled results
    csmf_new : pd.Series, distribution of resampled data

    Results
    -------
    X_new : array-like, resampled feature vectors
    y_new : array-like, corresponding resampled labels

    Raises
    ------
    AssertionError
        If X and y differ in length, or if a cause that received at least
        one resampled row has no examples in y.
    """

    N, I = X.shape
    assert len(y) == N, 'X and y must have same length'

    causes = csmf_new.index

    # generate count of examples for each cause according to csmf_new
    cnt_new = np.random.multinomial(N2, csmf_new)

    # build the resampled labels: each cause repeated as often as it was drawn
    y_new = np.array([cause
                      for cnt, cause in zip(cnt_new, causes)
                      for _ in range(cnt)])

    # resample rows of X appropriately
    X_new = np.zeros((len(y_new), I))
    for j in causes:
        new_rows, = np.where(y_new == j)  # np.where returns a 1-tuple here
        if len(new_rows) == 0:
            # this cause drew no rows, so it needs no examples to copy from
            continue

        candidate_rows, = np.where(y == j)  # np.where returns a 1-tuple here
        assert len(candidate_rows) > 0, 'must have examples of each resampled cause'

        old_rows = np.random.choice(candidate_rows, size=len(new_rows), replace=True)
        X_new[new_rows] = X[old_rows]
    return X_new, y_new

In [ ]:
def random_allocation(X_train, y_train):
    """Make predictions by random allocation

    Baseline predictor: every test row is allocated to each cause seen in
    y_train with equal probability, so the predicted cause distribution is
    uniform regardless of the features.

    Parameters
    ----------
    X_train : array-like, feature vectors (unused by this baseline)
    y_train : array-like, labels; only the set of distinct causes is used

    Results
    -------
    clf : sklearn.base.BaseEstimator with an added ``my_predict`` method
          that takes X_test and returns csmf_pred (pd.Series)
    """
    # imported here so this cell does not depend on some other cell having
    # pulled in sklearn.base as a side effect of another sklearn import
    import sklearn.base

    causes = np.unique(y_train)
    J = len(causes)  # must stay an int: it is used as an array dimension

    clf = sklearn.base.BaseEstimator()
    def my_predict(X_test):
        N = len(X_test)

        # each row spreads probability 1/J over the J causes
        y_pred = np.ones((N, J)) / float(J)
        csmf_pred = pd.Series(y_pred.sum(axis=0),
                              index=causes) / N
        return csmf_pred
    clf.my_predict = my_predict
    return clf

In [ ]:
def my_key(module, clf):
    """Build the label under which results for this module/predictor pair are stored"""
    parts = (format(module), format(clf))
    return '-'.join(parts)

In [ ]:
import sklearn.model_selection

In [ ]:
results = []
def measure_csmf_acc(my_fit_predictor, replicates=10):
    """ Measure CSMF Accuracy of a predictor over repeated train/test splits

    my_fit_predictor : function that takes X,y returns clf object with my_predict method
    clf.my_predict takes X_test, return csmf_pred
    replicates : int, number of stratified train/test splits to evaluate

    For each split the training rows are resampled to a flat cause
    distribution, the test rows are resampled to a cause distribution drawn
    from a flat Dirichlet, and the predicted distribution is scored against
    the resampled test labels.

    Results
    -------
    appends one {'csmf_acc', 'key'} record per replicate to the
    notebook-level ``results`` list,
    returns summary statistics of csmf_acc grouped by key, computed over
    everything accumulated in ``results`` so far
    """
    X, y, _ = get_data(module)

    causes = np.unique(y)
    J = len(causes)
    # target for the training set: every cause equally likely
    csmf_flat = pd.Series(np.ones(J)/J, index=causes)

    np.random.seed(12345) # set seed for reproducibility
    # sklearn.model_selection names the number of splits n_splits
    # (n_iter belonged to the older sklearn.cross_validation API)
    cv = sklearn.model_selection.StratifiedShuffleSplit(n_splits=replicates, test_size=0.25)
    for train_index, test_index in cv.split(X, y):
        # make train test split
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # resample train set for equal class weights
        X_train, y_train = my_resample(X_train, y_train,  J*100, csmf_flat)

        clf = my_fit_predictor(X_train, y_train)

        # resample test set to have uninformative cause distribution
        csmf_rand = pd.Series(np.random.dirichlet(np.ones(J)), index=causes)
        X_test_resamp, y_test_resamp = my_resample(X_test, y_test, J*100, csmf_rand)

        # make predictions
        csmf_pred = clf.my_predict(X_test_resamp)

        # test predictions
        csmf_acc = measure_prediction_quality(csmf_pred, y_test_resamp)

        results.append({'csmf_acc':csmf_acc, 'key':my_key(module, clf)})

    df = pd.DataFrame(results)
    g = df.groupby('key')
    # NOTE(review): recent pandas versions already return one row per key
    # from describe(); confirm the unstack() is still wanted there
    return g.csmf_acc.describe().unstack()

# score the random-allocation baseline and display its summary
baseline_csmf_acc = measure_csmf_acc(random_allocation)
baseline_csmf_acc

In [ ]:
import sklearn.naive_bayes

def nb_pr_allocation(X_train, y_train):
    """Fit a Bernoulli naive Bayes classifier and attach a CSMF predictor

    Parameters
    ----------
    X_train : array-like, feature vectors
    y_train : array-like, corresponding labels

    Results
    -------
    clf : fitted sklearn.naive_bayes.BernoulliNB with an added ``my_predict``
          method that takes X_test and returns csmf_pred (pd.Series), the
          per-class predicted probabilities averaged over the test rows
    """
    clf = sklearn.naive_bayes.BernoulliNB().fit(X_train, y_train)

    def my_predict(X_test):
        proba = clf.predict_proba(X_test)
        n_rows = float(len(proba))
        # column sums of the probabilities, normalized by the number of rows
        return pd.Series(proba.sum(axis=0), index=clf.classes_) / n_rows

    clf.my_predict = my_predict
    return clf
    
# score the naive Bayes predictor with the same procedure and display the summary
measure_csmf_acc(nb_pr_allocation)

In [ ]: