In [1]:
import os
import numpy as np

In [2]:
def read_fc_data_file(fc_data_file):
    ''' Read an existing functional connectivity data file that also contains labels and demographics,
    or create it if it doesn't exist. Returns the data as a pandas DataFrame.
    Args:
        fc_data_file (str): The complete path to the fc data file  
    '''
    import pandas as pd
    if not os.path.exists(fc_data_file):
        import create_fc_fisher_z_csv_file as makedata
        subjDF = makedata.read_data()
        fcData = makedata.read_fc_data(subjDF)
        fcData.to_csv(path_or_buf=fc_data_file, index=True)
    else:
        fcData = pd.read_csv(fc_data_file, index_col=0)
    fcData['func_perc_quart'] = pd.qcut(fcData['func_perc_fd'], q=4, labels=False)
    return fcData

In [3]:
datas = read_fc_data_file( './abide_fc_data_fisher_z.csv')

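A quick sanity check of the quartile column added by read_fc_data_file (a sketch; pd.qcut with labels=False yields integer codes 0-3, so the four bins should be roughly equal in size):

In [ ]:
# func_perc_quart comes from pd.qcut(func_perc_fd, q=4, labels=False);
# each of the four codes should cover roughly a quarter of the rows.
print(datas['func_perc_quart'].value_counts().sort_index())
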
In [4]:
thing=datas.DX_GROUP.as_matrix()
np.unique(thing, return_counts=True)


Out[4]:
(array([1, 2]), array([504, 530]))

In [5]:
np.savetxt('blah_counts.csv',np.unique(thing, return_counts=True), delimiter=',', newline=os.linesep)

In [18]:
def ppc_fc_data(fcData, age_range, motion_threshold, 
                labels_col = 'DX_GROUP', strata_cols = ['DX_GROUP', 'func_perc_quart', 'SEX'], 
                agecol='AGE_YRS', motioncol='func_perc_fd'):
    ''' Takes an fc DataFrame with functional connectivity columns as well as demographics and returns
    strata, labels, and features with imputed values.
    Args:
        fcData (pandas.DataFrame): The fc data
        age_range (tuple): lower and upper age bounds (inclusive) for the sample
        motion_threshold (int or float): sample will contain rows with motioncol <= motion_threshold
        labels_col (str): column to use as prediction labels
        strata_cols (list of str): columns used to stratify
        agecol (str): column containing age in years
        motioncol (str): column containing the motion measure thresholded by motion_threshold
    '''
    from sklearn.preprocessing import Imputer
    #Keep rows within the age range and at or below the motion threshold
    fcData_threshed = fcData.query("{age} >= {lo} & {age} <= {hi} & {mot} <= {thresh}".format(
        age=agecol, lo=age_range[0], hi=age_range[1], mot=motioncol, thresh=motion_threshold))
    #Integer group codes (one per row) over the stratification columns
    strata = fcData_threshed.groupby(strata_cols).grouper.group_info[0]
    labels = fcData_threshed[labels_col].as_matrix()
    features = fcData_threshed.loc[:,'#2001_#2002':'#9160_#9170']
    features_imputed = Imputer(missing_values='NaN', strategy='mean', axis=0).fit_transform(features)
    return strata, labels, features_imputed

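Note that sklearn.preprocessing.Imputer and DataFrame.as_matrix were removed in later releases; a minimal sketch of an equivalent imputation step, assuming scikit-learn >= 0.22 and the same column slice used above:

In [ ]:
# Hypothetical modern equivalent of the Imputer/as_matrix calls in ppc_fc_data.
from sklearn.impute import SimpleImputer

def impute_features_modern(fcData_threshed):
    #Same connectivity-column slice as above
    features = fcData_threshed.loc[:, '#2001_#2002':'#9160_#9170']
    #Column-wise mean imputation of missing connectivity values
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    return imputer.fit_transform(features)
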
In [66]:
labels_col='AGE_YRS'
oos_iter = 10
sss_n_iter=10
skf_n_folds=3
age_range=(6,18)
motion_threshold=5
sample_size=30
classifier = 'svc'
cvmethod = 'sss'
modelDir = './cv_models/test'
outputDir = './cv_output/test'
fname_prefix = "{}_{}_mt{}_n{}".format(cvmethod, classifier, motion_threshold, sample_size)
fc_data_file = './abide_fc_data_fisher_z.csv'
fcData = read_fc_data_file(fc_data_file)
strata, labels, features = ppc_fc_data(fcData, age_range, motion_threshold, labels_col=labels_col)

In [67]:
cv=True
sparse=False
classifier='svr' #support vector regression
data=features
penalty='l1'
n_iter=sss_n_iter
n_jobs=1
saveData=False

In [68]:
if sparse:
    if classifier == 'svr':
        print "Support vector regression does not support sparse, l1 penalties. `sparse` option ignored."
        penalty = 'l2'
    else:
        penalty = 'l1'
else:
    penalty = 'l2'

if classifier == 'svc':
    from sklearn.svm import LinearSVC
    algorithm = LinearSVC(penalty = penalty)
    if penalty == 'l1':
        algorithm.set_params(dual=False)
elif classifier == 'svr':
    from sklearn.svm import LinearSVR
    algorithm = LinearSVR()

In [69]:
classifier


Out[69]:
'svr'

In [71]:
labels


Out[71]:
array([ 13.,  16.,  14.,  14.,  11.,  12.,  13.,  17.,  12.,  16.,  15.,
        15.,  18.,  16.,  11.,  13.,  13.,  10.,  10.,   8.,  11.,  12.,
        12.,   9.,   9.,  15.,  11.,  11.,   8.,  10.,   9.,  10.,  14.,
        17.,  16.,  15.,  16.,  13.,  12.,  14.,  12.,  12.,  14.,  15.,
        13.,  13.,  16.,  14.,  14.,  16.,  16.,  16.,  13.,  14.,  16.,
        15.,  12.,  15.,  17.,  15.,  17.,  13.,  13.,  15.,  18.,  14.,
        14.,  17.,  14.,  14.,  14.,  16.,  16.,  15.,  15.,  12.,  14.,
        14.,  12.,  12.,  11.,  12.,  16.,   9.,   9.,  16.,  13.,   9.,
        15.,  18.,  18.,  13.,  15.,  17.,  16.,  17.,  11.,  17.,  14.,
        13.,  18.,  16.,  18.,  18.,  17.,  18.,  15.,  18.,  12.,  16.,
        13.,  17.,   9.,   9.,  13.,  18.,  11.,  15.,  14.,  10.,   9.,
        11.,  11.,  14.,  17.,  17.,  14.,  14.,  14.,  17.,  13.,  15.,
        16.,  16.,  14.,  13.,  13.,  17.,  13.,  15.,  17.,  16.,  13.,
        15.,  15.,  18.,  13.,  18.,  17.,  17.,  18.,  18.,  11.,  15.,
        12.,   8.,  14.,  10.,  13.,   9.,  13.,   9.,  13.,  16.,  16.,
        15.,  10.,   8.,  14.,  16.,  16.,  12.,  13.,   7.,   9.,  14.,
        18.,  13.,  13.,  15.,  16.,  15.,  16.,  12.,  14.,  14.,  13.,
        12.,  15.,  12.,  14.,  14.,  13.,  13.,  14.,  14.,  14.,  12.,
        12.,  10.,  10.,   8.,  10.,   8.,  10.,  11.,   8.,   8.,   9.,
        11.,   9.,   8.,  10.,  14.,  14.,  10.,  15.,   9.,   8.,  17.,
        11.,  14.,   7.,   9.,   9.,  13.,   8.,  14.,   9.,  13.,  10.,
        15.,  16.,  15.,  14.,   8.,  11.,  11.,   8.,  11.,  18.,   8.,
         8.,   8.,   8.,   8.,   8.,  10.,  11.,  11.,  12.,  12.,  13.,
        13.,  14.,  14.,  14.,  14.,  15.,  17.,   7.,  10.,  18.,   8.,
        10.,  11.,  12.,  12.,  14.,  16.,   6.,   8.,   8.,   9.,  10.,
        10.,  11.,  11.,  11.,  13.,  13.,  13.,  13.,  14.,  14.,  15.,
        15.,  15.,  15.,  16.,  16.,  10.,  12.,  16.,  16.,  17.,  12.,
        16.,  13.,  13.,  12.,  15.,  14.,   8.,   8.,  12.,   9.,  11.,
        11.,   7.,   9.,  10.,   9.,   8.,   8.,   8.,  12.,  12.,  12.,
        17.,  15.,  16.,  11.,  12.,  14.,  17.,  10.,  17.,  14.,  10.,
        13.,   8.,  10.,  11.,  14.,  14.,  15.,  13.,  13.,  13.,  15.,
        11.,  17.,  12.,  13.,  12.,  13.,  12.,  13.,   9.,  11.,  16.,
        13.,  13.,  12.,   9.,  11.,  13.,  13.,  10.,  11.,  17.,  17.,
        18.])

In [70]:
if cv:
    from sklearn.grid_search import GridSearchCV
    paramsToSearch = []        
    paramsToSearch.append({'C': [.001,.005,.01,.1,1,10]})
    if cvmethod == 'sss':
        from sklearn.cross_validation import StratifiedShuffleSplit
        cvalgorithm = StratifiedShuffleSplit(strata, n_iter = n_iter, test_size = .3)
    if cvmethod == 'skf':
        from sklearn.cross_validation import StratifiedKFold
        cvalgorithm = StratifiedKFold(strata, n_folds = skf_n_folds, shuffle = True)
    clf = GridSearchCV(algorithm, 
                       paramsToSearch, 
                       cv=cvalgorithm,
                       n_jobs=n_jobs)
else:
    #Defaults and train model
    clf = algorithm
clf.fit(data, labels)


Out[70]:
GridSearchCV(cv=StratifiedShuffleSplit(labels=[2 2 ..., 4 4], n_iter=10, test_size=0.3, random_state=None),
       error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.001, 0.005, 0.01, 0.1, 1, 10]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

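sklearn.cross_validation and sklearn.grid_search were folded into sklearn.model_selection in 0.18; a sketch of the same grid search with the newer API, using the variables defined above. The newer splitters stratify on the array passed to .split() rather than at construction, so the strata-based splits are precomputed and handed to GridSearchCV as an iterable of index pairs:

In [ ]:
# Sketch only: same C grid and strata-based shuffle splits with the
# sklearn.model_selection API (>= 0.18).
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.svm import LinearSVR

splitter = StratifiedShuffleSplit(n_splits=n_iter, test_size=0.3)
splits = list(splitter.split(data, strata))  # (train, test) index arrays, stratified on strata
clf_modern = GridSearchCV(LinearSVR(),
                          param_grid={'C': [.001, .005, .01, .1, 1, 10]},
                          cv=splits,
                          n_jobs=n_jobs)
# clf_modern.fit(data, labels)
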
In [72]:
#Get training accuracy
trainingPredictions = clf.predict(data)
if classifier=='svc':
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(labels, trainingPredictions)
    print "Training accuracy: %f" % accuracy
elif classifier=='svr':
    from sklearn.metrics import mean_squared_error, r2_score
    MSE = mean_squared_error(labels, trainingPredictions)
    RMSE = MSE**.5
    R2 = r2_score(labels, trainingPredictions)
    print "Training RMSE: %f" % RMSE
    print "Training R^2: %f" % R2


Training RMSE: 0.000493
Training R^2: 1.000000

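The training-set metrics above are computed on the same data the model was fit to and will be optimistic; when cv is True the grid search already holds a cross-validated estimate. A small sketch, assuming clf is the fitted GridSearchCV from above:

In [ ]:
if cv:
    # Mean cross-validated score of the best parameter setting
    # (accuracy for LinearSVC, R^2 for LinearSVR under the default scorer).
    print("Best params: {}".format(clf.best_params_))
    print("Mean CV score: {:.3f}".format(clf.best_score_))
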
In [74]:
if cv:
    #Extract tuned model weights
    if classifier=='svc':
        modelWeights = clf.best_estimator_.coef_[0]
        modelIntercept = clf.best_estimator_.intercept_
    elif classifier=='svr':
        modelWeights = clf.best_estimator_.coef_
        modelIntercept = clf.best_estimator_.intercept_
else:
    #Extract default model weights
    if classifier=='svc':
        modelWeights = clf.coef_[0]
        modelIntercept = clf.intercept_
    elif classifier=='svr':
        modelWeights = clf.coef_
        modelIntercept = clf.intercept_

In [76]:
print modelWeights
print modelIntercept


[-0.05847647  0.0972536  -0.00559011 ..., -0.00511297 -0.05498672
  0.03412322]
[ 0.25228141]

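To see which connectivity edges carry the largest weights, the coefficient vector can be paired with the column names from the same slice used in ppc_fc_data (a sketch; assumes the imputer dropped no all-NaN columns, so the feature order and length are unchanged):

In [ ]:
import pandas as pd
# Pair each coefficient with its connectivity column; the order matches the
# '#2001_#2002':'#9160_#9170' slice used to build the feature matrix.
feature_names = fcData.loc[:, '#2001_#2002':'#9160_#9170'].columns
if len(feature_names) == len(modelWeights):
    weightSeries = pd.Series(modelWeights, index=feature_names)
    print(weightSeries.abs().nlargest(10))
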
In [77]:
if saveData:
    if (not modelDir) | (not fname_prefix):
        raise TypeError('Model dir or file name not specified')
    print "Saving model weights, training accuracy, and model"
    from sklearn.externals import joblib
    #Write intercept + model weights to csv 
    if not os.path.exists(modelDir):
        print ("{} does not exist; creating ...".format(modelDir))
        os.makedirs(modelDir)
    fileName = os.path.join(modelDir, fname_prefix)
    print ("Writing data to {}*".format(fileName))
    np.savetxt(fileName + '_Weights.csv',np.concatenate([modelIntercept,modelWeights]), delimiter=',', newline=os.linesep)
    #svc saves training accuracy; svr saves training RMSE instead
    trainMetric = accuracy if classifier == 'svc' else RMSE
    np.savetxt(fileName + '_TrainAcc.csv', np.array([trainMetric]), delimiter=',', newline=os.linesep)
    joblib.dump(clf, fileName + '_Model.pkl')

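If saveData were True, the pickled model and the weights CSV written above could be reloaded later for inspection (a sketch; assumes the files exist under modelDir with the fname_prefix used here):

In [ ]:
from sklearn.externals import joblib
# Reload the fitted GridSearchCV object and the intercept + coefficients CSV.
savedClf = joblib.load(os.path.join(modelDir, fname_prefix + '_Model.pkl'))
savedWeights = np.loadtxt(os.path.join(modelDir, fname_prefix + '_Weights.csv'), delimiter=',')
# savedWeights[0] is the intercept; the rest are the model coefficients.
print(savedWeights.shape)
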
In [ ]:
def trainModel(data, labels, strata, modelDir = None, fname_prefix = None, classifier = 'svc',  cv = True, cvmethod = 'sss', n_iter = 10, k = 10, sparse = True, saveData = True, n_jobs=1):
    '''Applies classifier to predict labels from data, stratified by strata, and returns the resulting model.
    Optionally saves the model and coefficients.
    Args:
        data (numpy.ndarray): features to use in prediction
        labels (numpy.ndarray): labels to predict
        strata (numpy.ndarray): labels to use for stratification
        classifier (str): classifier to use; currently only 'svc' is implemented
        modelDir (str): where to save model and csv files
        fname_prefix (str): what to attach to the beginning of the filename
        cv (bool): implement cross validation? default True
        cvmethod (str): cross validation method, 'sss' for StratifiedShuffleSplit or 'skf' for StratifiedKFold
        n_iter (int): number of StratifiedShuffleSplit iterations; default 10
        k (int): if cvmethod is 'skf', number of StratifiedKFold folds; default 10
        sparse (bool): whether to use l1 (sparse) or l2 regularization; default True
        saveData (bool): save the results to the model directory? default True
        n_jobs (int): number of jobs to pass to the grid search
    '''
    if sparse:
        penalty = 'l1'
    else:
        penalty = 'l2'
    if classifier == 'svc':
        from sklearn.svm import LinearSVC
        algorithm = LinearSVC(penalty = penalty)
        if penalty == 'l1':
            algorithm.set_params(dual=False)
    if cv:
        from sklearn.grid_search import GridSearchCV
        paramsToSearch = []        
        paramsToSearch.append({'C': [.001,.005,.01,.1,1,10]})
        if cvmethod == 'sss':
            from sklearn.cross_validation import StratifiedShuffleSplit
            cvalgorithm = StratifiedShuffleSplit(strata, n_iter = n_iter, test_size = .3)
        if cvmethod == 'skf':
            from sklearn.cross_validation import StratifiedKFold
            cvalgorithm = StratifiedKFold(strata, n_folds = k, shuffle = True)
        clf = GridSearchCV(algorithm, 
                           paramsToSearch, 
                           cv=cvalgorithm,
                           n_jobs=n_jobs)
    else:
        #Defaults and train model
        clf = algorithm
    clf.fit(data, labels)
    
    #Get training accuracy
    from sklearn.metrics import accuracy_score
    trainingPredictions = clf.predict(data)
    accuracy = accuracy_score(labels, trainingPredictions)
    print "Training accuracy: %f" % accuracy

    if cv:
        #Extract tuned model weights
        modelWeights = clf.best_estimator_.coef_[0]
        modelIntercept = clf.best_estimator_.intercept_
    else:
        #Extract default model weights
        modelWeights = clf.coef_[0]
        modelIntercept = clf.intercept_
    
    if saveData:
        if (not modelDir) | (not fname_prefix):
            raise TypeError('Model dir or file name not specified')
        print "Saving model weights, training accuracy, and model"
        from sklearn.externals import joblib
        #Write intercept + model weights to csv 
        if not os.path.exists(modelDir):
            print ("{} does not exist; creating ...".format(modelDir))
            os.makedirs(modelDir)
        fileName = os.path.join(modelDir, fname_prefix)
        print ("Writing data to {}*".format(fileName))
        np.savetxt(fileName + '_Weights.csv',np.concatenate([modelIntercept,modelWeights]), delimiter=',', newline=os.linesep)
        np.savetxt(fileName + '_TrainAcc.csv',np.array([accuracy]), delimiter=',',newline=os.linesep)
        joblib.dump(clf, fileName + '_Model.pkl')
        
    return (clf)

In [ ]:
def testModel(data, labels, clf = None, modelDir = None, fname_prefix = None, outputDir = None, saveData=True):
    
    ''' Test a linear classifier by loading a fitted model and returning predictions on given data.
    Args:
        data (ndarray): A data matrix organized as nsamples x nfeatures 
        labels (ndarray): A 1d labels array same length as nsamples  
        clf (sklearn fit object): If not specified, function will try to load using modelDir and fname_prefix
        modelDir (str): directory from which to load pickled model files
        fname_prefix (str): filename prefix used for model files
        outputDir (str): directory to write csv files with testing accuracy and predictions
        saveData (bool; optional): whether to save csv output in addition to returning the test accuracy; 
            default True
    '''

    from sklearn.externals import joblib
    from sklearn.metrics import accuracy_score

    if not clf:
        if (not modelDir) | (not fname_prefix):
            raise TypeError('No clf provided and Model dir or file name not specified')
        modelPath = os.path.join(modelDir, fname_prefix)
        #If model doesn't exist use csv with coefs - TODO
        clf = joblib.load(modelPath + '_Model.pkl')
    predictions = clf.predict(data)

    #Compute accuracy on test data
    accuracy = accuracy_score(labels, predictions)
    print "Testing accuracy: %f" % accuracy

    if saveData:
        if (not outputDir) | (not fname_prefix):
            raise TypeError('Output dir or file name not specified')
        if not os.path.exists(outputDir):
            print ("{} does not exist; creating ...".format(outputDir))
            os.makedirs(outputDir)
        outPath = os.path.join(outputDir, fname_prefix)
        print "Saving test accuracy and predictions to {}*".format(outPath)
        #Save accuracy
        np.savetxt(outPath + '_TestAcc.csv', np.array([accuracy]), delimiter=',',newline=os.linesep)
        #Save predictions
        np.savetxt(outPath + '_Predictions.csv',predictions, delimiter=',',newline=os.linesep)

    return accuracy

In [ ]:
oos_iter = 10
sss_n_iter=10
skf_n_folds=3
age_range=(6,18)
motion_threshold=5
sample_size=30
classifier = 'svc'
cvmethod = 'sss'
modelDir = './cv_models/test'
outputDir = './cv_output/test'
fname_prefix = "{}_{}_mt{}_n{}".format(cvmethod, classifier, motion_threshold, sample_size)
fc_data_file = './abide_fc_data_fisher_z.csv'

In [ ]:
fcData = read_fc_data_file(fc_data_file)

In [ ]:
strata, labels, features = ppc_fc_data(fcData, age_range, motion_threshold)

In [ ]:
from sklearn.cross_validation import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(strata, n_iter = oos_iter, test_size = sample_size, train_size = sample_size)

print("Running {} iterations of {} cv'd {} classification\n".format(oos_iter, cvmethod, classifier) + 
      "Each N = {}, Motion thresh = {}".format(sample_size, motion_threshold))

In [ ]:
for i, (train_index, test_index) in enumerate(sss):
    train_features, train_labels = features[train_index, :], labels[train_index]
    test_features, test_labels = features[test_index, :], labels[test_index]
    ifname_prefix = fname_prefix + '_i{:03d}'.format(i)
    aCLF = trainModel(train_features, train_labels, train_labels, 
                      modelDir=modelDir, fname_prefix=ifname_prefix, 
                      classifier = classifier,  
                      cv = True, cvmethod = cvmethod, 
                      n_iter = sss_n_iter, k = skf_n_folds, 
                      sparse = True, saveData = True, n_jobs=4)
    accuracy = testModel(test_features, test_labels, clf = aCLF, 
                         fname_prefix = ifname_prefix, 
                         outputDir = outputDir, saveData=True)

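After the loop finishes, the per-iteration test accuracies written by testModel can be aggregated into a single out-of-sample estimate (a sketch; assumes saveData=True so the _TestAcc.csv files exist in outputDir):

In [ ]:
import glob
# Collect the per-iteration test accuracy files written by testModel above.
accFiles = sorted(glob.glob(os.path.join(outputDir, fname_prefix + '_i*_TestAcc.csv')))
accuracies = np.array([np.loadtxt(f) for f in accFiles])
print("Mean test accuracy over {} iterations: {:.3f} (sd {:.3f})".format(
    len(accuracies), accuracies.mean(), accuracies.std()))
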
In [ ]: