In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os.path as op
import numpy as np
import pandas as pd
from mriqc.viz.misc import (
    raters_variability_plot, plot_abide_stripplots, plot_corrmat, plot_histograms, figure1, plot_batches
)
from pkg_resources import resource_filename as pkgrf
from mriqc.classifier.data import read_dataset, combine_datasets
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sn
sn.set(style="whitegrid")

# Directory where figures/CSVs produced by this notebook are written.
out_path = op.abspath('results')

In [ ]:
# Paths to the feature (x_*) and label (y_*) tables bundled with the mriqc
# package, for the ABIDE and DS030 datasets.
x_path, y_path, ds030_x_path, ds030_y_path = [
    pkgrf('mriqc', 'data/csv/%s.csv' % fname)
    for fname in ('x_abide', 'y_abide', 'x_ds030', 'y_ds030')
]

Testing scikit-learn's cross-validation splitters


In [ ]:
from sklearn.model_selection import PredefinedSplit

# Toy example: 20 samples. In ``test_fold``, entries of -1 are never placed
# in a test set, so only the last 5 samples (fold 0) form the single split.
naive_x = np.arange(20)  # idiom: np.arange over np.array(list(range(...)))
naive_y = [0.0] * 15 + [1.0] * 5  # idiom: plain literals over zeros/ones + tolist

ps = PredefinedSplit(test_fold=(np.array(naive_y) - 1).tolist())
print(ps.get_n_splits(naive_x, naive_y))  # expected: 1

for train_index, test_index in ps.split():
    print(train_index)
    print(test_index)

In [ ]:
from mriqc.classifier.sklearn._split import RepeatedPartiallyHeldOutKFold

# NOTE(review): ``fulldata`` is not defined anywhere above in this notebook —
# this cell relies on hidden kernel state and fails on Restart & Run All.
train_y = fulldata[['rater_1']].values.ravel()
# Treat the 'UM' site as the partially held-out group (1 = held out).
groups = (fulldata.site == 'UM').astype(int)
sp = RepeatedPartiallyHeldOutKFold(n_splits=3).split(fulldata.values, y=train_y, groups=groups)

# Inspect the size and label balance of every fold. Fix: the original had
# all prints commented out, leaving the loop computing values it never used.
for i, (train_index, test_index) in enumerate(sp):
    print(len(train_index), len(test_index))
    y = train_y[test_index]
    unique_y, y_inversed = np.unique(y, return_inverse=True)
    y_counts = np.bincount(y_inversed)
    print("Counts[%d]:" % i, y_counts)

from sklearn.model_selection import StratifiedKFold
from mriqc.classifier.sklearn._split import RepeatedBalancedKFold

# 10-fold balanced CV, repeated 5 times; print the class counts of every
# test fold to verify the balancing.
# (StratifiedKFold is imported for comparison but not used in this cell.)
train_y = fulldata[['rater_1']].values.ravel()
splits = RepeatedBalancedKFold(n_splits=10, n_repeats=5).split(fulldata[coi], y=train_y)

for i, (train_index, test_index) in enumerate(splits):
    fold_labels = train_y[test_index]
    unique_y, label_idx = np.unique(fold_labels, return_inverse=True)
    print("Counts[%d]:" % i, np.bincount(label_idx))
    
from mriqc.classifier.sklearn._split import RobustLeavePGroupsOut

# Map each site name to an integer group label for leave-one-site-out CV.
# Fix: use sorted() instead of list(set(...)) — set iteration order is not
# deterministic across Python runs (string hash randomization), so the group
# indices (and therefore the CV folds) were not reproducible.
sites = fulldata[['site']].values.ravel().tolist()
sitenames = sorted(set(sites))
groups = [sitenames.index(s) for s in sites]
cv = RobustLeavePGroupsOut(n_groups=1)
cvs = cv.split(fulldata[coi], y=fulldata[['rater_1']].values.ravel().tolist(), groups=groups)

Testing the new MRIQC preprocessing steps (site-correlation selection, noise winnowing, batch scaling)


In [ ]:
from mriqc.classifier.sklearn import preprocessing as mcsp
# Drop features that are too predictive of the acquisition site.
# NOTE(review): ``scaled`` and ``features`` are only defined in cells further
# down this notebook (the BatchScaler cell) — this cell only works when the
# notebook is executed out of order.
select = mcsp.SiteCorrelationSelector()
selected = select.fit_transform(scaled[features + ['site']].values, None)
# Names of the columns kept by the selector.
# NOTE(review): this indexes features + ['site'] with ``select.mask_`` —
# assumes the fitted mask covers the appended 'site' column too; confirm
# against SiteCorrelationSelector's implementation.
try1 = scaled[features + ['site']].columns[select.mask_].ravel().tolist()

In [ ]:
# Report which features the site-correlation selector removed vs. kept.
removed = sorted(set(features) - set(try1))
print('Removed %s:\n' % removed)
print('Kept %s:\n' % sorted(try1))

In [ ]:
# Raw (unsorted) list of kept column names, for quick visual inspection.
print(try1)

In [ ]:
from mriqc.classifier.sklearn import preprocessing as mcsp

# Run the noise-winnow feature selector four times and keep only the
# features selected in every run (the selector is not deterministic).
# Fix: the original assigned try2/try3/try4 from the *first* selector's
# ``select.mask_`` (copy-paste bug), so the "intersection" silently
# degenerated to try1. Each repetition now uses its own fitted mask.
tries = []
for _ in range(4):
    selector = mcsp.CustFsNoiseWinnow()
    selector.fit_transform(fulldata[features].values, fulldata.rater_1.values)
    tries.append(fulldata[features].columns[selector.mask_].ravel().tolist())
try1, try2, try3, try4 = tries

intersection = set(try1) & set(try2) & set(try3) & set(try4)
print(list(sorted(intersection)))
print(len(intersection))
print(list(sorted(set(features) - intersection)))

In [ ]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.model_selection import train_test_split

# Can a classifier predict the acquisition site from the IQMs?
# NOTE(review): the original computed ``selmask`` from ``intersection`` and
# then immediately overwrote it with a hand-picked mask; the first
# assignment was dead code and is dropped here.
selmask = np.zeros(len(features), dtype=bool)
selmask[[0, 1, 2, 10]] = True

X = fulldata[features].values
binarize = LabelBinarizer()
y = binarize.fit_transform(fulldata.site.values.ravel().tolist())

# Fix: the original called ``clf.fit(X_train, y_train)`` without ever
# creating the split — X_train/y_train (and X_test/y_test, needed by the
# next cell) only existed as hidden kernel state although
# ``train_test_split`` was imported for exactly this purpose.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

clf = ExtraTreesClassifier(
    n_estimators=1000,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_split=1e-07,  # NOTE(review): deprecated in sklearn >= 0.19
    bootstrap=True,
    oob_score=False,
    n_jobs=8,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight='balanced'
)

clf = clf.fit(X_train, y_train)

In [ ]:
# Macro-averaged AUC of the site classifier on the held-out set.
site_pred = clf.predict(X_test)
print(roc_auc_score(y_test, site_pred, average='macro', sample_weight=None))

In [ ]:
# Per-feature importances of the site classifier (higher = more
# site-predictive, i.e. candidates for removal before QC classification).
clf.feature_importances_

In [ ]:
from sklearn.preprocessing import RobustScaler
from mriqc.classifier.sklearn import preprocessing as mcsp

# Center (but do not scale) each column of interest within its batch.
# NOTE(review): ``coi`` is never defined in this notebook (hidden kernel
# state), and the SiteCorrelationSelector cell above uses ``scaled`` defined
# *here* — the notebook only works when cells are run out of order.
scaler = mcsp.BatchScaler(RobustScaler(with_scaling=False), columns=coi)
scaled = scaler.fit_transform(fulldata)
fig = plot_batches(scaled[coi], excl_columns=['rater_1'])

In [ ]:
from sklearn.pipeline import Pipeline

# Scale per site, then winnow noisy features, as one sklearn pipeline.
steps = [
    ('std', mcsp.BatchScaler(RobustScaler(), groups='site', columns=coi)),
    ('feature_selection', mcsp.CustFsNoiseWinnow(features=features)),
]
clf = Pipeline(steps)

labels = fulldata[['rater_1']].values.ravel()
clf.fit(fulldata, labels)

In [ ]:
# Input CSVs with image quality metrics (IQMs).
# NOTE(review): hardcoded absolute local path — not portable; and
# ``data_path`` is never defined in this notebook (hidden kernel state).
mit_csv = '/home/oesteban/mriqc/mit-satra/T1-mit.csv'
abide_csv = op.join(data_path, 'runs/20170505_0.9.3-2017-04-23-2ba2c2e40c39/T1w.csv')

In [ ]:
# Load the MIT IQMs (subject IDs kept as strings) and the rated ABIDE set.
mit_df = pd.read_csv(mit_csv, index_col=False, dtype={'subject_id': object})
abide_df, pp_cols = read_dataset(abide_csv, op.join(data_path, 'ABIDE_QC_all.csv'), rate_label='rater_1')

In [ ]:
# Tag provenance before concatenation: rater==1 -> MIT, rater==0 -> ABIDE.
# Idiom: assign scalars directly; pandas broadcasts them to the whole
# column, so building ``[v] * len(df)`` lists was unnecessary.
mit_df['rater'] = 1
mit_df['site'] = 'MIT'
abide_df['rater'] = 0

# Drop ABIDE's original rating column so both frames share the same schema.
del abide_df['rater_1']
mdata = pd.concat([abide_df, mit_df], axis=0)

In [ ]:
zscored = mcsp.BatchRobustScaler(by='site', columns=coi).fit_transform(mdata)

# IQMs to plot (skip per-dimension spacing/size and summary statistics).
colnames = [col for col in sorted(pp_cols)
            if not (col.startswith('spacing') or col.startswith('summary') or col.startswith('size'))]

nrows = len(colnames)


def _plot_site_hists(data, col, ax):
    """Overlay ABIDE (rater==0) and MIT (rater==1) distributions of one IQM."""
    sn.distplot(data.loc[(data.rater == 0), col], norm_hist=False,
                label='ABIDE', ax=ax, color='dodgerblue')
    sn.distplot(data.loc[(data.rater == 1), col], norm_hist=False,
                label='MIT', ax=ax, color='darkorange')


def _robust_xlim(data, col, ax):
    """Clip the x-axis to the central 99.6% of values to hide outliers."""
    vals = data[[col]].values.ravel().tolist()
    ax.set_xlim([np.percentile(vals, 0.2), np.percentile(vals, 99.8)])


fig = plt.figure(figsize=(18, 2 * nrows))
gs = GridSpec(nrows, 2, hspace=0.2)

# One row per IQM: left column raw values, right column site-wise
# robust-scaled values. The repeated distplot/xlim code of the original
# is factored into the two helpers above.
for i, col in enumerate(sorted(colnames)):
    ax_nzs = plt.subplot(gs[i, 0])
    ax_zsd = plt.subplot(gs[i, 1])

    _plot_site_hists(mdata, col, ax_nzs)
    ax_nzs.legend()

    _plot_site_hists(zscored, col, ax_zsd)

    _robust_xlim(mdata, col, ax_nzs)
    _robust_xlim(zscored, col, ax_zsd)

    ax_zsd.set_ylabel(col)
# NOTE(review): hardcoded absolute output path — not portable.
fig.savefig('/home/oesteban/tmp/mriqc-ml-tests-2/histograms-mit.svg', format='svg', pad_inches=0, dpi=100)

In [ ]:
# Reload the rated ABIDE dataset (same call as earlier — this restores the
# ``rater_1`` column deleted before the concatenation above).
abide_df, pp_cols = read_dataset(abide_csv, op.join(data_path, 'ABIDE_QC_all.csv'), rate_label='rater_1')

In [ ]:
# Partition ABIDE by quality rating: 0 = accept, 1 = exclude.
accept = abide_df.loc[abide_df.rater_1 == 0]
exclude = abide_df.loc[abide_df.rater_1 == 1]

In [ ]:
mit_df = pd.read_csv(mit_csv, index_col=False, dtype={'subject_id': object})

# For each IQM, overwrite the whole MIT column with the median value of the
# ABIDE "accept" group and write one CSV per IQM — these files are later
# fed to the classifier to probe its sensitivity to each single feature.
means = {}
for i, col in enumerate(sorted(colnames)):
    means[col] = np.median(accept[[col]].values)
    mit_copy = mit_df.copy()
    mit_copy[col] = means[col]  # scalar broadcast replaces the whole column

    mit_copy.to_csv('/home/oesteban/tmp/mriqc-ml-tests-2/mit_t1_%s.csv' % col, index=False)

    bad_m = np.median(exclude[[col]].values)
    # Fix: select the column as a Series (``accept[col]``) so ``.std()``
    # returns a scalar — formatting the one-element result of
    # ``accept[[col]].std()`` with '%f' is fragile and fails in newer pandas.
    print('%s: %f +- %f :: %f +- %f' % (col, means[col], accept[col].std(), bad_m, exclude[col].std()))

In [ ]:
# Baseline predictions obtained on the unmodified MIT dataset.
pred = pd.read_csv('/home/oesteban/tmp/mriqc-ml-tests-2/predicted_orig.csv', index_col=False)

In [ ]:
# Attach, for every IQM, the predictions obtained when that IQM was replaced
# by the ABIDE-accept median (CSV files written by the substitution cell).
for i, col in enumerate(sorted(colnames)):
    pred[col] = pd.read_csv('/home/oesteban/tmp/mriqc-ml-tests-2/predicted_mit_t1_%s.csv' % col).prediction.values

In [ ]:
# Persist the combined per-IQM sensitivity table.
pred.to_csv('/home/oesteban/tmp/mriqc-ml-tests-2/predictions_wrt_iqms.csv', index=False)

In [ ]:
# Summary statistics of predictions across all single-IQM perturbations.
pred.describe()

In [ ]: