In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os.path as op
import numpy as np
import pandas as pd
from mriqc.viz.misc import (
raters_variability_plot, plot_abide_stripplots, plot_corrmat, plot_histograms, figure1, plot_batches
)
from pkg_resources import resource_filename as pkgrf
from mriqc.classifier.data import read_dataset, combine_datasets
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sn
sn.set(style="whitegrid")
out_path = op.abspath('results')
In [ ]:
x_path = pkgrf('mriqc', 'data/csv/x_abide.csv')
y_path = pkgrf('mriqc', 'data/csv/y_abide.csv')
ds030_x_path = pkgrf('mriqc', 'data/csv/x_ds030.csv')
ds030_y_path = pkgrf('mriqc', 'data/csv/y_ds030.csv')
In [ ]:
from sklearn.model_selection import PredefinedSplit
naive_x = np.array(list(range(20)))
naive_y = np.zeros(15).tolist() + np.ones(5).tolist()
ps = PredefinedSplit(test_fold=(np.array(naive_y) - 1).tolist())
print(ps.get_n_splits(naive_x, naive_y))
for train_index, test_index in ps.split():
print(train_index)
print(test_index)
In [ ]:
from mriqc.classifier.sklearn._split import RepeatedPartiallyHeldOutKFold
train_y = fulldata[['rater_1']].values.ravel()
groups = (fulldata.site == 'UM').astype(int)
sp = RepeatedPartiallyHeldOutKFold(n_splits=3).split(fulldata.values, y=train_y, groups=groups)
for i, (train_index, test_index) in enumerate(sp):
# print(len(train_index), len(test_index))
y = train_y[test_index]
unique_y, y_inversed = np.unique(y, return_inverse=True)
y_counts = np.bincount(y_inversed)
# print("Counts[%d]:" % i, y_counts)
from sklearn.model_selection import StratifiedKFold
from mriqc.classifier.sklearn._split import RepeatedBalancedKFold
train_y = fulldata[['rater_1']].values.ravel()
sp = RepeatedBalancedKFold(n_splits=10, n_repeats=5).split(fulldata[coi], y=train_y)
for i, (train_index, test_index) in enumerate(sp):
y = train_y[test_index]
unique_y, y_inversed = np.unique(y, return_inverse=True)
y_counts = np.bincount(y_inversed)
print("Counts[%d]:" % i, y_counts)
from mriqc.classifier.sklearn._split import RobustLeavePGroupsOut
sites = fulldata[['site']].values.ravel().tolist()
sitenames = list(set(sites))
groups = [sitenames.index(s) for s in sites]
cv = RobustLeavePGroupsOut(n_groups=1)
cvs = cv.split(fulldata[coi], y=fulldata[['rater_1']].values.ravel().tolist(), groups=groups)
In [ ]:
from mriqc.classifier.sklearn import preprocessing as mcsp
select = mcsp.SiteCorrelationSelector()
selected = select.fit_transform(scaled[features + ['site']].values, None)
try1 = scaled[features + ['site']].columns[select.mask_].ravel().tolist()
In [ ]:
print('Removed %s:\n' % sorted(list(set(features) - set(try1))))
print('Kept %s:\n' % sorted(try1))
In [ ]:
print(try1)
In [ ]:
from mriqc.classifier.sklearn import preprocessing as mcsp
select = mcsp.CustFsNoiseWinnow()
selected = select.fit_transform(fulldata[features].values, fulldata.rater_1.values)
try1 = fulldata[features].columns[select.mask_].ravel().tolist()
select2 = mcsp.CustFsNoiseWinnow()
selected2 = select2.fit_transform(fulldata[features].values, fulldata.rater_1.values)
try2 = fulldata[features].columns[select.mask_].ravel().tolist()
select3 = mcsp.CustFsNoiseWinnow()
selected3 = select3.fit_transform(fulldata[features].values, fulldata.rater_1.values)
try3 = fulldata[features].columns[select.mask_].ravel().tolist()
select4 = mcsp.CustFsNoiseWinnow()
selected4 = select4.fit_transform(fulldata[features].values, fulldata.rater_1.values)
try4 = fulldata[features].columns[select.mask_].ravel().tolist()
intersection = set(try1) & set(try2) & set(try3) & set(try4)
print(list(sorted(intersection)))
print(len(intersection))
print(list(sorted(set(features) - intersection)))
In [ ]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
selmask = fulldata[features].columns.isin(intersection)
selmask = np.zeros(len(features), dtype=bool)
selmask[[0, 1, 2, 10]] = True
X = fulldata[features].values
binarize = LabelBinarizer()
y = binarize.fit_transform(fulldata.site.values.ravel().tolist())
clf = ExtraTreesClassifier(
n_estimators=1000,
criterion='gini',
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features='sqrt',
max_leaf_nodes=None,
min_impurity_split=1e-07,
bootstrap=True,
oob_score=False,
n_jobs=8,
random_state=None,
verbose=0,
warm_start=False,
class_weight='balanced'
)
clf = clf.fit(X_train, y_train)
In [ ]:
print(roc_auc_score(y_test, clf.predict(X_test), average='macro', sample_weight=None))
In [ ]:
clf.feature_importances_
In [ ]:
from sklearn.preprocessing import RobustScaler
from mriqc.classifier.sklearn import preprocessing as mcsp
scaler = mcsp.BatchScaler(RobustScaler(with_scaling=False), columns=coi)
scaled = scaler.fit_transform(fulldata)
fig = plot_batches(scaled[coi], excl_columns=['rater_1'])
In [ ]:
from sklearn.pipeline import Pipeline
# del fulldata['provenance']
clf = Pipeline([
('std', mcsp.BatchScaler(RobustScaler(), groups='site', columns=coi)),
('feature_selection', mcsp.CustFsNoiseWinnow(features=features)),
])
clf.fit(fulldata, fulldata[['rater_1']].values.ravel())
In [ ]:
mit_csv = '/home/oesteban/mriqc/mit-satra/T1-mit.csv'
abide_csv = op.join(data_path, 'runs/20170505_0.9.3-2017-04-23-2ba2c2e40c39/T1w.csv')
In [ ]:
mit_df = pd.read_csv(mit_csv, index_col=False, dtype={'subject_id': object})
abide_df, pp_cols = read_dataset(abide_csv, op.join(data_path, 'ABIDE_QC_all.csv'), rate_label='rater_1')
In [ ]:
mit_df['rater'] = [1] * len(mit_df)
mit_df['site'] = ['MIT'] * len(mit_df)
abide_df['rater'] = [0] * len(abide_df)
del abide_df['rater_1']
mdata = pd.concat([abide_df, mit_df], axis=0)
In [ ]:
zscored = mcsp.BatchRobustScaler(by='site', columns=coi).fit_transform(mdata)
colnames = [col for col in sorted(pp_cols)
if not (col.startswith('spacing') or col.startswith('summary') or col.startswith('size'))]
nrows = len(colnames)
# palette = ['dodgerblue', 'darkorange']
fig = plt.figure(figsize=(18, 2 * nrows))
gs = GridSpec(nrows, 2, hspace=0.2)
for i, col in enumerate(sorted(colnames)):
ax_nzs = plt.subplot(gs[i, 0])
ax_zsd = plt.subplot(gs[i, 1])
sn.distplot(mdata.loc[(mdata.rater == 0), col], norm_hist=False,
label='ABIDE', ax=ax_nzs, color='dodgerblue')
sn.distplot(mdata.loc[(mdata.rater == 1), col], norm_hist=False,
label='MIT', ax=ax_nzs, color='darkorange')
ax_nzs.legend()
sn.distplot(zscored.loc[(zscored.rater == 0), col], norm_hist=False,
label='ABIDE', ax=ax_zsd, color='dodgerblue')
sn.distplot(zscored.loc[(zscored.rater == 1), col], norm_hist=False,
label='MIT', ax=ax_zsd, color='darkorange')
alldata = mdata[[col]].values.ravel().tolist()
minv = np.percentile(alldata, 0.2)
maxv = np.percentile(alldata, 99.8)
ax_nzs.set_xlim([minv, maxv])
alldata = zscored[[col]].values.ravel().tolist()
minv = np.percentile(alldata, 0.2)
maxv = np.percentile(alldata, 99.8)
ax_zsd.set_xlim([minv, maxv])
ax_zsd.set_ylabel(col)
fig.savefig('/home/oesteban/tmp/mriqc-ml-tests-2/histograms-mit.svg', format='svg', pad_inches=0, dpi=100)
In [ ]:
abide_df, pp_cols = read_dataset(abide_csv, op.join(data_path, 'ABIDE_QC_all.csv'), rate_label='rater_1')
In [ ]:
accept = abide_df[abide_df.rater_1 == 0]
exclude = abide_df[abide_df.rater_1 == 1]
In [ ]:
mit_df = pd.read_csv(mit_csv, index_col=False, dtype={'subject_id': object})
means = {}
for i, col in enumerate(sorted(colnames)):
means[col] = np.median(accept[[col]].values)
mit_copy = mit_df.copy()
mit_copy[[col]] = [means[col]] * len(mit_copy)
mit_copy.to_csv('/home/oesteban/tmp/mriqc-ml-tests-2/mit_t1_%s.csv' % col, index=False)
bad_m = np.median(exclude[[col]].values)
print('%s: %f +- %f :: %f +- %f' % (col, means[col], accept[[col]].std(), bad_m, exclude[[col]].std()))
In [ ]:
pred = pd.read_csv('/home/oesteban/tmp/mriqc-ml-tests-2/predicted_orig.csv', index_col=False)
In [ ]:
for i, col in enumerate(sorted(colnames)):
pred[col] = pd.read_csv('/home/oesteban/tmp/mriqc-ml-tests-2/predicted_mit_t1_%s.csv' % col).prediction.values
In [ ]:
pred.to_csv('/home/oesteban/tmp/mriqc-ml-tests-2/predictions_wrt_iqms.csv', index=False)
In [ ]:
pred.describe()
In [ ]: