Predict them ages

Warning: this will download and generate a lot of data wherever wk_dir points, so aim it at a disk with room to spare.


In [308]:
# Import the tools we need
import os
import subprocess
import numpy as np
import pandas as pd
import statsmodels.api as sm
from pymatbridge import Matlab
from matplotlib import pyplot as plt
import scipy.cluster.hierarchy as clh
from sklearn import linear_model, cross_validation

In [2]:
# Set some paths
wk_dir = '/data1/age'
dl_dir = os.path.join(wk_dir, 'data')
csv_path = os.path.join(dl_dir, 'aging_full_model.csv')
data_url = 'http://downloads.figshare.com/article/public/1241650'

In [3]:
# Set up the scene
if not os.path.exists(wk_dir):
    print("I'll create {}".format(wk_dir))
    os.makedirs(wk_dir)
if not os.path.exists(dl_dir):
    print("I'll create {}".format(dl_dir))
    os.makedirs(dl_dir)

In [181]:
# Get the data
get_proc = subprocess.Popen(['wget', '-nc', data_url, '-P', dl_dir],
                            stderr=subprocess.STDOUT)
get_proc.wait()
get_status = get_proc.poll()
if get_status != 0:
    print("Trying to download {} didn't work".format(data_url))
else:
    zip_proc = subprocess.Popen(['unzip',
                                 '{}/1241650'.format(dl_dir),
                                 '-d', dl_dir], stderr=subprocess.STDOUT)
    zip_proc.wait()
    zip_status = zip_proc.poll()
    if zip_status != 0:
        print("Trying to unzip {}/1241650 didn't work".format(dl_dir))

In [85]:
pheno = pd.read_csv(csv_path)
age = pheno['age'].values
# The subject ID column in this CSV is literally named ' '
subs = pheno[' '].values
# Strip the annoying trailing whitespace
subs = [x.strip(' ') for x in subs]

In [5]:
# Bring out the Matlab Master to deal with all those .mat files
mlab = Matlab()
mlab.start()


Starting MATLAB on ZMQ socket ipc:///tmp/pymatbridge
Send 'exit' command to kill the server
...MATLAB started and connected!
Out[5]:
True

In [11]:
# Yes, yes, this is all notoriously slow because we have to wake up the MATLAB giant for each command
for sub in subs:
    read_this = os.path.join(dl_dir, 'correlation_{}_roi.mat'.format(sub))
    comm = "tmp = load('{}');".format(read_this)
    out = mlab.run_code(comm)
    out = mlab.run_code('R = tmp.mat_r;')
    # Save the matrix out as csv
    out = mlab.run_code("csvwrite('{}/out_{}.csv', R);".format(dl_dir, sub))
    print('done with {}'.format(sub))
print('Done writing')


done with mni_622
done with mni_624
done with mni_667
done with mni_644
done with mni_649
done with mni_610
done with mni_623
done with mni_603
done with mni_646
done with mni_633
done with mni_638
done with mni_611
done with mni_654
done with mni_592
done with mni_637
done with mni_635
done with mni_647
done with mni_626
done with mni_651
done with mni_631
done with mni_629
done with mni_613
done with mni_627
done with mni_668
done with mni_616
done with mni_620
done with mni_621
done with mni_612
done with mni_640
done with mni_657
done with mni_645
done with mni_655
done with mni_666
done with mni_658
done with mni_641
done with mni_615
done with mni_642
done with mni_650
done with mni_604
done with mni_634
done with mni_663
done with mni_628
done with mni_619
done with mni_605
done with mni_625
done with mni_614
done with mni_639
done with mni_660
done with mni_652
done with mni_618
done with mni_653
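
If waking MATLAB for every file gets old, scipy.io.loadmat can usually read these .mat files directly from Python. A minimal sketch, assuming the files are not saved in the v7.3/HDF5 format (which loadmat cannot read):

import scipy.io as sio

for sub in subs:
    mat_path = os.path.join(dl_dir, 'correlation_{}_roi.mat'.format(sub))
    # loadmat returns a dict keyed by the MATLAB variable names
    R = sio.loadmat(mat_path)['mat_r']
    np.savetxt('{}/out_{}.csv'.format(dl_dir, sub), R, delimiter=',')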

In [51]:
# Now let's read this back in and deal with sensible data
n_subs = len(subs)
for idx, sub in enumerate(subs):
    df = pd.read_csv('{}/out_{}.csv'.format(dl_dir, sub), header=None)
    ts = np.array(df.values)
    # Do MAD things: robust z-score, i.e. centre on the median and
    # divide by the scaled median absolute deviation
    mad = np.median(np.absolute(ts - np.median(ts)))
    ts = (ts - np.median(ts)) / (1.4785 * mad)
    if idx == 0:
        conn = np.zeros((ts.shape[0], n_subs))
    conn[:, idx] = ts[:, 0]
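
As a quick sanity check of that robust z-score, the same formula on a toy vector with one outlier shows why the median/MAD pair beats mean/std here (the 1.4785 constant is kept from the original; 1.4826 is the usual Gaussian consistency factor):

toy = np.array([1., 2., 3., 4., 100.])
mad = np.median(np.abs(toy - np.median(toy)))
# the outlier sticks out, the rest stay near zero
print((toy - np.median(toy)) / (1.4785 * mad))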

In [120]:
# Cross-validated clustering
scale = 10
cv = cross_validation.LeaveOneOut(n_subs)
for train, test in cv:
    train_conn = conn[:, train].T
    test_conn = conn[:, test].T
    train_age = age[train]
    test_age = age[test]

    y = train_conn
    x = sm.add_constant(train_age)
    N = len(train)
    K = 2
    c = np.ones((K, 1))
    # Fit OLS on every connection at once. statsmodels' OLS wants a single
    # outcome, so solve the least-squares problem directly instead.
    beta = np.linalg.lstsq(x, y)[0]
    e = y - np.dot(x, beta)
    std_e = np.sqrt(np.sum(e**2, 0) / (N - K))
    # t-statistic for the contrast c at every connection
    d = np.sqrt(np.dot(np.dot(c.T, np.linalg.inv(np.dot(x.T, x))), c))
    ttest = np.dot(c.T, beta) / (std_e * d)
    # Turn the vectorized t-stats back into a symmetric matrix
    N = int(np.round((1 + np.sqrt(1 + 8 * ttest.size)) / 2.))
    mat = np.zeros((N, N))
    mat[np.tril(np.ones((N, N), dtype=bool), -1)] = ttest[0]
    mat = mat.T
    mat[np.tril(np.ones((N, N), dtype=bool), -1)] = ttest[0]
    # Hierarchical clustering
    link = clh.linkage(mat, method='ward')
    part = clh.fcluster(link, scale, criterion='maxclust')
    # Make new connectomes: average each training subject's connections
    # within every pair of clusters. NOTE: the original cell stopped
    # mid-assignment here; this is one plausible completion.
    lo_i, lo_j = np.tril_indices(N, -1)  # ROI pair behind each row of conn
    avg_conn = np.zeros((n_subs - 1, scale * (scale - 1) // 2))
    for t_idx, t in enumerate(train):
        p_idx = 0
        for ca in range(1, scale + 1):
            for cb in range(ca + 1, scale + 1):
                pair = ((part[lo_i] == ca) & (part[lo_j] == cb)) | \
                       ((part[lo_i] == cb) & (part[lo_j] == ca))
                avg_conn[t_idx, p_idx] = conn[pair, t].mean()
                p_idx += 1
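
To sanity-check the hand-rolled t-statistics, one can compare a single connection against scipy.stats.linregress. A sketch: linregress tests the slope alone, i.e. a c = [0, 1] contrast, whereas the loop above uses c = [1, 1], so swap the contrast before comparing like for like:

from scipy import stats

col = 0  # any one connection
slope, intercept, r, p, stderr = stats.linregress(train_age, y[:, col])
print(slope / stderr)  # t-stat for the age slope at this connection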

In [292]:
mat = np.zeros((N, N))
mat[np.tril(np.ones((N, N), dtype=bool), -1)] = ttest[0]

In [303]:
N = int(np.round((1 + np.sqrt(1 + 8 * ttest.size)) / 2.))
mat = np.zeros((N, N))
mat[np.tril(np.ones((N, N), dtype=bool), -1)] = ttest[0]
mat = mat.T
mat[np.tril(np.ones((N, N), dtype=bool), -1)] = ttest[0]
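
That N formula inverts the triangular-number relation L = N(N-1)/2 between a vectorized lower triangle of length L and its N x N matrix; here the 289941 connections give N = 762 regions. A quick round-trip check:

for n in (3, 10, 762):
    L = n * (n - 1) // 2
    assert int(np.round((1 + np.sqrt(1 + 8 * L)) / 2.)) == n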

In [310]:
scale = 10
link = clh.linkage(mat, method='ward')
part = clh.fcluster(link, scale, criterion='maxclust')
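
fcluster with criterion='maxclust' cuts the Ward tree into at most scale clusters, labelled 1 through scale. A quick look at how balanced the partition is:

print(np.bincount(part)[1:])  # regions per cluster (labels start at 1)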

In [318]:
avg_conn = np.zeros((n_subs - 1, scale * (scale - 1) // 2))

In [319]:
avg_conn.shape


Out[319]:
(50, 45)

In [316]:
K*(K-1)/2


Out[316]:
1

In [317]:
K


Out[317]:
2

In [239]:
d.shape


Out[239]:
(1, 1)

In [197]:
y.shape


Out[197]:
(50, 289941)

In [171]:
beta


Out[171]:
(2, 289941)

In [181]:
x.shape


Out[181]:
(50, 2)

In [182]:
a = range(1,8)

In [183]:
a


Out[183]:
[1, 2, 3, 4, 5, 6, 7]

In [184]:
a = sm.add_constant(a)


Out[184]:
array([[ 1.,  1.],
       [ 1.,  2.],
       [ 1.,  3.],
       [ 1.,  4.],
       [ 1.,  5.],
       [ 1.,  6.],
       [ 1.,  7.]])

In [186]:
x = sm.add_constant(train_age)

In [187]:
x.shape


Out[187]:
(50, 2)

In [190]:
y.shape


Out[190]:
(50, 289941)
