Plotting age distributions with respect to genotype groups


In [68]:
%matplotlib inline
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt

# Load the covariates table; the 'apo' column holds the genotype group code
# (0..4) and 'age' the value whose distribution is plotted.
data = pd.read_excel('/home/grg/spm/data/covariates.xls')

# Legend labels, listed in the order of the 'apo' group codes 0..4.
group_labels = ['apoe23', 'apoe24', 'apoe33', 'apoe34', 'apoe44']

for i, label in enumerate(group_labels):
    x = data.loc[data['apo'] == i, 'age'].values
    # Semi-transparent bars keep overlapping groups visible; the label ties the
    # legend entry to this histogram instead of relying on call order.
    plt.hist(x, bins=20, alpha=0.5, label=label)
    # Shapiro-Wilk normality test. A single pre-formatted string prints the
    # same line on Python 2 and Python 3.
    w_stat, p_value = stats.shapiro(x)
    print('%d W:%.4f p:%.4f - %d subjects between %d and %d'
          % (i, w_stat, p_value, len(x), int(min(x)), int(max(x))))

plt.legend()
plt.show()


0 W:0.9705 p:0.0222 - 102 subjects between 46 and 75
1 W:0.9539 p:0.0769 - 44 subjects between 45 and 71
2 W:0.9613 p:0.0005 - 143 subjects between 45 and 75
3 W:0.9640 p:0.0004 - 160 subjects between 45 and 74
4 W:0.9306 p:0.0013 - 65 subjects between 45 and 67

For two of the 5 groups (groups 2 and 3), the Shapiro test p-value is lower than 1e-3, which means that the age distributions of these two groups cannot be considered normal. (Theoretically, though, none of them is.)

Matching pairs using nearest neighbours

The matching algorithm:


In [233]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

def get_matching_pairs(treated_df, non_treated_df, scaler=True):
    """Pick, for every treated row, its nearest row in the non-treated pool.

    Parameters
    ----------
    treated_df : pandas.DataFrame
        Covariates of the reference group.
    non_treated_df : pandas.DataFrame
        Covariates of the candidate pool, with the same columns as
        ``treated_df``.
    scaler : bool, optional
        If true (default), both groups are standardised with a
        ``StandardScaler`` fitted on the treated group before the
        neighbour search.

    Returns
    -------
    pandas.DataFrame
        One row of ``non_treated_df`` per row of ``treated_df``, in the order
        of ``treated_df``. Each treated row is queried independently, so the
        same candidate can be returned several times.
    """
    treated_x = treated_df.values
    non_treated_x = non_treated_df.values
    if scaler:
        # Separate name so the boolean flag is not rebound to the scaler object.
        standardizer = StandardScaler().fit(treated_x)
        treated_x = standardizer.transform(treated_x)
        non_treated_x = standardizer.transform(non_treated_x)

    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(non_treated_x)
    _, indices = nbrs.kneighbors(treated_x)
    # kneighbors yields an (n_treated, 1) array of row *positions* in the pool,
    # so the lookup must be positional (.iloc). The former .ix/.irow round trip
    # only worked with an integer index and both accessors are gone from pandas.
    return non_treated_df.iloc[indices.ravel()]

Loading data


In [234]:
# Read the covariates and keep the subject id, the genotype group code and the
# three covariates used for matching.
df = pd.read_excel('/home/grg/spm/data/covariates.xls')
df = df[['subject','apo','age','gender','educyears']]

# One table per 'apo' code (0..4), indexed by subject and stripped of the
# 'subject' and 'apo' columns.
groups = [df[df['apo'] == i].set_index('subject').drop('apo', axis=1)
          for i in range(5)]

Matching the groups


In [235]:
# Group 4 is the reference; groups 0..3 are each matched against it, without
# standardising the covariates.
treated_df = groups[4]
matched_df = []
for i in range(4):
    matched_df.append(get_matching_pairs(treated_df, groups[i], scaler=False))


/home/grg/jupyter/lib/python2.7/site-packages/ipykernel/__main__.py:18: FutureWarning: irow(i) is deprecated. Please use .iloc[i]

Plotting the data to check that the groups now match


In [236]:
fig, ax = plt.subplots(figsize=(6,6))

# Matched groups 0..3 first, then the reference group 4, so that the printed
# ids and the legend entries follow the 'apo' codes.
group_labels = ['apoe23', 'apoe24', 'apoe33', 'apoe34', 'apoe44']
ages = [matched_df[i]['age'] for i in range(4)] + [treated_df['age']]

for i, (label, x) in enumerate(zip(group_labels, ages)):
    # Semi-transparent bars keep overlapping groups visible; the label ties the
    # legend entry to this histogram instead of relying on call order.
    ax.hist(x, bins=20, alpha=0.5, label=label)
    # Shapiro-Wilk normality test. A single pre-formatted string prints the
    # same line on Python 2 and Python 3.
    w_stat, p_value = stats.shapiro(x)
    print('%d W:%.4f p:%.4f - %d subjects between %d and %d'
          % (i, w_stat, p_value, len(x), int(min(x)), int(max(x))))

ax.legend()


0 W:0.9165 p:0.0003 - 65 subjects between 46 and 66
1 W:0.9271 p:0.0009 - 65 subjects between 45 and 67
2 W:0.9295 p:0.0012 - 65 subjects between 45 and 68
3 W:0.9163 p:0.0003 - 65 subjects between 46 and 68
4 W:0.9306 p:0.0013 - 65 subjects between 45 and 67
Out[236]:
<matplotlib.legend.Legend at 0x7f2b248685d0>

Matching groups using the linear assignment method


In [1]:
import pandas as pd
# Read the covariates and keep the subject id, the genotype group code and the
# three covariates used for matching.
df = pd.read_excel('/home/grg/spm/data/covariates.xls')
df = df[['subject','apo','age','gender','educyears']]

# One table per 'apo' code (0..4), indexed by subject and stripped of the
# 'subject' and 'apo' columns.
groups = [df[df['apo'] == i].set_index('subject').drop('apo', axis=1)
          for i in range(5)]

In [2]:
# Rebuild the per-genotype covariate tables: one per 'apo' code (0..4), indexed
# by subject and without the 'apo' and 'subject' columns.
groups = []
for i in range(5):
    subset = df[df['apo'] == i]
    groups.append(subset.set_index('subject').drop('apo', axis=1))

# Group 4 is the reference group, group 0 a first candidate pool.
treated_df, non_treated_df = groups[4], groups[0]

In [3]:
from scipy.spatial.distance import cdist
from scipy import optimize

def get_matching_pairs(treated_df, non_treated_df):
    """Match treated rows to distinct non-treated rows at minimal total distance.

    The pairwise distances (``cdist`` default metric) between the rows of
    ``treated_df`` and ``non_treated_df`` form the cost matrix of a linear
    assignment problem. The rows of ``non_treated_df`` chosen by the optimal
    assignment are returned, ordered by the treated row they are paired with.
    """
    pairwise = cdist(treated_df.values, non_treated_df.values)
    assignment = optimize.linear_sum_assignment(pairwise)
    # assignment is (row positions, column positions); the columns are the
    # positions of the selected rows in non_treated_df.
    chosen = assignment[1]
    return non_treated_df.iloc[chosen]

In [72]:
# Group 4 is the reference; groups 0..3 are each matched against it.
treated_df = groups[4]
matched_df = []
for i in range(4):
    matched_df.append(get_matching_pairs(treated_df, groups[i]))

Plotting the data to check that the groups now match


In [73]:
fig, ax = plt.subplots(figsize=(6,6))

# Matched groups 0..3 first, then the reference group 4, so that the printed
# ids and the legend entries follow the 'apo' codes.
group_labels = ['apoe23', 'apoe24', 'apoe33', 'apoe34', 'apoe44']
ages = [matched_df[i]['age'] for i in range(4)] + [treated_df['age']]

for i, (label, x) in enumerate(zip(group_labels, ages)):
    # Semi-transparent bars keep overlapping groups visible; the label ties the
    # legend entry to this histogram instead of relying on call order.
    ax.hist(x, bins=20, alpha=0.5, label=label)
    # Shapiro-Wilk normality test. A single pre-formatted string prints the
    # same line on Python 2 and Python 3.
    w_stat, p_value = stats.shapiro(x)
    print('%d W:%.4f p:%.4f - %d subjects between %d and %d'
          % (i, w_stat, p_value, len(x), int(min(x)), int(max(x))))

ax.legend()


0 W:0.9200 p:0.0004 - 65 subjects between 46 and 66
1 W:0.9539 p:0.0769 - 44 subjects between 45 and 71
2 W:0.9333 p:0.0017 - 65 subjects between 45 and 68
3 W:0.9411 p:0.0039 - 65 subjects between 45 and 68
4 W:0.9306 p:0.0013 - 65 subjects between 45 and 67
Out[73]:
<matplotlib.legend.Legend at 0x7f180013e550>

In [6]:
import json
# Subject ids of each matched group (0..3), followed by those of the reference
# group (4), in that order.
groups_index = [each.index.tolist() for each in matched_df]
groups_index.append(groups[4].index.tolist())
# The context manager closes (and therefore flushes) the file; the bare
# open() used before left the handle open.
with open('/tmp/groups.json', 'w') as fp:
    json.dump(groups_index, fp)

Assessing the effect from the matching

We perform a two-sample t-test between each group and the target group, before and after applying the matching. As the dataset is composed of 3 variables (age, gender, education), this returns 3 t values and 3 p-values for each comparison.


In [76]:
from scipy.stats import ttest_ind
# Compare every group with the reference group, before and after matching.
# print() with one pre-formatted string behaves the same on Python 2 and 3.
for i in range(4):
    print('=== Group %s ===' % i)
    # 2-D inputs are tested column by column (default axis=0), giving one
    # t value and one p-value per covariate.
    tval_bef, pval_bef = ttest_ind(groups[i].values, treated_df.values)
    tval_aft, pval_aft = ttest_ind(matched_df[i].values, treated_df.values)
    print('p-values before matching: %s - p-values after matching: %s'
          % (pval_bef, pval_aft))


=== Group 0 ===
p-values before matching: [ 0.10239774  0.96574159  0.17485346] - p-values after matching: [ 0.8817942   0.85652515  0.79697805]
=== Group 1 ===
p-values before matching: [ 0.31275858  0.5163133   0.89363599] - p-values after matching: [ 0.31275858  0.5163133   0.89363599]
=== Group 2 ===
p-values before matching: [  7.87194137e-06   7.85788040e-01   7.59002122e-01] - p-values after matching: [ 0.97140587  0.35516616  0.74364497]
=== Group 3 ===
p-values before matching: [  1.01369434e-05   2.02634075e-01   5.06903917e-01] - p-values after matching: [ 0.97517904  0.21707092  0.69159306]

In [14]:
# Reload the full covariates table and list the subject ids of every row whose
# 'apo' code is not 1.
df = pd.read_excel('/home/grg/spm/data/covariates.xls')
keep = df['apo'] != 1
list(df.loc[keep, 'subject'].values)


Out[14]:
[10070,
 10102,
 10108,
 10235,
 10365,
 10419,
 10463,
 10530,
 10551,
 10563,
 10576,
 10630,
 10668,
 10692,
 10693,
 10703,
 10725,
 10737,
 10756,
 10809,
 11045,
 11047,
 11048,
 11133,
 11180,
 11225,
 11262,
 11305,
 11351,
 11360,
 11387,
 11407,
 11414,
 11416,
 11550,
 11610,
 11614,
 11638,
 11658,
 11691,
 11721,
 11737,
 11768,
 11798,
 11803,
 11858,
 11902,
 11941,
 11975,
 11979,
 12032,
 12067,
 12079,
 12121,
 12140,
 12174,
 12244,
 12252,
 12323,
 12324,
 12331,
 12399,
 12425,
 12445,
 12479,
 12484,
 12493,
 12516,
 12637,
 12659,
 12730,
 12783,
 12787,
 12841,
 12920,
 12970,
 12976,
 13008,
 13035,
 13075,
 13090,
 13118,
 13138,
 13235,
 21092,
 44046,
 44205,
 55166,
 55529,
 55538,
 55854,
 66089,
 66125,
 66159,
 66239,
 66240,
 66270,
 66312,
 77068,
 77093,
 77094,
 77195,
 10013,
 10023,
 10024,
 10028,
 10032,
 10036,
 10038,
 10040,
 10042,
 10049,
 10052,
 10053,
 10056,
 10081,
 10117,
 10118,
 10151,
 10156,
 10158,
 10166,
 10170,
 10178,
 10182,
 10199,
 10200,
 10213,
 10217,
 10239,
 10242,
 10245,
 10248,
 10251,
 10253,
 10263,
 10265,
 10308,
 10313,
 10322,
 10325,
 10326,
 10329,
 10330,
 10354,
 10370,
 10385,
 10393,
 10417,
 10426,
 10528,
 10577,
 10678,
 10682,
 10697,
 10724,
 10821,
 10841,
 10850,
 10855,
 10858,
 10894,
 10900,
 10942,
 10946,
 11030,
 11063,
 11136,
 11137,
 11139,
 11152,
 11184,
 11257,
 11264,
 11426,
 11478,
 11590,
 11620,
 11679,
 11686,
 11830,
 12138,
 12239,
 12279,
 12327,
 12624,
 12699,
 12767,
 12861,
 12904,
 13059,
 13061,
 13144,
 13169,
 13188,
 13214,
 13215,
 13217,
 13238,
 13242,
 13312,
 13367,
 21042,
 21057,
 21073,
 44068,
 44091,
 44094,
 44119,
 44141,
 44151,
 44723,
 55057,
 55152,
 55216,
 55297,
 55323,
 55370,
 55469,
 55778,
 66019,
 66030,
 66042,
 66048,
 66050,
 66128,
 66133,
 66141,
 66169,
 66183,
 66264,
 66267,
 66268,
 66293,
 66335,
 66361,
 66498,
 77024,
 77037,
 77076,
 77151,
 77175,
 77188,
 77252,
 77263,
 10016,
 10025,
 10026,
 10029,
 10034,
 10035,
 10051,
 10099,
 10106,
 10134,
 10160,
 10162,
 10225,
 10226,
 10259,
 10317,
 10319,
 10324,
 10333,
 10338,
 10346,
 10361,
 10362,
 10396,
 10397,
 10416,
 10418,
 10433,
 10436,
 10450,
 10453,
 10461,
 10482,
 10493,
 10504,
 10522,
 10538,
 10541,
 10550,
 10593,
 10634,
 10657,
 10696,
 10735,
 10741,
 10744,
 10750,
 10778,
 10787,
 10794,
 10811,
 10822,
 10846,
 10870,
 10881,
 10901,
 10944,
 10972,
 10988,
 11007,
 11012,
 11019,
 11054,
 11092,
 11127,
 11156,
 11196,
 11201,
 11219,
 11222,
 11245,
 11247,
 11254,
 11292,
 11323,
 11327,
 11355,
 11383,
 11415,
 11458,
 11461,
 11474,
 11481,
 11514,
 11552,
 11583,
 11630,
 11641,
 11747,
 11778,
 11850,
 11872,
 11937,
 11943,
 12056,
 12186,
 12269,
 12296,
 12379,
 12409,
 12548,
 12715,
 12724,
 12771,
 12778,
 12810,
 12812,
 12823,
 12858,
 12874,
 12878,
 12941,
 12947,
 12995,
 13019,
 13043,
 13049,
 13063,
 13105,
 13127,
 13151,
 13236,
 13268,
 13293,
 13309,
 13322,
 13417,
 21012,
 21051,
 21056,
 21084,
 44004,
 44043,
 44057,
 44147,
 44491,
 44632,
 55200,
 55483,
 55667,
 55708,
 55734,
 66017,
 66026,
 66039,
 66131,
 66162,
 66368,
 77027,
 77034,
 77040,
 77044,
 77056,
 77096,
 77117,
 77130,
 77140,
 77192,
 77217,
 77254,
 10065,
 10071,
 10096,
 10144,
 10180,
 10212,
 10298,
 10334,
 10367,
 10496,
 10515,
 10613,
 10701,
 11015,
 11042,
 11114,
 11213,
 11291,
 11540,
 11561,
 11592,
 11593,
 11646,
 11656,
 11687,
 11711,
 11796,
 11829,
 11874,
 11882,
 11939,
 12125,
 12133,
 12172,
 12271,
 12304,
 12308,
 12356,
 12391,
 12483,
 12502,
 12511,
 12582,
 12636,
 12711,
 12765,
 12785,
 12893,
 12963,
 12975,
 13083,
 13244,
 13306,
 13345,
 21039,
 21130,
 44229,
 44660,
 55388,
 55630,
 66085,
 66257,
 66309,
 77047,
 77179]

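The p-values increase overall after the matching: there is less evidence of a difference in means between each group and the target group after matching than before, which suggests that the matched samples are more likely to follow the same distribution. (Group 1 is unchanged because all of its 44 subjects are kept.)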