In [68]:
%matplotlib inline
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
# Age distribution of every 'apo' group (codes 0..4), with a Shapiro-Wilk
# normality test printed per group. Written so that it runs unchanged on both
# Python 2 and Python 3 kernels and prints the same text on each.
data = pd.read_excel('/home/grg/spm/data/covariates.xls')
for i in range(5):
    # .loc selects rows and column in one step instead of chained indexing.
    x = data.loc[data['apo'] == i, 'age'].values
    plt.hist(x, bins=20)
    # Unpack explicitly so the formatting below does not depend on the exact
    # container type returned by shapiro.
    w_stat, p_value = stats.shapiro(x)
    # A single pre-formatted string prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('%s W:%.4f p:%.4f - %s subjects between %s and %s'
          % (i, w_stat, p_value, len(x), int(min(x)), int(max(x))))
# Legend labels follow the order in which the histograms were drawn.
plt.legend(['apoe23', 'apoe24', 'apoe33', 'apoe34', 'apoe44'])
plt.show()
For two of the five groups, the Shapiro-Wilk p-value is below 1e-3, which means that the age distributions of these two groups cannot be considered normal. (Theoretically, though, none of them is.)
In [233]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
def get_matching_pairs(treated_df, non_treated_df, scaler=True):
    """Return, for every treated row, its nearest non-treated row.

    Each treated row is matched independently to the closest row of
    ``non_treated_df`` (1-nearest-neighbour search), so the same
    non-treated row may be returned for several treated rows.

    Parameters
    ----------
    treated_df : pandas.DataFrame
        Rows to find a match for.
    non_treated_df : pandas.DataFrame
        Pool of candidate rows. Assumed to have the same columns, in the
        same order, as ``treated_df`` -- the raw ``.values`` arrays are
        compared directly.
    scaler : bool, optional
        When true, both arrays are standardised with a ``StandardScaler``
        fitted on the treated rows only before distances are computed.

    Returns
    -------
    pandas.DataFrame
        Rows of ``non_treated_df``, one per treated row, in the order of
        the treated rows.
    """
    treated_x = treated_df.values
    non_treated_x = non_treated_df.values

    if scaler:
        # Keep the boolean flag and the fitted transformer under separate
        # names instead of rebinding the parameter.
        standardizer = StandardScaler()
        standardizer.fit(treated_x)
        treated_x = standardizer.transform(treated_x)
        non_treated_x = standardizer.transform(non_treated_x)

    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(non_treated_x)
    _, indices = nbrs.kneighbors(treated_x)
    # One neighbour per query row: flatten the single-column result to 1-D.
    indices = indices.reshape(indices.shape[0])

    # The neighbour indices are row positions in non_treated_x, so select
    # positionally. The previous .ix / .irow round trip relied on removed
    # pandas APIs and mixed label-based with position-based lookup.
    return non_treated_df.iloc[indices]
In [234]:
# Load the covariate table and keep only the columns used for matching.
df = pd.read_excel('/home/grg/spm/data/covariates.xls')
df = df[['subject','apo','age','gender','educyears']]

# Build one frame per 'apo' code (0..4), indexed by subject, in which only
# the covariate columns (age, gender, educyears) remain.
groups = []
for i in range(5):
    groups.append(df[df['apo'] == i].set_index('subject').drop(['apo'], axis=1))
In [235]:
# Group 4 is the reference sample that every other group is matched against.
treated_df = groups[4]
# Match each of groups 0..3 to the reference on the unscaled covariates.
matched_df = [get_matching_pairs(treated_df, control_df, scaler=False)
              for control_df in groups[:4]]
In [236]:
# Age histograms and Shapiro-Wilk tests for the four matched groups (0..3)
# followed by the reference group (4). Runs on Python 2 and Python 3 alike.
fig, ax = plt.subplots(figsize=(6,6))
for i, group_df in enumerate(list(matched_df) + [treated_df]):
    x = group_df['age']
    ax.hist(x, bins=20)
    w_stat, p_value = stats.shapiro(x)
    # A single pre-formatted string prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('%s W:%.4f p:%.4f - %s subjects between %s and %s'
          % (i, w_stat, p_value, len(x), int(min(x)), int(max(x))))
# Legend labels follow the order in which the histograms were drawn.
ax.legend(['apoe23', 'apoe24', 'apoe33', 'apoe34', 'apoe44'])
Out[236]:
In [1]:
import pandas as pd
# Load the covariate table and keep only the columns used for matching.
df = pd.read_excel('/home/grg/spm/data/covariates.xls')
df = df[['subject','apo','age','gender','educyears']]

# Build one frame per 'apo' code (0..4), indexed by subject, in which only
# the covariate columns (age, gender, educyears) remain.
groups = []
for i in range(5):
    groups.append(df[df['apo'] == i].set_index('subject').drop(['apo'], axis=1))
In [2]:
# Rebuild the per-genotype frames from df: one frame per 'apo' code (0..4),
# indexed by subject, with the 'apo' and 'subject' columns removed.
groups = []
for i in range(5):
    groups.append(df[df['apo'] == i].set_index('subject').drop(['apo'], axis=1))

# Group 4 is the reference sample; group 0 is kept at hand as a control pool.
treated_df = groups[4]
non_treated_df = groups[0]
In [3]:
from scipy.spatial.distance import cdist
from scipy import optimize
def get_matching_pairs(treated_df, non_treated_df):
    """Match treated rows to non-treated rows by optimal one-to-one assignment.

    The pairwise distances between the raw values of the two frames
    (``cdist`` with its default metric) form a cost matrix, and
    ``linear_sum_assignment`` picks the pairing with the smallest total cost.

    Parameters
    ----------
    treated_df : pandas.DataFrame
        Rows to find a match for.
    non_treated_df : pandas.DataFrame
        Pool of candidate rows; must have as many columns as ``treated_df``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``non_treated_df`` selected by the assignment, ordered
        by the treated row they were assigned to.
    """
    pairwise_cost = cdist(treated_df.values, non_treated_df.values)
    # Only the column side of the assignment is needed: it holds the
    # positions of the chosen non-treated rows.
    _, control_positions = optimize.linear_sum_assignment(pairwise_cost)
    return non_treated_df.iloc[control_positions]
In [72]:
# Group 4 is the reference sample that every other group is matched against.
treated_df = groups[4]
# One matched sample per remaining group (0..3), in group order.
matched_df = [get_matching_pairs(treated_df, control_df)
              for control_df in groups[:4]]
In [73]:
# Age histograms and Shapiro-Wilk tests for the four matched groups (0..3)
# followed by the reference group (4). Runs on Python 2 and Python 3 alike.
fig, ax = plt.subplots(figsize=(6,6))
for i, group_df in enumerate(list(matched_df) + [treated_df]):
    x = group_df['age']
    ax.hist(x, bins=20)
    w_stat, p_value = stats.shapiro(x)
    # A single pre-formatted string prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('%s W:%.4f p:%.4f - %s subjects between %s and %s'
          % (i, w_stat, p_value, len(x), int(min(x)), int(max(x))))
# Legend labels follow the order in which the histograms were drawn.
ax.legend(['apoe23', 'apoe24', 'apoe33', 'apoe34', 'apoe44'])
Out[73]:
In [6]:
import json
# Subject identifiers of each matched group (0..3), followed by those of the
# reference group (4), saved as a JSON list of lists.
groups_index = [each.index.tolist() for each in matched_df]
groups_index.append(groups[4].index.tolist())
# The context manager guarantees the file is flushed and closed, even if
# serialisation fails part-way through.
with open('/tmp/groups.json','w') as fp:
    json.dump(groups_index, fp)
In [76]:
from scipy.stats import ttest_ind
# Compare every group (0..3) with the reference group before and after
# matching. The inputs are 2-D, so each call yields one p-value per
# covariate column. Runs on Python 2 and Python 3 alike.
for i in range(4):
    print('=== Group %s ===' % i)
    tval_bef, pval_bef = ttest_ind(groups[i].values, treated_df.values)
    tval_aft, pval_aft = ttest_ind(matched_df[i].values, treated_df.values)
    print('p-values before matching: %s - p-values after matching: %s' % (pval_bef, pval_aft))
In [14]:
# Reload the full covariate table (all columns).
df = pd.read_excel('/home/grg/spm/data/covariates.xls')
# Subject identifiers of every row whose 'apo' code is not 1; left as the
# last expression so the notebook displays the list.
list(df.loc[df['apo'] != 1, 'subject'].values)
Out[14]:
The p-values increase overall after matching, which suggests that the matched samples are more likely to follow the same distribution as the reference group than the unmatched samples were.