In [1]:
%matplotlib inline
import toolshed as ts
import itertools as it
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", context="talk")
colors = sns.color_palette("Set1", 8)
pool = ts.pool()
import crystal
from aclust import mclust # to create correlated clusters
We need a generator that reads data and generates features to sent to aclust
In [2]:
def feature_gen(method, samples, logit=True):
fname = "../norm.{method}.txt".format(method=method)
for i, d in enumerate(ts.reader(fname, header=True)):
chrom, pos = d['probe'].split(":")
if chrom[-1] in "XY": break
values = np.array([float(d[s]) for s in samples])
yield crystal.Feature(chrom, int(pos), crystal.logit(values) if logit else values)
Now we need to define our model and read in our covariates. First, we want to test 6 vs 6 (male vs female).
In [3]:
covariates_file = "d0-450K-12.covariates.csv"
np.random.seed(42)
covs = pd.read_csv(covariates_file, index_col=0)
Our input data is a methylation matrix (rows of probes and columns of samples). Here, we have encoded the position into the row names, but you can also add these on the fly as you stream over the data.
In [4]:
# a function to restart the cluster generation each time.
def get_cluster_iter(samples, method, min_len=3, logit=True):
return (c for c in mclust(feature_gen(method, samples, logit=logit),
max_dist=300, max_skip=0) if len(c) >= min_len)
In [5]:
clusters = {}
norm_methods = ('raw', 'quantile', 'swan', 'funnorm')
for nmethod in norm_methods:
print nmethod
clusters[nmethod] = list(get_cluster_iter(covs.index, nmethod, 1))
Now we will evaluate performance using the continuous variable 'age' by using a discovery and validation cohort randomly drawn from the full dataset. We will create the plots as above using sites that replicate in the validation as 'true' positives and sites that do not as 'false' positives. There are some issues with that interpretation in the context of an ROC analysis, but it is an informative metric because we often are interested in sites that will replicate across cohorts.
First, we model the original clusters and find sites related to age:
In [7]:
formula_base = "methylation ~ age + gender + CpG"
coef = 'age'
labels = ['gee', 'mixed-model']
age_modeled = {}
for method in norm_methods:
print method
age_modeled[method] = [list(crystal.model_clusters(clusters[method], covs, formula, coef, fn, pool=pool, **kwargs))
for fn, formula, kwargs in [
(crystal.gee_cluster, formula_base, {}),
(crystal.mixed_model_cluster, formula_base, {})
]]
Then, we write those regions to a file; we will want to replicate sites with p < 1e-3.
In [8]:
CUTOFF = 1e-4
from crystal.evaluate import write_modeled_regions
region_files = []
for i, label in enumerate(labels):
for method in norm_methods:
out_fh = open('/tmp/norm-%s.%s.regions.bed' % (label, method), 'w')
modeled = age_modeled[method][i]
write_modeled_regions(modeled, CUTOFF, out_fh)
region_files.append(out_fh.name)
out_fh.close()
In [9]:
rcovariates_file = "v0-450K-12.covariates.csv"
rcovs = pd.read_csv(rcovariates_file, index_col=0)
replication_clusters = {}
for method in norm_methods:
print method
replication_clusters[method] = list(get_cluster_iter(rcovs.index, method, 1))
In [10]:
age_replicated = {}
for method in norm_methods:
print method
age_replicated[method] = [list(crystal.model_clusters(replication_clusters[method],
rcovs, formula, coef, fn,
pool=pool, **kwargs))
for fn, formula, kwargs in [
(crystal.gee_cluster, formula_base, {}),
(crystal.mixed_model_cluster, formula_base, {})
]]
In [14]:
from crystal.evaluate import evaluate, cluster_bediter
fig, ax = plt.subplots(1)
for im, method in enumerate(norm_methods):
# first do a simple transform to make use of the regions function
rep_results = [list(cluster_bediter(r)) for r in age_replicated[method]]
region_files = ["/tmp/norm-%s.%s.regions.bed" % (label, method) for label in labels]
for j, (label, rep, regions) in enumerate(zip(labels, rep_results, region_files)):
evaluate(rep, regions, label="%s - %s" % (method, label), ax=ax, ls='-.' if j == 0 else '-',
c=colors[im], marker='', lw=2.5)
ax.plot([0, 1], [0, 1], 'k--', zorder=-1)
ax.set_xlim(0, 0.2)
ax.set_ylim(0, 0.2)
ax.legend(loc='best')
plt.savefig('../figures/figure4.eps')
In [11]: