In [1]:
%matplotlib inline
import toolshed as ts
import itertools as it
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", context="talk")
colors = sns.color_palette("Set1", 8)
pool = ts.pool()
import crystal
import aclust # to create correlated clusters
We need a generator that reads data and generates features to sent to aclust
In [2]:
def feature_gen(fname, samples, logit=True):
for i, d in enumerate(ts.reader(fname, header=True)):
chrom, pos = d['probe'].split(":")
if chrom[-1] in "XY": break
values = np.array([float(d[s]) for s in samples])
yield crystal.Feature(chrom, int(pos), crystal.logit(values) if logit else values)
Now we need to define our model and read in our covariates. First, we want to test 6 vs 6 (male vs female).
In [3]:
methylation_file = "v0-450K-12.meth.txt.gz"
covariates_file = "v0-450K-12.covariates.csv"
covs = pd.read_csv(covariates_file, index_col=0)
Our input data is a methylation matrix (rows of probes and columns of samples). Here, we have encoded the position into the row names, but you can also add these on the fly as you stream over the data.
In [4]:
# a function to restart the cluster generation each time.
def get_cluster_iter(samples, methylation_file, min_len=1, logit=True):
return (c for c in aclust.mclust(feature_gen(methylation_file, samples, logit=logit),
max_dist=300, max_skip=0)
if len(c) >= min_len)
Mclust returns an iterator of clusters given an iterator of features. The clusters are have correlated methylation. The clustering is done without knowing the study design so it is unbiased.
Each cluster is a list of features from your own feature_gen()
In [5]:
oclusters = list(get_cluster_iter(covs.index, methylation_file))
In [12]:
region_fh = open('ex-sim-regions.bed', 'w')
cluster_fh = open('ex-sim-meth.txt', 'w')
cluster_fh.write("\t".join(['probe'] + list(covs['id'])) + "\n")
sizes=crystal.simulate.SIZES.copy()
sizes[1] = 2000
#sizes.pop(1)
for cluster in crystal.simulate.simulate_regions(oclusters, region_fh, class_order=list(covs['gender']), seed=21,
get_reduced_residuals=crystal.simulate.rr_cluster,
get_reduced_residuals_args=(covs, "methylation ~ age"),
sizes=sizes
):
crystal.utils.write_cluster(cluster, cluster_fh)
cluster_fh.flush()
In [13]:
clusters = list(get_cluster_iter(covs.index, "ex-sim-meth.txt", logit=False))
print sum(len(c) for c in clusters)
In [14]:
from crystal import RLM, Beta
formula_base = "methylation ~ gender + age"
formula_cpg = formula_base + " + CpG"
coef = "gender"
modeled = [list(crystal.model_clusters(clusters, covs, formula, coef, fn, pool=pool, **kwargs))
for fn, formula, kwargs in [
(crystal.zscore_cluster, formula_base, {}),
#(crystal.bump_cluster, formula_base, {}),
(crystal.gee_cluster, formula_cpg, {}),
# (crystal.ols_cluster_robust, formula_cpg, {}),
(crystal.mixed_model_cluster, formula_cpg, {})
]]
In [21]:
evaluate = crystal.evaluate.evaluate
import seaborn as sns
sns.set_palette('Set1', 9)
fig, ax = plt.subplots(1, figsize=(12, 12))
labels = ['z-score', 'gee', 'mixed-model']
sizes = range(11)
for i in range(len(modeled)):
bediter = crystal.evaluate.cluster_bediter(modeled[i])
tr, ps = evaluate(bediter, "ex-sim-regions.bed", ax=ax, label=labels[i], sizes=sizes)
#NOTE: re-run these before plotting.
for label, pbed in (('limma', '../work/limma.output.txt'),
# ('bumphunter', '../work/bumphunter.output.txt'),
('DMRcate', '../work/dmrcate.output.txt'),
('comb-p limma', '../work/comb-p/cpv-limma.regions-t.bed')
):
bediter = ((d.get('chrom', d.get('chr')), int(float(d['start'])), int(float(d['end'])),
float(d.get('p', d.get('z_p', d.get('slk_p')))), 0) for d in ts.reader(pbed))
evaluate(bediter, "ex-sim-regions.bed", ax=ax, label=label, sizes=sizes)
print "DONE"
ax.plot([0, 1], [0, 1], 'k--', zorder=-1)
ax.set_xlim(0, 0.2)
ax.set_ylim(0, 0.2)
#ax.set_title('ROC evaluation of simulated regions')
ax.legend(loc="best")
plt.savefig('../figures/figure2.eps')
In [ ]: