In [1]:
%matplotlib inline
import crystal
import toolshed as ts
import aclust
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(style="white", context="talk")
colors = sns.color_palette("Set1", 8)
pool = ts.pool()
methylated_file = "/drive/450k/mthfr/methylated4.txt.gz"
counts_file = "/drive/450k/mthfr/counts4.txt.gz"
covs_file = "/drive/450k/mthfr/covs.Control_Saline-vs-MTHFR-Saline.txt"
In [2]:
from itertools import izip
def feature_gen(methylated, counts):
fhm = ts.reader(methylated, header=False)
fhc = ts.reader(counts, header=False)
next(fhm), next(fhc) # drop headers
for m, c in izip(fhm, fhc):
assert m[0] == c[0]
chrom, pos = m[0].split(":")
#if chrom == "chr10": break
counts = np.array([int(co) for co in c[1:]])
if counts.sum() < 20: continue
yield crystal.CountFeature(chrom, int(pos), np.array([int(me) for me in m[1:]]),
counts, rho_min=0.3)
In [3]:
def get_cluster_iter(min_len=2):
return (c for c in aclust.mclust(feature_gen(methylated_file, counts_file),
max_dist=300, max_skip=0) if len(c) >= min_len)
covs = pd.read_table(covs_file)
covs['trt'] = (covs.Group == "MTHFRKO").astype(int)
covs
Out[3]:
In [4]:
clusters = list(get_cluster_iter(2))
In [5]:
sum(len(c) for c in clusters)
Out[5]:
In [6]:
from crystal import Poisson, Independence, NB
coef = "trt"
formula_base = "methylation ~ trt"
formula_cpg = "methylation ~ trt + CpG"
labels = ["negative binomial", "gee poisson"]
modeled = [list(crystal.model_clusters(clusters, covs, formula, coef, fn, n_cpu=1, **kwargs))
for fn, formula, kwargs in [
(crystal.nb_cluster, formula_base, {}),
(crystal.gee_cluster, formula_cpg, dict(family=Poisson(), cov_struct=Independence())),
]]
In [8]:
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_palette('Set1', 9)
evaluate = crystal.evaluate.evaluate
fig, ax = plt.subplots(1, figsize=(12, 12))
for i in range(len(modeled)):
bediter = crystal.evaluate.cluster_bediter(modeled[i])
tr, ps = evaluate(bediter, "mthfr-region.bed", ax=ax, label=labels[i])
for label, pbed in (('BiSeq', '../biseq.res.txt'),
('RadMeth', '../cpgs.adjust.bed')):
if label == 'RadMeth':
bediter = ((d[0], int(d[1]), int(d[2]), float(d[4]), 1) for d in \
ts.reader(pbed, header=False) if float(d[4]) != -1)
else:
bediter = ((d['seqnames'], int(d['start']), int(d['end']),
float(d['median.p']), 1) for d in ts.reader(pbed))
evaluate(bediter, "mthfr-region.bed", ax=ax, label=label)
ax.plot([0, 1], [0, 1], 'k--', zorder=-1, alpha=0.5)
ax.set_xlim(0, 0.1)
ax.set_ylim(0, 0.2)
ax.legend(loc="best")
plt.savefig('../figures/figure5.eps')
In [7]: