In [1]:
%matplotlib inline
import crystal
import toolshed as ts
import aclust
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns
sns.set(style="white", context="talk")
colors = sns.color_palette("Set1", 8)

pool = ts.pool()

methylated_file = "/drive/450k/mthfr/methylated4.txt.gz"
counts_file = "/drive/450k/mthfr/counts4.txt.gz"
covs_file = "/drive/450k/mthfr/covs.Control_Saline-vs-MTHFR-Saline.txt"


/usr/local/lib/python2.7/dist-packages/pytz/__init__.py:29: UserWarning: Module readline was already imported from /usr/lib/python2.7/lib-dynload/readline.x86_64-linux-gnu.so, but /home/brentp/.local/lib/python2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream

In [2]:
from itertools import izip

def feature_gen(methylated, counts):
    fhm = ts.reader(methylated, header=False)
    fhc = ts.reader(counts, header=False)
    next(fhm), next(fhc) # drop headers
    for m, c in izip(fhm, fhc):
        assert m[0] == c[0]
        chrom, pos = m[0].split(":")
        #if chrom == "chr10": break
        counts = np.array([int(co) for co in c[1:]])
        if counts.sum() < 20: continue
        yield crystal.CountFeature(chrom, int(pos), np.array([int(me) for me in m[1:]]),
                                                    counts, rho_min=0.3)

In [3]:
def get_cluster_iter(min_len=2):
    return (c for c in aclust.mclust(feature_gen(methylated_file, counts_file),
                              max_dist=300, max_skip=0) if len(c) >= min_len)

covs = pd.read_table(covs_file)
covs['trt'] = (covs.Group == "MTHFRKO").astype(int)
covs


Out[3]:
Sample Group Treatment Normalized.cell.count Eos trt
0 38378 Control Saline 36100 4720 0
1 38387 Control Saline 20200 202 0
2 38395 Control Saline 28900 145 0
3 38401 Control Saline 18700 0 0
4 38474 Control Saline 77800 0 0
5 38480 MTHFRKO Saline 74600 746 1
6 38481 MTHFRKO Saline 23800 1070 1
7 38484 Control Saline 69200 0 0
8 38490 MTHFRKO Saline 54000 0 1
9 38492 MTHFRKO Saline 32400 0 1
10 38493 MTHFRKO Saline 5920 0 1

In [4]:
clusters = list(get_cluster_iter(2))

In [5]:
sum(len(c) for c in clusters)


Out[5]:
50818

In [6]:
from crystal import Poisson, Independence, NB
coef = "trt"
formula_base = "methylation ~ trt"
formula_cpg = "methylation ~ trt + CpG"

labels = ["negative binomial", "gee poisson"]

modeled = [list(crystal.model_clusters(clusters, covs, formula, coef, fn, n_cpu=1, **kwargs))
             for fn, formula, kwargs in [

                         (crystal.nb_cluster, formula_base, {}),
                         (crystal.gee_cluster, formula_cpg, dict(family=Poisson(), cov_struct=Independence())),

           ]]


/usr/local/lib/python2.7/dist-packages/statsmodels-0.6.0-py2.7-linux-x86_64.egg/statsmodels/base/model.py:445: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

In [8]:
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_palette('Set1', 9)
evaluate = crystal.evaluate.evaluate

fig, ax = plt.subplots(1, figsize=(12, 12))
for i in range(len(modeled)):
    bediter = crystal.evaluate.cluster_bediter(modeled[i])
    tr, ps = evaluate(bediter, "mthfr-region.bed", ax=ax, label=labels[i])
    
for label, pbed in (('BiSeq', '../biseq.res.txt'),
                    ('RadMeth', '../cpgs.adjust.bed')):
    if label == 'RadMeth':
        bediter = ((d[0], int(d[1]), int(d[2]), float(d[4]), 1) for d in \
            ts.reader(pbed, header=False) if float(d[4]) != -1)
    else:
        bediter = ((d['seqnames'], int(d['start']), int(d['end']),
                float(d['median.p']), 1) for d in ts.reader(pbed))
    evaluate(bediter, "mthfr-region.bed", ax=ax, label=label)
    
    
ax.plot([0, 1], [0, 1], 'k--', zorder=-1, alpha=0.5)
ax.set_xlim(0, 0.1)
ax.set_ylim(0, 0.2)
ax.legend(loc="best")    
plt.savefig('../figures/figure5.eps')


AUC: 0.5533 | negative binomial 50818
AUC: 0.5308 | gee poisson 50818
AUC: 0.6278 | BiSeq 1198
AUC: 0.5765 | RadMeth 66769

In [7]: