notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import crystal
import toolshed as ts
import aclust
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns
sns.set(style="white", context="talk")
colors = sns.color_palette("Set1", 8)

pool = ts.pool()

methylated_file = "/drive/450k/mthfr/methylated4.txt.gz"
counts_file = "/drive/450k/mthfr/counts4.txt.gz"
covs_file = "/drive/450k/mthfr/covs.Control_Saline-vs-MTHFR-Saline.txt"









    



/usr/local/lib/python2.7/dist-packages/pytz/__init__.py:29: UserWarning: Module readline was already imported from /usr/lib/python2.7/lib-dynload/readline.x86_64-linux-gnu.so, but /home/brentp/.local/lib/python2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream



In [2]:

    
from itertools import izip

def feature_gen(methylated, counts):
    fhm = ts.reader(methylated, header=False)
    fhc = ts.reader(counts, header=False)
    next(fhm), next(fhc) # drop headers
    for m, c in izip(fhm, fhc):
        assert m[0] == c[0]
        chrom, pos = m[0].split(":")
        #if chrom == "chr10": break
        counts = np.array([int(co) for co in c[1:]])
        if counts.sum() < 20: continue
        yield crystal.CountFeature(chrom, int(pos), np.array([int(me) for me in m[1:]]),
                                                    counts, rho_min=0.3)



In [3]:

    
def get_cluster_iter(min_len=2):
    return (c for c in aclust.mclust(feature_gen(methylated_file, counts_file),
                              max_dist=300, max_skip=0) if len(c) >= min_len)

covs = pd.read_table(covs_file)
covs['trt'] = (covs.Group == "MTHFRKO").astype(int)
covs









    Out[3]:






  
    
      
      Sample
      Group
      Treatment
      Normalized.cell.count
      Eos
      trt
    
  
  
    
      0 
       38378
       Control
       Saline
       36100
       4720
       0
    
    
      1 
       38387
       Control
       Saline
       20200
        202
       0
    
    
      2 
       38395
       Control
       Saline
       28900
        145
       0
    
    
      3 
       38401
       Control
       Saline
       18700
          0
       0
    
    
      4 
       38474
       Control
       Saline
       77800
          0
       0
    
    
      5 
       38480
       MTHFRKO
       Saline
       74600
        746
       1
    
    
      6 
       38481
       MTHFRKO
       Saline
       23800
       1070
       1
    
    
      7 
       38484
       Control
       Saline
       69200
          0
       0
    
    
      8 
       38490
       MTHFRKO
       Saline
       54000
          0
       1
    
    
      9 
       38492
       MTHFRKO
       Saline
       32400
          0
       1
    
    
      10
       38493
       MTHFRKO
       Saline
        5920
          0
       1



In [4]:

    
clusters = list(get_cluster_iter(2))



In [5]:

    
sum(len(c) for c in clusters)









    Out[5]:





50818



In [6]:

    
from crystal import Poisson, Independence, NB
coef = "trt"
formula_base = "methylation ~ trt"
formula_cpg = "methylation ~ trt + CpG"

labels = ["negative binomial", "gee poisson"]

modeled = [list(crystal.model_clusters(clusters, covs, formula, coef, fn, n_cpu=1, **kwargs))
             for fn, formula, kwargs in [

                         (crystal.nb_cluster, formula_base, {}),
                         (crystal.gee_cluster, formula_cpg, dict(family=Poisson(), cov_struct=Independence())),

           ]]









    



/usr/local/lib/python2.7/dist-packages/statsmodels-0.6.0-py2.7-linux-x86_64.egg/statsmodels/base/model.py:445: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)



In [8]:

    
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_palette('Set1', 9)
evaluate = crystal.evaluate.evaluate

fig, ax = plt.subplots(1, figsize=(12, 12))
for i in range(len(modeled)):
    bediter = crystal.evaluate.cluster_bediter(modeled[i])
    tr, ps = evaluate(bediter, "mthfr-region.bed", ax=ax, label=labels[i])
    
for label, pbed in (('BiSeq', '../biseq.res.txt'),
                    ('RadMeth', '../cpgs.adjust.bed')):
    if label == 'RadMeth':
        bediter = ((d[0], int(d[1]), int(d[2]), float(d[4]), 1) for d in \
            ts.reader(pbed, header=False) if float(d[4]) != -1)
    else:
        bediter = ((d['seqnames'], int(d['start']), int(d['end']),
                float(d['median.p']), 1) for d in ts.reader(pbed))
    evaluate(bediter, "mthfr-region.bed", ax=ax, label=label)
    
    
ax.plot([0, 1], [0, 1], 'k--', zorder=-1, alpha=0.5)
ax.set_xlim(0, 0.1)
ax.set_ylim(0, 0.2)
ax.legend(loc="best")    
plt.savefig('../figures/figure5.eps')









    



AUC: 0.5533 | negative binomial 50818
AUC: 0.5308 | gee poisson 50818
AUC: 0.6278 | BiSeq 1198
AUC: 0.5765 | RadMeth 66769



In [7]:

	Sample	Group	Treatment	Normalized.cell.count	Eos	trt
0	38378	Control	Saline	36100	4720	0
1	38387	Control	Saline	20200	202	0
2	38395	Control	Saline	28900	145	0
3	38401	Control	Saline	18700	0	0
4	38474	Control	Saline	77800	0	0
5	38480	MTHFRKO	Saline	74600	746	1
6	38481	MTHFRKO	Saline	23800	1070	1
7	38484	Control	Saline	69200	0	0
8	38490	MTHFRKO	Saline	54000	0	1
9	38492	MTHFRKO	Saline	32400	0	1
10	38493	MTHFRKO	Saline	5920	0	1