In [1]:
from __future__ import print_function
import random
import re
import gzip
from itertools import islice
from operator import itemgetter
import numpy as np
from future.standard_library import install_aliases
install_aliases()
from urllib.request import urlopen, urlcleanup, urlretrieve
As training data, we use some already-called CpG islands. These were called in a prior study that used a kind of Hidden Markov Model. Relevant studies:
In [2]:
islands_url = 'http://www.haowulab.org/software/makeCGI/model-based-cpg-islands-hg19.txt'
# URL for chromosome of the hg19 human genome assembly
def hg19_chr_url(chrom):
return 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/%s.fa.gz' % chrom
In [3]:
def sample(iterable, n):
""" Samples n items from a stream """
samp = []
for t, item in enumerate(iterable):
if t < n:
samp.append(item)
else:
m = random.randint(0, t)
if m < n:
samp[m] = item
return samp
In [4]:
def kmers_from_fasta(fh, k):
""" Yield k-mer, offset pairs from FASTA filehandle.
Ignore k-mers with chars besides A, C, G or T. """
non_acgt = re.compile('[^ACGTacgt]') # regex for detecting non-A/C/G/Ts
kmer, off = [], 0
for ln in fh:
if ln[0] == r'>':
kmer, off = [], 0 # new sequence
continue
for c in filter(lambda x: x.isalpha(), ln.decode()):
if len(kmer) == k:
kmer.pop(0) # k-mer buffer full, so bump one element
kmer.append(c.upper())
off += 1
if len(kmer) == k:
kmerstr = ''.join(kmer)
if not non_acgt.search(kmerstr):
yield kmerstr, off - k
In [5]:
def kmers_islands_from_fasta(fh, k, isles, want_inside):
""" Yield k-mers along with string indicating whether k-mer lies
entirely within an island (True) or not (False) """
cur = 0
for kmer, off in kmers_from_fasta(fh, k):
while cur < len(isles) and off >= isles[cur][1]:
cur += 1
was_inside = False
if cur < len(isles) and off >= isles[cur][0]:
if off + k <= isles[cur][1]:
was_inside = True
if want_inside:
yield kmer
if not was_inside and not want_inside:
yield kmer
In [6]:
def parse_islands(fh, chromosome):
""" Parse a file with island annotations. Only take
records from given chromosome name. """
islands = []
for ln in fh:
ch, st, en, _ = ln.split(b'\t', 3)
if ch == chromosome.encode('utf8'):
# convert 1-based closed interval to 0-based right-open
islands.append((int(st)-1, int(en)))
return islands
In [7]:
def get_islands(chromosome):
with urlopen(islands_url) as fh:
return parse_islands(fh, chromosome) # takes a few seconds
Islands are described simply as a pair of numbers giving the 0-based right open interval for each island.
In [8]:
get_islands('chr22')[1:10]
Out[8]:
In [9]:
def kmers_islands_from_hg19(k, chromosome, islands, inside):
fa_fn, _ = urlretrieve(hg19_chr_url(chromosome))
with gzip.open(fa_fn, 'rb') as fa_fh:
# Yield all the k-mer tuples
for r in kmers_islands_from_fasta(fa_fh, k, islands, inside):
yield r
In [10]:
def samples_from_hg19(k, chromosome, n, upto):
""" Given given k, and n, sample n k-mers from both inside
and outside CpG islands, then return histograms of number
of times each k-mer occurs inside and outside. """
islands = get_islands(chromosome)
ins = sample(islice(kmers_islands_from_hg19(
k, chromosome, islands, True), upto), n)
out = sample(islice(kmers_islands_from_hg19(
k, chromosome, islands, False), upto), n)
return ins, out
Our first idea for a model is to count how many times our $k$-mer of interest occurs inside and outside CpG islands. This get problematic as $k$ grows as it requires exponentially more training data.
In [11]:
from collections import Counter
random.seed(723444)
q = 'CGCGC'
n = 500000
upto = 5000000
ins, out = samples_from_hg19(len(q), 'chr22', n, upto)
assert len(ins) == n, (len(ins), len(out), n)
assert len(out) == n, (len(ins), len(out), n)
hist_in, hist_out = Counter(ins), Counter(out)
# print info about inside/outside counts and probabilities
print("inside: %d out of %d" % (hist_in[q], n))
print("outside: %d out of %d" % (hist_out[q], n))
print("p(inside): %0.5f" % (float(hist_in[q]) / (hist_in[q] + hist_out[q])))
print("p(outside): %0.5f" % (float(hist_out[q]) / (hist_in[q] + hist_out[q])))
Now we adopt the Markov assumption and estimate all the conditional probabilities, e.g. $P(A|C)$.
In [12]:
# Now to build inside and outside Markov chains
# compile dinucleotide tables
samp_in, samp_out = samples_from_hg19(2, 'chr22', n=100000, upto=1000000)
In [13]:
def markov_chain_from_dinucs(dinucs):
''' Given dinucleotide frequencies, make a transition table. '''
conds = np.zeros((4, 4), dtype=np.float64)
margs = np.zeros(4, dtype=np.float64)
for i, ci in enumerate('ACGT'):
tot = 0
for j, cj in enumerate('ACGT'):
count = dinucs.get(ci + cj, 0)
tot += count
margs[i] += count
if tot > 0:
for j, cj in enumerate('ACGT'):
conds[i, j] = dinucs.get(ci + cj, 0) / float(tot)
return conds, margs
In [14]:
ins_conds, ins_margs = markov_chain_from_dinucs(Counter(samp_in))
out_conds, out_margs = markov_chain_from_dinucs(Counter(samp_out))
In [15]:
# transition probabilities inside CpG island
ins_conds
Out[15]:
In [16]:
# confirm that rows add to 1
np.sum(ins_conds, 1), np.sum(out_conds, 1)
Out[16]:
In [17]:
# elementwise log2 of above table
np.log2(ins_conds)
Out[17]:
In [18]:
# log ratio table
np.log2(ins_conds) - np.log2(out_conds)
Out[18]:
In [19]:
def classify(seq, lrTab):
""" Classify seq using given log-ratio table. We're ignoring the
initial probability for simplicity. """
bits = 0
nucmap = { 'A':0, 'C':1, 'G':2, 'T':3 }
for dinuc in [ seq[i:i+2] for i in range(len(seq)-1) ]:
i, j = nucmap[dinuc[0]], nucmap[dinuc[1]]
bits += lrTab[i, j]
return bits
In [20]:
log_ratios = np.log2(ins_conds) - np.log2(out_conds)
In [21]:
classify('CGCGCGCGCGCGCGCGCGCGCGCGCG', log_ratios)
Out[21]:
In [22]:
classify('ATTCTACTATCATCTATCTATCTTCT', log_ratios)
Out[22]:
In [23]:
itest, otest = samples_from_hg19(100, 'chr18', 1000, 100000)
In [24]:
itestClass = [ classify(x, log_ratios) for x in itest ]
otestClass = [ classify(x, log_ratios) for x in otest ]
In [25]:
%pylab inline --no-import-all
from matplotlib import pyplot
bins = numpy.linspace(-60, 60, 100)
pyplot.hist(itestClass, bins, alpha=0.5)
pyplot.hist(otestClass, bins, alpha=0.5)
pyplot.show()