In [1]:
%run ../../shared_setup.ipynb
In [2]:
def tabulate(f):
class Tabulated(etl.Table):
def __init__(self, *args, **kwargs):
self.args = args
self.kwargs = kwargs
def __iter__(self):
return f(*self.args, **self.kwargs)
return Tabulated
In [3]:
@tabulate
def tabulate_core_windows(window_size):
yield 'chrom', 'start', 'stop'
for rec in tbl_regions_1b.eq('region_type', 'Core').records():
for start in range(rec.region_start, rec.region_stop, window_size):
yield rec.region_chrom, start, start + window_size - 1
In [4]:
import re
motif_jiang_a = 'TA[TA]GTTAGT[CG]AAG[TG]TAAGACC'
motif_jiang_b = 'GCA[TC][CA][TG]AG[GT]TGC'
motif_jiang_c = '[TG]GA[TA]GAAG[AG][TG]GA'
motif_jiang_d = '[TA]GT[TA]AGTAGT[CA]A'
motif_jiang_e = '[TG]G[AG]TGA[TA]GA[AT][AG][AC]'
@tabulate
def tabulate_motifs(motif):
motif = motif.lower()
yield 'chrom', 'start', 'stop', 'sequence'
for chrom in sorted(fasta.keys()):
seq = fasta[chrom][:].lower()
matches = re.finditer(motif, seq)
for match in matches:
start, stop = match.span()
match_seq = seq[start:stop]
yield chrom, start, stop, match_seq
In [5]:
tabulate_motifs(motif_jiang_a)
Out[5]:
In [6]:
tabulate_motifs(motif_jiang_b)
Out[6]:
In [7]:
tabulate_motifs(motif_jiang_c)
Out[7]:
In [8]:
tabulate_motifs(motif_jiang_d)
Out[8]:
In [9]:
tabulate_motifs(motif_jiang_e)
Out[9]:
In [10]:
tbl_co = (
etl
.frompickle(os.path.join(PUBLIC_DIR, 'tbl_co.pickle'))
.convert('chrom', lambda v: str(v, 'ascii'))
)
display_with_nrows(tbl_co, caption='CO events')
In [11]:
def tabulate_tr(chrom):
fn = '/data/plasmodium/pfalciparum/pf-crosses/data/genome/sanger/version3/September_2012/%s.tr.pickle' % chrom
tbl_tr = etl.frompickle(fn)
return tbl_tr
tbl_tr = (
etl
.cat(*[tabulate_tr(chrom) for chrom in sorted(fasta.keys())])
.sort(key=('chrom', 'start'))
)
def select_tr_threshold(row):
unit_length = row.unit_length
tract_length = row.tract_length
return (
((unit_length == 1) and (tract_length >= 6)) or
((unit_length == 2) and (tract_length >= 9)) or
((unit_length == 3) and (tract_length >= 11)) or
((unit_length == 4) and (tract_length >= 13)) or
((unit_length == 5) and (tract_length >= 14)) or
((unit_length == 6) and (tract_length >= 16)) or
((unit_length >= 7) and (tract_length >= 18))
)
tbl_tr_proper = tbl_tr.select(select_tr_threshold).cache()
tbl_tr_proper.display()
is_tr = dict()
for chrom in sorted(fasta.keys()):
log(chrom)
is_tr[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
for rec in tbl_tr_proper.eq('chrom', chrom).records():
start = rec.start - 1
stop = rec.stop
is_tr[chrom][start:stop] = True
is_tr
Out[11]:
In [12]:
is_exon = dict()
for chrom in sorted(fasta.keys()):
log(chrom)
is_exon[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
for rec in tbl_exons.eq('feature_chrom', chrom).records():
start = rec.feature_start - 1
stop = rec.feature_stop
is_exon[chrom][start:stop] = True
In [13]:
%%R
library(AER)
In [14]:
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE,
variant_filter='FILTER_PASS',
call_filter=combined_conf_calls,
sample_exclusions=excessive_recomb_samples)
In [15]:
pos = dict()
for cross in CROSSES:
variants = callsets[cross]['variants']
pos[cross] = allel.SortedMultiIndex(variants['CHROM'], variants['POS'])
In [16]:
is_snp = dict()
for cross in CROSSES:
variants = callsets[cross]['variants']
is_snp[cross] = variants['is_snp']
In [27]:
def distance_to_centromere(row):
cen_id = 'PF3D7_CEN' + row.chrom[6:8]
try:
cen = lkp_feature[cen_id]
cen_pos = (cen['feature_start'] + cen['feature_stop'])/2
window_pos = (row.start + row.stop) / 2
return abs(window_pos - cen_pos)
except KeyError:
return None
def distance_to_telomere(row):
chrom = row.chrom
chrlen = len(fasta[chrom])
l = row.start
r = chrlen - row.stop
return min(l, r)
def distance_to_core_edge(row):
chrom = row.chrom
core_start = tbl_regions_1b.eq('region_chrom', chrom).eq('region_type', 'Core').values('region_start').min()
core_stop = tbl_regions_1b.eq('region_chrom', chrom).eq('region_type', 'Core').values('region_stop').max()
l = row.start - core_start
r = core_stop - row.stop
return min(l, r)
def n_coding(row):
return np.count_nonzero(is_exon[row.chrom][row.start-1:row.stop])
def n_tr(row):
return np.count_nonzero(is_tr[row.chrom][row.start-1:row.stop])
def gc(row):
seq = fasta[row.chrom][row.start-1:row.stop].lower()
com = collections.Counter(seq)
return (com['g'] + com['c']) / len(seq)
def n_variants(row):
nv = ni = ns = 0
start = row.start
stop = row.stop
chrom = row.chrom.encode('ascii')
for cross in CROSSES:
try:
loc = pos[cross].locate_range(chrom, start, stop)
except KeyError:
pass
else:
x = is_snp[cross][loc]
ns += nnz(x)
ni += nnz(~x)
return ns, ni
@functools.lru_cache(maxsize=None)
def distance_to_motif(motif, vmax=None):
tbl_motif = tabulate_motifs(motif)
lkp_motif = tbl_motif.recordlookup('chrom')
def _addfield(row):
chrom = row.chrom
window_mid = (row.start + row.stop) / 2
try:
motifs = lkp_motif[chrom]
except KeyError:
if vmax is None:
return 4000000
else:
return vmax
else:
motif_mids = [(motif.start + motif.stop) / 2 for motif in motifs]
nearest = min([abs(window_mid - motif_mid) for motif_mid in motif_mids])
if vmax is None:
return nearest
else:
return min(nearest, vmax)
return _addfield
In [28]:
@functools.lru_cache(maxsize=None)
def tabulate_co_predictors(window_size, vmax_motif=None):
fn = os.path.join(CACHE_DIR, 'tbl_co_predictors.%s.%s.pickle' % (window_size, vmax_motif))
if not os.path.exists(fn):
log('building')
# tabulate windows
tbl_windows = tabulate_core_windows(window_size)
# count COs in windows
tbl_windows_co = (
tbl_windows
.intervalleftjoin(tbl_co, lkey='chrom', lstart='start', lstop='stop',
rkey='chrom', rstart='co_pos_min', rstop='co_pos_max',
include_stop=True)
.cutout(4)
.aggregate(key=('chrom', 'start', 'stop'),
aggregation=lambda vals: sum(1 for v in vals if v is not None),
value='cross')
.rename('value', 'co_count')
)
# augment
tbl_aug = (
tbl_windows_co
.addfield('cen_dist', distance_to_centromere)
.addfield('tel_dist', distance_to_telomere)
.notnone('cen_dist')
.addfield('n_coding', n_coding)
.addfield('gc', gc)
.addfield('n_variants', n_variants)
.unpack('n_variants', ('n_snps', 'n_indels'))
.addfield('motif_jiang_a_dist', distance_to_motif(motif_jiang_a, vmax=vmax_motif))
.addfield('motif_jiang_b_dist', distance_to_motif(motif_jiang_b, vmax=vmax_motif))
.addfield('motif_jiang_c_dist', distance_to_motif(motif_jiang_c, vmax=vmax_motif))
.addfield('motif_jiang_d_dist', distance_to_motif(motif_jiang_d, vmax=vmax_motif))
.addfield('motif_jiang_e_dist', distance_to_motif(motif_jiang_e, vmax=vmax_motif))
.addfield('motif_aat_dist', distance_to_motif('AA[TC]AA[TC]AA[TC]AA[TC]', vmax=vmax_motif))
.addfield('motif_tta_dist', distance_to_motif('TT[AG]TT[AG]TT[AG]TT[AG]', vmax=vmax_motif))
.addfield('motif_at_dist', distance_to_motif('ATATATATATAT', vmax=vmax_motif))
.addfield('motif_a_dist', distance_to_motif('AAAAAAAAAAAA', vmax=vmax_motif))
.addfield('motif_t_dist', distance_to_motif('TTTTTTTTTTTT', vmax=vmax_motif))
)
tbl_aug.progress(1000).topickle(fn)
tbl_aug = (etl
.frompickle(fn)
# hack in extra field to avoid completely regenerating
.addfield('n_tr', n_tr)
.addfield('core_edge_dist', distance_to_core_edge)
)
# convert to dataframe
df = tbl_aug.todataframe()
return df
In [31]:
df = tabulate_co_predictors(5000, vmax_motif=None)
df.head()
Out[31]:
In [45]:
df.columns
Out[45]:
In [32]:
for v in ('cen_dist',
'tel_dist',
'core_edge_dist'):
sns.regplot(v, 'co_count', data=df, x_bins=20)
plt.title(v)
plt.xlim(left=0)
plt.ylim(0, 0.8)
plt.show();
In [33]:
for v in ('motif_jiang_a_dist',
'motif_jiang_b_dist',
'motif_jiang_c_dist',
'motif_jiang_d_dist',
'motif_jiang_e_dist'):
sns.regplot(v, 'co_count', data=df, x_bins=np.arange(0, 200000, 10000))
plt.xlim(0, 200000)
plt.ylim(0, 0.8)
plt.title(v)
plt.show();
In [36]:
for v in ('motif_aat_dist',
'motif_tta_dist',
'motif_at_dist',
'motif_a_dist',
'motif_t_dist',
):
sns.regplot(v, 'co_count', data=df, x_bins=20)
plt.title(v)
plt.xlim(left=0)
plt.ylim(0, .8)
plt.show();
In [37]:
sns.regplot('n_coding', 'co_count', data=df, x_bins=20)
plt.xlim(left=0)
plt.ylim(0, .8);
In [38]:
sns.regplot('n_tr', 'co_count', data=df, x_bins=20)
plt.xlim(left=0)
plt.ylim(0, .8);
In [39]:
sns.regplot('gc', 'co_count', data=df, x_bins=10)
plt.ylim(0, .8);
In [40]:
sns.regplot('n_snps', 'co_count', data=df, x_bins=20)
plt.xlim(left=0)
plt.ylim(0, .8);
In [41]:
sns.regplot('n_indels', 'co_count', data=df, x_bins=20)
plt.ylim(0, .8);
In [81]:
@functools.lru_cache(maxsize=None)
def glm_poisson(window_size, vmax_motif=None, cen_threshold=100000):
df = tabulate_co_predictors(window_size, vmax_motif=vmax_motif)
df_cen = df[df.cen_dist <= cen_threshold].copy()
df_tel = df[df.cen_dist > cen_threshold].copy()
# # standardise predictors
# for v in df.columns:
# if v not in ('chrom', 'start', 'stop', 'co_count'):
# a = df_cen[v].values
# df_cen[v] = (a - a.mean()) / a.std(ddof=0)
# a = df_tel[v].values
# df_tel[v] = (a - a.mean()) / a.std(ddof=0)
rd_cen = %R -i df_cen -o rd rd <- glm(co_count ~ cen_dist + tel_dist + n_coding + n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + motif_t_dist, data = df_cen, family = poisson)
summ_cen = %R summary(rd)
t_cen = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")
rd_tel = %R -i df_tel -o rd rd <- glm(co_count ~ cen_dist + tel_dist + n_coding + n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + motif_t_dist, data = df_tel, family = poisson)
summ_tel = %R summary(rd)
t_tel = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")
return summ_cen, t_cen, summ_tel, t_tel
In [94]:
@functools.lru_cache(maxsize=None)
def glm_poisson_v2(window_size, vmax_motif=None, cen_threshold=100000):
df = tabulate_co_predictors(window_size, vmax_motif=vmax_motif).copy()
# clip distances
df['cen_dist'] = df.cen_dist.clip(upper=cen_threshold)
df['core_edge_dist'] = df.core_edge_dist.clip(upper=cen_threshold)
# standardise predictors
for v in df.columns:
if v not in ('chrom', 'start', 'stop', 'co_count'):
a = df[v].values
df[v] = (a - a.mean()) / a.std(ddof=0)
rd = %R -i df -o rd rd <- glm(co_count ~ cen_dist + core_edge_dist + n_coding + n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + motif_t_dist, data = df, family = poisson)
summ = %R summary(rd)
t = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")
return summ, t
In [95]:
@functools.lru_cache(maxsize=None)
def glm_poisson_v3(window_size, vmax_motif=None, cen_threshold=100000):
df = tabulate_co_predictors(window_size, vmax_motif=vmax_motif)
df_edge = df[(df.cen_dist <= cen_threshold) | (df.core_edge_dist <= cen_threshold)].copy()
df_main = df[(df.cen_dist > cen_threshold) & (df.core_edge_dist > cen_threshold)].copy()
# standardise predictors
for v in df.columns:
if v not in ('chrom', 'start', 'stop', 'co_count'):
a = df_edge[v].values
df_edge[v] = (a - a.mean()) / a.std(ddof=0)
a = df_main[v].values
df_main[v] = (a - a.mean()) / a.std(ddof=0)
rd_edge = %R -i df_edge -o rd rd <- glm(co_count ~ cen_dist + core_edge_dist + n_coding + n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + motif_t_dist, data = df_edge, family = poisson)
summ_edge = %R summary(rd)
t_edge = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")
rd_main = %R -i df_main -o rd rd <- glm(co_count ~ cen_dist + core_edge_dist + n_coding + n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + motif_t_dist, data = df_main, family = poisson)
summ_main = %R summary(rd)
t_main = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")
return summ_edge, t_edge, summ_main, t_main
In [96]:
glm_results = dict()
vmax_motif = 100000
for window_size in 50000, 20000, 10000, 5000, 2000, 1000:
summ_cen, t_cen, summ_tel, t_tel = glm_poisson(window_size, vmax_motif=vmax_motif)
glm_results[window_size] = summ_cen, t_cen, summ_tel, t_tel
print('========================================')
print(window_size)
print(summ_cen)
print(t_cen)
print(summ_tel)
print(t_tel)
In [97]:
glm_results_v2 = dict()
vmax_motif = 100000
for window_size in 50000, 20000, 10000, 5000, 2000, 1000:
summ, t = glm_poisson_v2(window_size, vmax_motif=vmax_motif)
glm_results_v2[window_size] = summ, t
print('========================================')
print(window_size)
print(summ)
print(t)
In [86]:
glm_results_v3 = dict()
vmax_motif = 100000
for window_size in 50000, 20000, 10000, 5000, 2000, 1000:
summ_edge, t_edge, summ_main, t_main = glm_poisson_v3(window_size, vmax_motif=vmax_motif)
glm_results_v3[window_size] = summ_edge, t_edge, summ_main, t_main
print('========================================')
print(window_size)
print('==================== edge ====================')
print(summ_edge)
print(t_edge)
print('==================== main ====================')
print(summ_main)
print(t_main)
In [98]:
tbl_co
Out[98]:
In [100]:
n_co = dict()
for chrom in sorted(fasta.keys()):
log(chrom)
n_co[chrom] = np.zeros(len(fasta[chrom]), dtype='i4')
for rec in tbl_co.eq('chrom', chrom).records():
start = rec.co_pos_min - 1
stop = rec.co_pos_max
n_co[chrom][start:stop] = n_co[chrom][start:stop] + 1
n_co
Out[100]:
In [101]:
is_core = dict()
for chrom in sorted(fasta.keys()):
log(chrom)
is_core[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
for rec in tbl_regions_1b.eq('region_chrom', chrom).eq('region_type', 'Core').records():
start = rec.region_start - 1
stop = rec.region_stop
is_core[chrom][start:stop] = True
In [102]:
is_exon = dict()
for chrom in sorted(fasta.keys()):
log(chrom)
is_exon[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
for rec in tbl_exons.eq('feature_chrom', chrom).records():
start = rec.feature_start - 1
stop = rec.feature_stop
is_exon[chrom][start:stop] = True
In [104]:
coding_tr = list()
coding_nontr = list()
noncoding_tr = list()
noncoding_nontr = list()
for chrom in sorted(fasta.keys()):
# coding tr
c = is_core[chrom] & is_exon[chrom] & is_tr[chrom]
x = n_co[chrom][c]
coding_tr.append(x)
# coding non-tr
c = is_core[chrom] & is_exon[chrom] & ~is_tr[chrom]
x = n_co[chrom][c]
coding_nontr.append(x)
# non-coding tr
c = is_core[chrom] & ~is_exon[chrom] & is_tr[chrom]
x = n_co[chrom][c]
noncoding_tr.append(x)
# non-coding non-tr
c = is_core[chrom] & ~is_exon[chrom] & ~is_tr[chrom]
x = n_co[chrom][c]
noncoding_nontr.append(x)
coding_tr = np.concatenate(coding_tr)
coding_nontr = np.concatenate(coding_nontr)
noncoding_tr = np.concatenate(noncoding_tr)
noncoding_nontr = np.concatenate(noncoding_nontr)
In [107]:
for x in coding_tr, coding_nontr, noncoding_tr, noncoding_nontr:
print(x.mean(), x.std())
In [106]:
fig, ax = plt.subplots()
ax.boxplot([coding_tr, coding_nontr, noncoding_tr, noncoding_nontr]);
In [49]:
analyse_dispersion(1000)
In [50]:
analyse_dispersion(2000)
In [51]:
analyse_dispersion(5000)
In [52]:
analyse_dispersion(10000)
In [53]:
analyse_dispersion(20000)
In [54]:
analyse_dispersion(50000)
In [55]:
analyse_dispersion(100000)
In [17]:
df = tabulate_predictors(10000)
df.head()
Out[17]:
In [18]:
rd = %R -i df -o rd rd <- glm(co_count ~ cen_dist + n_coding + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + motif_tta_dist, data = df, family = poisson)
rd
Out[18]:
In [19]:
print(rd)
In [20]:
summ = %R summary(rd)
summ
Out[20]:
In [21]:
print(summ)
In [39]:
'%.2e' % .2
Out[39]:
In [24]:
print(summ.names)
In [32]:
summ[11]
Out[32]:
In [33]:
coefficients = np.array(summ[11])
coefficients
Out[33]:
In [54]:
from prettypandas import PrettyPandas
In [58]:
col_index = pandas.MultiIndex.from_product([[10000], ['Estimate', 'Std. Error', 'z value', 'Pr(>|z|)']],
names=['window size', 'coefficient'])
row_index = ['intercept', 'cen_dist', 'n_coding', 'gc', 'n_snps', 'n_indels',
'motif_jiang_a_dist', 'motif_jiang_b_dist', 'motif_jiang_c_dist', 'motif_jiang_d_dist', 'motif_jiang_e_dist',
'motif_aat_dist', 'motif_tta_dist']
df = pandas.DataFrame(coefficients, columns=col_index, index=row_index)
df
Out[58]:
In [60]:
pandas.__version__
Out[60]:
In [71]:
dfs = df.style
dfs.apply(lambda v: '%.1f' % v, axis=0)
Out[71]:
In [56]:
PrettyPandas(df, precision=2)
Out[56]:
In [40]:
t = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")
t
Out[40]:
In [48]:
t[1][0]
Out[48]:
In [41]:
print(t)
In [ ]:
summ = %R summary(rd)
log(summ)
t = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")
In [ ]:
In [36]:
x = scipy.stats.poisson.rvs(np.mean(co_count), size=co_count.size)
x
Out[36]:
In [37]:
plt.hist(x, histtype='step', bins=np.arange(10), lw=2)
plt.hist(co_count, histtype='step', bins=np.arange(10), lw=2);
In [33]:
np.mean(co_count), np.std(co_count)
Out[33]:
In [35]:
np.mean(x), np.std(x)
Out[35]:
In [14]:
%%R
library(AER)
data(RecreationDemand)
head(RecreationDemand)
In [15]:
%%R
rd <- glm(RecreationDemand$trips ~ 1, family = poisson)
summary(rd)
In [17]:
%%R
dispersiontest(rd, trafo=1, alternative="greater")
In [ ]:
combined = load_callsets(COMBINED_CALLSET_FN_TEMPLATE,
variant_filter='FILTER_PASS',
call_filter=combined_conf_calls,
sample_exclusions=excessive_recomb_samples)
In [ ]:
variants = combined['3d7_hb3']['variants']
In [ ]:
pos = variants['POS'][variants['CHROM'] == b'Pf3D7_01_v3']
pos.shape
In [ ]:
tbl_intervals = (
etl
.fromcolumns((pos[:-1], pos[1:], np.diff(pos)), header=('start', 'stop', 'length'))
.addfield('chrom', b'Pf3D7_01_v3', index=0)
.intervalleftjoin(tbl_recom.prefixheader('recom_'),
lkey='chrom', lstart='start', lstop='stop',
rkey='recom_chrom', rstart='recom_start', rstop='recom_stop',
include_stop=True)
.aggregate(key=('chrom', 'start', 'stop', 'length'),
aggregation=list, value='recom_type')
.rename('value', 'event_types')
.convert('event_types', lambda v: [] if v == [None] else v)
.addfield('n_events', lambda row: len(row.event_types))
)
tbl_intervals.gt('n_events', 0).display(50)
In [ ]: