In [1]:
%run ../../shared_setup.ipynb
In [2]:
def tabulate_tr(chrom):
fn = '/data/plasmodium/pfalciparum/pf-crosses/data/genome/sanger/version3/September_2012/%s.tr.pickle' % chrom
tbl_tr = etl.frompickle(fn)
return tbl_tr
tbl_tr = (
etl
.cat(*[tabulate_tr(chrom) for chrom in sorted(fasta.keys())])
.sort(key=('chrom', 'start'))
)
tbl_tr
In [4]:
def select_tr_threshold(row):
unit_length = row.unit_length
tract_length = row.tract_length
return (
((unit_length == 1) and (tract_length >= 6)) or
((unit_length == 2) and (tract_length >= 9)) or
((unit_length == 3) and (tract_length >= 11)) or
((unit_length == 4) and (tract_length >= 13)) or
((unit_length == 5) and (tract_length >= 14)) or
((unit_length == 6) and (tract_length >= 16)) or
((unit_length >= 7) and (tract_length >= 18))
)
tbl_tr_proper = tbl_tr.select(select_tr_threshold).cache()
tbl_tr_proper.display(20)
In [5]:
tbl_tr_proper.valuecounts('chrom').displayall()
In [6]:
is_tr = dict()
for chrom in sorted(fasta.keys()):
log(chrom)
is_tr[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
for rec in tbl_tr_proper.eq('chrom', chrom).records():
start = rec.start - 1
stop = rec.stop
is_tr[chrom][start:stop] = True
is_tr
Out[6]:
In [7]:
tbl_regions_1b
Out[7]:
In [8]:
is_core = dict()
for chrom in sorted(fasta.keys()):
log(chrom)
is_core[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
for rec in tbl_regions_1b.eq('region_chrom', chrom).eq('region_type', 'Core').records():
start = rec.region_start - 1
stop = rec.region_stop
is_core[chrom][start:stop] = True
In [9]:
tbl_exons
Out[9]:
In [10]:
is_exon = dict()
for chrom in sorted(fasta.keys()):
log(chrom)
is_exon[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
for rec in tbl_exons.eq('feature_chrom', chrom).records():
start = rec.feature_start - 1
stop = rec.feature_stop
is_exon[chrom][start:stop] = True
In [11]:
tbl_introns
Out[11]:
In [12]:
is_intron = dict()
for chrom in sorted(fasta.keys()):
log(chrom)
is_intron[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
for rec in tbl_introns.eq('feature_chrom', chrom).records():
start = rec.feature_start - 1
stop = rec.feature_stop
is_intron[chrom][start:stop] = True
In [13]:
tbl_intergenic
Out[13]:
In [14]:
is_intergenic = dict()
for chrom in sorted(fasta.keys()):
log(chrom)
is_intergenic[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
for rec in tbl_intergenic.eq('feature_chrom', chrom).records():
start = rec.feature_start - 1
stop = rec.feature_stop
is_intergenic[chrom][start:stop] = True
In [15]:
def tr_composition():
tbl = [['chrom', 'n_core', 'n_core_tr', 'n_core_coding', 'n_core_coding_tr', 'n_core_noncoding', 'n_core_noncoding_tr']]
all_n_core = all_n_core_tr = all_n_core_coding = all_n_core_coding_tr = all_n_core_noncoding = all_n_core_noncoding_tr = 0
for chrom in sorted(fasta.keys()):
n_core = nnz(is_core[chrom])
all_n_core += n_core
is_core_tr = is_core[chrom] & is_tr[chrom]
n_core_tr = nnz(is_core_tr)
all_n_core_tr += n_core_tr
is_core_coding = is_core[chrom] & is_exon[chrom]
n_core_coding = nnz(is_core_coding)
all_n_core_coding += n_core_coding
is_core_coding_tr = is_core_coding & is_tr[chrom]
n_core_coding_tr = nnz(is_core_coding_tr)
all_n_core_coding_tr += n_core_coding_tr
is_core_noncoding = is_core[chrom] & (is_intron[chrom] | is_intergenic[chrom])
n_core_noncoding = nnz(is_core_noncoding)
all_n_core_noncoding += n_core_noncoding
is_core_noncoding_tr = is_core_noncoding & is_tr[chrom]
n_core_noncoding_tr = nnz(is_core_noncoding_tr)
all_n_core_noncoding_tr += n_core_noncoding_tr
row = [chrom, n_core, n_core_tr, n_core_coding, n_core_coding_tr, n_core_noncoding, n_core_noncoding_tr]
tbl += [row]
# final row
row = ['all', all_n_core, all_n_core_tr, all_n_core_coding, all_n_core_coding_tr, all_n_core_noncoding, all_n_core_noncoding_tr]
tbl += [row]
return (etl
.wrap(tbl)
.addfield('pc_core_tr', lambda row: '%.1f%%' % (row.n_core_tr * 100 / row.n_core))
.addfield('pc_core_coding_tr', lambda row: '%.1f%%' % (row.n_core_coding_tr * 100 / row.n_core_coding))
.addfield('pc_core_noncoding_tr', lambda row: '%.1f%%' % (row.n_core_noncoding_tr * 100 / row.n_core_noncoding))
)
tr_composition().displayall()
In [16]:
tbl_tr_core = tbl_tr.eq('region_type', 'Core').cutout('region_type').cache()
tbl_tr_core
Out[16]:
In [17]:
def tabulate_indels(cross):
# VCF file name
vcf_fn = COMBINED_VCF_FN_TEMPLATE.format(cross=cross)
# build the table
tbl = (etl
.fromvcf(vcf_fn, samples=None)
# only keep PASS variants
.false('FILTER')
# exclude fields not needed in this analysis
.cutout('ID', 'QUAL', 'FILTER')
# unpack INFO
.unpackdict('INFO', ['set', 'STR', 'RU', 'RPA'])
# only keep INDELs
.select(lambda row: len(row.REF) > 1 or any(len(a) > 1 for a in row.ALT))
.addfield('cross', cross)
)
return tbl
In [18]:
tbl_indels = (
etl
.cat(*[tabulate_indels(cross) for cross in CROSSES])
.sort(key=('CHROM', 'POS'))
)
tbl_indels
Out[18]:
In [19]:
tbl_tr_indel = (
tbl_tr_core
.leftjoin(tbl_indels.addfield('pos_indel', lambda row: row.POS+1),
lkey=('chrom', 'start'),
rkey=('CHROM', 'pos_indel'))
# .cutout(13)
)
tbl_tr_indel.display(10)
In [20]:
tbl_tr_indel.gt('tract_length', 10).display(50)
In [21]:
tbl_analysis = (
tbl_tr_indel
.aggregate(('chrom', 'start', 'stop', 'repeat_unit', 'n_units', 'unit_length', 'last_unit_length', 'tract_length'),
set, 'cross', presorted=True)
.addfield('polymorphic', lambda row: row.value != set([None]))
.addfield('polymorphic_3d7_hb3', lambda row: '3d7_hb3' in row.value)
.addfield('polymorphic_hb3_dd2', lambda row: 'hb3_dd2' in row.value)
.addfield('polymorphic_7g8_gb4', lambda row: '7g8_gb4' in row.value)
.cutout('value')
)
tbl_analysis.gt('tract_length', 10).display(20)
In [22]:
df = tbl_analysis.todataframe()
df.head()
Out[22]:
In [23]:
df.tail()
Out[23]:
In [32]:
palette = sns.hls_palette(13)
sns.palplot(palette);
In [44]:
def fig_str_indel():
fig = plt.figure()
ax = fig.add_subplot(111)
sns.despine(ax=ax, offset=5)
unit_lengths = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
palette = sns.hls_palette(len(unit_lengths))
for unit_length, color in zip(unit_lengths, palette):
s1 = df[df.unit_length == unit_length].tract_length
s2 = df[(df.unit_length == unit_length) & df.polymorphic].tract_length
y1 = np.bincount(s1, minlength=70)[:70]
y2 = np.bincount(s2, minlength=70)[:70]
x = np.arange(0, 70, 1)
y3 = y2 / y1
if unit_length == 1:
x = x[:12]
y3 = y3[:12]
ax.plot(x, y3, linestyle='-', marker=' ', lw=2, color=color, label=unit_length)
ax.set_xlim(0, 30)
ax.legend(bbox_to_anchor=(1, 1), loc='upper left', title='Repeat unit length')
ax.set_yscale('log')
ax.set_ylim(0.0005, 1)
ax.set_xlabel('Tract length')
ax.set_ylabel('Fraction polymorphic')
fig.tight_layout()
fn = '../../artwork/supp/indel_str.{dpi}.{fmt}'
for fmt in 'jpeg', 'png':
for dpi in 120, 300:
fig.savefig(fn.format(dpi=dpi, fmt=fmt), dpi=dpi, jpeg_quality=100, bbox_inches='tight')
fig_str_indel()
In [26]:
tbl_tr_indel_2 = (
tbl_tr_core
.intervalleftjoin(tbl_indels.true('STR').addfield('pos_indel_stop', lambda row: row.POS+len(row.REF)),
lkey='chrom', lstart='start', lstop='stop',
rkey='CHROM', rstart='POS', rstop='pos_indel_stop',
include_stop=True)
)
tbl_tr_indel_2.display(10)
In [27]:
tbl_tr_indel_2.gt('tract_length', 10).display(50)
In [28]:
fasta['Pf3D7_01_v3'][94980:94993+30]
Out[28]:
In [29]:
tbl_tr_core.eq('chrom', 'Pf3D7_01_v3').gt('start', 94990)
Out[29]:
In [30]:
tbl_analysis_2 = (
tbl_tr_indel_2
.aggregate(('chrom', 'start', 'stop', 'repeat_unit', 'n_units', 'unit_length', 'last_unit_length', 'tract_length'),
set, 'cross', presorted=True)
.addfield('polymorphic', lambda row: row.value != set([None]))
.addfield('polymorphic_3d7_hb3', lambda row: '3d7_hb3' in row.value)
.addfield('polymorphic_hb3_dd2', lambda row: 'hb3_dd2' in row.value)
.addfield('polymorphic_7g8_gb4', lambda row: '7g8_gb4' in row.value)
.cutout('value')
)
tbl_analysis_2.gt('tract_length', 10).display(20)
In [31]:
df2 = tbl_analysis_2.todataframe()
df2.head()
Out[31]:
In [32]:
fig = plt.figure()
ax = fig.add_subplot(111)
for unit_length in 1, 2, 3, 4, 5, 6:
s1 = df2[df2.unit_length == unit_length].tract_length
s2 = df2[(df2.unit_length == unit_length) & df2.polymorphic].tract_length
y1 = np.bincount(s1, minlength=70)[:70]
y2 = np.bincount(s2, minlength=70)[:70]
x = np.arange(0, 70, 1)
y3 = y2 / y1
ax.plot(x, y3, linestyle='-', marker=' ', lw=2, label=unit_length)
ax.set_xlim(1, 30)
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
ax.set_yscale('log');
In [ ]: