Study diversity in relation to feature types, core promoter position, and other phenomena.
In [1]:
%run ../../shared_setup.ipynb
In [2]:
tbl_regions_1b
Out[2]:
In [3]:
tbl_features
Out[3]:
In [4]:
tbl_features.valuecounts('feature_type').displayall()
In [5]:
tbl_genes
Out[5]:
In [6]:
df = tbl_genes.cut('feature_strand', 'feature_length').todataframe()
df.boxplot('feature_length', 'feature_strand', grid=False)
plt.suptitle('Genes');
In [7]:
tbl_intergenic
Out[7]:
In [8]:
df = tbl_intergenic.cut('feature_length', 'feature_type').todataframe()
df.boxplot('feature_length', 'feature_type', grid=False)
plt.ylim(0, 8000)
plt.suptitle('Intergenic');
In [9]:
tbl_exons
Out[9]:
In [10]:
df = tbl_exons.cut('feature_strand', 'feature_length').todataframe()
df.boxplot('feature_length', 'feature_strand', grid=False)
plt.suptitle('Exons');
In [11]:
tbl_introns
Out[11]:
In [12]:
df = tbl_introns.cut('feature_strand', 'feature_length').todataframe()
df.boxplot('feature_length', 'feature_strand', grid=False)
plt.suptitle('Introns');
In [13]:
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, variant_filter='FILTER_PASS')
callsets_snp = filter_callsets(callsets, variant_filter='is_snp')
callsets_indel = filter_callsets(callsets, variant_filter='~is_snp')
callsets_indel_str = filter_callsets(callsets, variant_filter='~is_snp & STR')
callsets_indel_nostr = filter_callsets(callsets, variant_filter='~is_snp & ~STR')
In [14]:
def tabulate_diversity(tbl, callset, field='diversity'):
V, _, _ = unpack_callset(callset)
CHROM = V.CHROM
POS = V.POS
def add_diversity(row):
start_index, stop_index = locate_variants(CHROM, POS, row.feature_chrom, row.feature_start, row.feature_stop)
n = stop_index - start_index
return n / row.feature_length
return tbl.addfield(field, add_diversity)
etl.Table.tabulate_diversity = tabulate_diversity
def add_diversity_fields(tbl):
return (tbl
.tabulate_diversity(
callsets_snp['3d7_hb3'],
'snp_diversity_3d7_hb3'
)
.tabulate_diversity(
callsets_indel['3d7_hb3'],
'indel_diversity_3d7_hb3'
)
.tabulate_diversity(
callsets_indel_str['3d7_hb3'],
'indel_str_diversity_3d7_hb3'
)
.tabulate_diversity(
callsets_indel_nostr['3d7_hb3'],
'indel_nostr_diversity_3d7_hb3'
)
.tabulate_diversity(
callsets_snp['hb3_dd2'],
'snp_diversity_hb3_dd2'
)
.tabulate_diversity(
callsets_indel['hb3_dd2'],
'indel_diversity_hb3_dd2'
)
.tabulate_diversity(
callsets_indel_str['hb3_dd2'],
'indel_str_diversity_hb3_dd2'
)
.tabulate_diversity(
callsets_indel_nostr['hb3_dd2'],
'indel_nostr_diversity_hb3_dd2'
)
.tabulate_diversity(
callsets_snp['7g8_gb4'],
'snp_diversity_7g8_gb4'
)
.tabulate_diversity(
callsets_indel['7g8_gb4'],
'indel_diversity_7g8_gb4'
)
.tabulate_diversity(
callsets_indel_str['7g8_gb4'],
'indel_str_diversity_7g8_gb4'
)
.tabulate_diversity(
callsets_indel_nostr['7g8_gb4'],
'indel_nostr_diversity_7g8_gb4'
)
.addfield('snp_diversity_all', lambda row: sum(row['snp_diversity_%s' % cross] for cross in CROSSES) / 3)
.addfield('indel_diversity_all', lambda row: sum(row['indel_diversity_%s' % cross] for cross in CROSSES) / 3)
.addfield('indel_str_diversity_all', lambda row: sum(row['indel_str_diversity_%s' % cross] for cross in CROSSES) / 3)
.addfield('indel_nostr_diversity_all', lambda row: sum(row['indel_nostr_diversity_%s' % cross] for cross in CROSSES) / 3)
)
etl.Table.add_diversity_fields = add_diversity_fields
In [15]:
@etlcache
def tbl_feature_diversity():
return (etl
.cat(tbl_exons, tbl_introns, tbl_intergenic)
.eq('feature_region_type', 'Core')
.cutout('feature_region_type')
.sort(key=('feature_chrom', 'feature_start', 'feature_type'))
.add_diversity_fields()
)
tbl_feature_diversity
Out[15]:
In [16]:
df_feature_diversity = tbl_feature_diversity.cutout('feature_id', 'feature_parent_id').todataframe()
feature_types = sorted(df_feature_diversity.feature_type.unique())
feature_types
Out[16]:
In [17]:
def multi_boxplot(df, by, variables, figsize=(10, 2.5), suptitle='', axtitles=None, ylim=None):
# setup figure
fig = plt.figure(figsize=figsize)
fig.suptitle(suptitle, fontsize=11, y=1.05)
n = len(variables)
# make subplots
for i, v in enumerate(variables):
# setup subplot
ax = fig.add_subplot(1, n, i+1)
sns.despine(offset=5, ax=ax)
# obtain subplot data
grouped = df.groupby(by)[v]
xticklabels = [g[0] for g in grouped]
x = [g[1] for g in grouped]
# plotting
ax.boxplot(x, showmeans=True, notch=True);
ax.set_xticklabels(xticklabels, rotation=90)
ax.set_ylabel('')
if ylim:
ax.set_ylim(*ylim)
if axtitles:
ax.set_title(axtitles[i])
else:
ax.set_title(v)
if i == 0:
ax.set_ylabel('diversity (bp$^{-1}$)')
else:
ax.set_yticks([])
ax.spines['left'].set_visible(False)
return fig
In [18]:
by = 'feature_type'
for cross in CROSSES + ('all',):
variables = [v % cross for v in ('snp_diversity_%s', 'indel_diversity_%s', 'indel_str_diversity_%s', 'indel_nostr_diversity_%s')]
suptitle = LABELS.get(cross, 'All crosses')
axtitles = 'SNPs', 'INDELs', 'INDELs (STR)', 'INDELs (non-STR)'
fig = multi_boxplot(df_feature_diversity, by, variables, ylim=(0, 0.005), suptitle=suptitle, axtitles=axtitles)
# fig.savefig('../../artwork/suppfig_feature_diversity_%s.jpeg' % cross, jpeg_quality=100, dpi=200, bbox_inches='tight')
In [19]:
import scikits.bootstrap as bootstrap
In [20]:
variable = 'snp_diversity_all'
condition = (df_feature_diversity.feature_type == 'CDS') & (df_feature_diversity.feature_length > 50)
x = df_feature_diversity[condition][variable] * (10**3)
x.describe(percentiles=[.05, .25, .5, .75, .95]), bootstrap.ci(x)
Out[20]:
In [21]:
variable = 'snp_diversity_all'
condition = (
(df_feature_diversity.feature_length > 50)
& (
(df_feature_diversity.feature_type == 'intergenic_0')
| (df_feature_diversity.feature_type == 'intergenic_1')
| (df_feature_diversity.feature_type == 'intergenic_2')
| (df_feature_diversity.feature_type == 'intron')
)
)
x = df_feature_diversity[condition][variable] * (10**3)
x.describe(percentiles=[.05, .25, .5, .75, .95]), bootstrap.ci(x)
Out[21]:
In [22]:
variable = 'indel_diversity_all'
condition = (df_feature_diversity.feature_type == 'CDS') & (df_feature_diversity.feature_length > 50)
x = df_feature_diversity[condition][variable] * (10**3)
x.describe(percentiles=[.05, .25, .5, .75, .95]), bootstrap.ci(x)
Out[22]:
In [23]:
variable = 'indel_diversity_all'
condition = (
(df_feature_diversity.feature_length > 50)
& (
(df_feature_diversity.feature_type == 'intergenic_0')
| (df_feature_diversity.feature_type == 'intergenic_1')
| (df_feature_diversity.feature_type == 'intergenic_2')
| (df_feature_diversity.feature_type == 'intron')
)
)
x = df_feature_diversity[condition][variable] * (10**3)
x.describe(percentiles=[.05, .25, .5, .75, .95]), bootstrap.ci(x)
Out[23]:
In [24]:
cp_v3_coords_fn = os.path.join(DATA_DIR, 'reference/ponts_2011/cp_v3_coords.bed')
@etlcache
def tbl_core_promoters():
return (etl
.fromtsv(cp_v3_coords_fn)
.cut(0, 1, 3, 4)
.pushheader(['chrom', 'pos', 'strand', 'gene'])
.convert('pos', int)
.intervalleftjoin(tbl_intergenic.prefixheader('parent_'),
lkey='chrom', lstart='pos', lstop='pos',
rkey='parent_feature_chrom', rstart='parent_feature_start', rstop='parent_feature_stop',
include_stop=True)
.eq('parent_feature_region_type', 'Core')
.cutout('parent_feature_chrom', 'parent_feature_region_type')
.sort(key=('chrom', 'pos'))
)
tbl_core_promoters.display(10)
tbl_core_promoters.distinct(key=('parent_feature_id', 'strand')).display(10)
In [25]:
def tabulate_site_proximity_regions(tbl_sites, window_size=100):
tbl_out = [['feature_chrom', 'feature_start', 'feature_stop', 'feature_length', 'site_pos', 'site_strand', 'site_dist', 'parent_feature_type', 'parent_feature_id']]
for rec in tbl_sites.records():
if rec.strand == '+':
# upstream bins
# setup first bin, then work upstream
window_stop = rec.pos
window_start = window_stop - window_size
window_center = (window_start + window_stop) // 2
dist = window_center - rec.pos
while window_start > rec.parent_feature_start:
row = [rec.chrom, window_start, window_stop, window_size, rec.pos, rec.strand, dist, rec.parent_feature_type, rec.parent_feature_id]
tbl_out += [row]
window_stop = window_start
window_start = window_stop - window_size
window_center = (window_start + window_stop) // 2
dist = window_center - rec.pos
# downstream bins
# setup first bin, then work downstream
window_start = rec.pos
window_stop = window_start + window_size
window_center = (window_start + window_stop) // 2
dist = window_center - rec.pos
while window_stop < rec.parent_feature_stop:
row = [rec.chrom, window_start, window_stop, window_size, rec.pos, rec.strand, dist, rec.parent_feature_type, rec.parent_feature_id]
tbl_out += [row]
window_start = window_stop
window_stop = window_start + window_size
window_center = (window_start + window_stop) // 2
dist = window_center - rec.pos
else:
# upstream bins
# setup first bin, then work upstream
window_start = rec.pos
window_stop = window_start + window_size
window_center = (window_start + window_stop) // 2
dist = rec.pos - window_center
while window_stop < rec.parent_feature_stop:
row = [rec.chrom, window_start, window_stop, window_size, rec.pos, rec.strand, dist, rec.parent_feature_type, rec.parent_feature_id]
tbl_out += [row]
window_start = window_stop
window_stop = window_start + window_size
window_center = (window_start + window_stop) // 2
dist = rec.pos - window_center
# downstream bins
# setup first bin, then work downstream
window_stop = rec.pos
window_start = window_stop - window_size
window_center = (window_start + window_stop) // 2
dist = rec.pos - window_center
while window_start > rec.parent_feature_start:
row = [rec.chrom, window_start, window_stop, window_size, rec.pos, rec.strand, dist, rec.parent_feature_type, rec.parent_feature_id]
tbl_out += [row]
window_stop = window_start
window_start = window_stop - window_size
window_center = (window_start + window_stop) // 2
dist = rec.pos - window_center
return etl.wrap(tbl_out).sort(key=('feature_chrom', 'feature_start'))
In [26]:
@etlcache
def tbl_core_promoter_diversity_100():
return (
tabulate_site_proximity_regions(tbl_core_promoters, window_size=100)
.add_diversity_fields()
)
tbl_core_promoter_diversity_100
Out[26]:
In [27]:
@pdcache
def df_core_promoter_diversity_100():
return (
tbl_core_promoter_diversity_100
.cutout('site_pos', 'site_strand', 'parent_feature_type', 'parent_feature_id')
.todataframe()
)
df_core_promoter_diversity_100.head()
Out[27]:
In [28]:
@etlcache
def tbl_core_promoter_diversity_50():
return (
tabulate_site_proximity_regions(tbl_core_promoters, window_size=50)
.add_diversity_fields()
)
tbl_core_promoter_diversity_50
Out[28]:
In [29]:
@pdcache
def df_core_promoter_diversity_50():
return (
tbl_core_promoter_diversity_50
.cutout('site_pos', 'site_strand', 'parent_feature_type', 'parent_feature_id')
.todataframe()
)
df_core_promoter_diversity_50.head()
Out[29]:
In [30]:
def multi_regplot(df, x, variables, figsize=(10, 2.5), suptitle='', axtitles=None, xlim=None, ylim=None, markersize=20, xlabel=None):
# setup figure
fig = plt.figure(figsize=figsize)
fig.suptitle(suptitle, fontsize=12, y=1.05)
n = len(variables)
x = df[x]
# make subplots
for i, v in enumerate(variables):
# setup subplot
ax = fig.add_subplot(1, n, i+1)
sns.despine(offset=5, ax=ax)
# obtain subplot data
y = df[v]
# plotting
ax.axhline(np.mean(y), color='k', linestyle=':')
# ax.axvline(0, color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize))
if xlim:
ax.set_xlim(*xlim)
if ylim:
ax.set_ylim(*ylim)
if axtitles:
ax.set_title(axtitles[i])
else:
ax.set_title(v)
if i == 0:
ax.set_ylabel('diversity (bp$^{-1}$)')
else:
ax.set_yticks([])
ax.spines['left'].set_visible(False)
ax.set_ylabel('')
if i == 1 and xlabel:
ax.set_xlabel(xlabel)
else:
ax.set_xlabel('')
return fig
In [31]:
x = 'site_dist'
for cross in CROSSES + ('all',):
variables = [v % cross for v in ('snp_diversity_%s', 'indel_nostr_diversity_%s', 'indel_str_diversity_%s')]
suptitle = LABELS.get(cross, 'All crosses')
axtitles = 'SNPs', 'Indels (non-STR)', 'Indels (STR)'
xlabel = 'distance to core promoter (bp)'
fig = multi_regplot(df_core_promoter_diversity_100, x, variables, xlim=(-600, 600), ylim=(0, 0.0025), suptitle=suptitle, axtitles=axtitles, xlabel=xlabel)
# fig.savefig('../artwork/suppfig_core_promoter_diversity_%s.jpeg' % cross, jpeg_quality=100, dpi=200, bbox_inches='tight')
plt.show()
In [32]:
x = 'site_dist'
for cross in CROSSES + ('all',):
variables = [v % cross for v in ('snp_diversity_%s', 'indel_nostr_diversity_%s', 'indel_str_diversity_%s')]
suptitle = LABELS.get(cross, 'All crosses')
axtitles = 'SNPs', 'Indels (non-STR)', 'Indels (STR)'
xlabel = 'distance to core promoter (bp)'
fig = multi_regplot(df_core_promoter_diversity_50, x, variables, xlim=(-500, 500), ylim=(0, 0.0025), suptitle=suptitle, axtitles=axtitles, xlabel=xlabel)
# fig.savefig('../../artwork/suppfig_core_promoter_diversity_%s.jpeg' % cross, jpeg_quality=100, dpi=200, bbox_inches='tight')
plt.show()
In [33]:
xlim = -500, 500
ylim = 0, .002
markersize = 20
xlabel = 'distance to core promoter (bp)'
fig = plt.figure(figsize=(4, 7))
fig.suptitle('C', fontsize=12, fontweight='bold', y=.95)
df = df_core_promoter_diversity_50
x = df.site_dist
ax = fig.add_subplot(2, 1, 1)
sns.despine(offset=5, ax=ax)
y = df.indel_str_diversity_all
ax.axhline(np.mean(y), color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize))
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
ax.text(1, .1, 'Indels (STR)', transform=ax.transAxes, ha='right', va='bottom')
ax.set_ylabel('diversity (bp$^{-1}$)')
ax.set_xticklabels([])
ax.set_xlabel('')
ax = fig.add_subplot(2, 1, 2)
sns.despine(offset=5, ax=ax)
y = df.indel_nostr_diversity_all
ax.axhline(np.mean(y), color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize))
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
ax.text(1, .9, 'Indels (non-STR)', transform=ax.transAxes, ha='right', va='top')
ax.set_ylabel('diversity (bp$^{-1}$)')
ax.set_xlabel(xlabel)
#fig.savefig('../artwork/fig_indels_c.jpeg', dpi=1200, jpeg_quality=100, bbox_inches='tight');
Out[33]:
In [44]:
tbl_intergenic.display(20)
In [48]:
tss_v3_coords_fn = os.path.join(DATA_DIR, 'reference/ponts_2011/tss_v3_coords.bed')
tbl_tss_v3 = (etl
.fromtsv(tss_v3_coords_fn)
.cut(0, 1, 3, 4)
.pushheader(['chrom', 'pos', 'strand', 'gene'])
.convert('pos', int)
.intervalleftjoin(tbl_intergenic.prefixheader('parent_'),
lkey='chrom', lstart='pos', lstop='pos',
rkey='parent_feature_chrom', rstart='parent_feature_start', rstop='parent_feature_stop',
include_stop=True)
.eq('parent_feature_region_type', 'Core')
.cutout('parent_feature_chrom', 'parent_feature_region_type')
.sort(key=('chrom', 'pos'))
)
tbl_tss_v3.display(20)
In [52]:
tbl_tss_diversity_50 = (
tabulate_site_proximity_regions(
tbl_tss_v3
# # only use intergenic regions where you expect one core promoter
# .eq('type_intergenic', 'intergenic_1')
# # make sure there is only one
# .distinct(key='label_intergenic')
,
window_size=50,
)
.tabulate_diversity(
callsets_snp['3d7_hb3'],
'snp_diversity_3d7_hb3'
)
.tabulate_diversity(
callsets_indel_str['3d7_hb3'],
'indel_str_diversity_3d7_hb3'
)
.tabulate_diversity(
callsets_indel_nostr['3d7_hb3'],
'indel_nostr_diversity_3d7_hb3'
)
.tabulate_diversity(
callsets_snp['hb3_dd2'],
'snp_diversity_hb3_dd2'
)
.tabulate_diversity(
callsets_indel_str['hb3_dd2'],
'indel_str_diversity_hb3_dd2'
)
.tabulate_diversity(
callsets_indel_nostr['hb3_dd2'],
'indel_nostr_diversity_hb3_dd2'
)
.tabulate_diversity(
callsets_snp['7g8_gb4'],
'snp_diversity_7g8_gb4'
)
.tabulate_diversity(
callsets_indel_str['7g8_gb4'],
'indel_str_diversity_7g8_gb4'
)
.tabulate_diversity(
callsets_indel_nostr['7g8_gb4'],
'indel_nostr_diversity_7g8_gb4'
)
.addfield('snp_diversity_all', lambda row: sum(row['snp_diversity_%s' % cross] for cross in CROSSES) / 3)
.addfield('indel_str_diversity_all', lambda row: sum(row['indel_str_diversity_%s' % cross] for cross in CROSSES) / 3)
.addfield('indel_nostr_diversity_all', lambda row: sum(row['indel_nostr_diversity_%s' % cross] for cross in CROSSES) / 3)
)
df_tss_diversity_50 = tbl_tss_diversity_50.cutout('feature_chrom', 'feature_start', 'feature_stop', 'feature_length', 'site_pos', 'site_strand', 'parent_feature_type').todataframe()
tbl_tss_diversity_50.display(20)
In [53]:
x = 'site_dist'
for cross in CROSSES + ('all',):
variables = [v % cross for v in ('snp_diversity_%s', 'indel_nostr_diversity_%s', 'indel_str_diversity_%s')]
suptitle = LABELS.get(cross, 'All crosses')
axtitles = 'SNPs', 'Indels (non-STR)', 'Indels (STR)'
xlabel = 'distance to transcription start site (bp)'
fig = multi_regplot(df_tss_diversity_50, x, variables, xlim=(-500, 500), ylim=(0, 0.003), suptitle=suptitle, axtitles=axtitles, xlabel=xlabel)
# fig.savefig('../artwork/suppfig_tss_diversity_%s.jpeg' % cross, jpeg_quality=100, dpi=200, bbox_inches='tight')
plt.show()
In [ ]: