In [1]:
from __future__ import division, print_function
%run _shared_setup.ipynb
%run _plotting_setup.ipynb
In [2]:
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, variant_filter='FILTER_PASS & ~is_snp')
In [3]:
def plot_indel_size_stacked_hist(ax, STR, svlen, major_step, minor_step, minsize=-30, maxsize=30):
x = [svlen[~STR], svlen[STR]]
ax.hist(x,
bins=np.arange(minsize, maxsize, 1),
histtype='bar',
stacked=1,
color=['w', 'k'],
edgecolor='k',
linewidth=.5,
alpha=1,
align='left'
)
# ax.spines['top'].set_visible(False)
# ax.spines['left'].set_visible(False)
# ax.spines['right'].set_visible(False)
sns.despine(ax=ax, offset=5, left=True)
ax.xaxis.tick_bottom()
ax.set_xlim(minsize-1, maxsize+1)
major_locator = mpl.ticker.MultipleLocator(major_step)
minor_locator = mpl.ticker.MultipleLocator(minor_step)
ax.xaxis.set_major_locator(major_locator)
ax.xaxis.set_minor_locator(minor_locator)
ax.set_yticks([])
In [4]:
fig, ax = plt.subplots()
variants, _, _ = unpack_callset(callsets['7g8_gb4'])
variants = variants[(variants.num_alleles == 2) & (variants.CDSAnnotationID != '.')]
plot_indel_size_stacked_hist(ax, variants.STR, variants.svlen[:, 0], major_step=6, minor_step=3)
In [5]:
fig, ax = plt.subplots()
variants, _, _ = unpack_callset(callsets['7g8_gb4'])
variants = variants[(variants.CDSAnnotationID != '.')]
plot_indel_size_stacked_hist(ax, variants.STR, variants.svlen[:, 0] - variants.svlen[:, 1], major_step=6, minor_step=3)
In [6]:
variants_all = np.concatenate([callsets[cross]['variants'] for cross in CROSSES]).view(np.recarray)
variants_all.size
Out[6]:
In [7]:
fig = plt.figure(figsize=(4, 7))
# coding
ax = fig.add_subplot(2, 1, 1)
variants = variants_all[variants_all.CDSAnnotationID != '.']
STR = variants.STR
svlen = variants.svlen[:, 0] - variants.svlen[:, 1]
plot_indel_size_stacked_hist(ax, STR, svlen, major_step=6, minor_step=3, minsize=-33, maxsize=33)
# non-coding
ax = fig.add_subplot(2, 1, 2)
variants = variants_all[variants_all.CDSAnnotationID == '.']
STR = variants.STR
svlen = variants.svlen[:, 0] - variants.svlen[:, 1]
plot_indel_size_stacked_hist(ax, STR, svlen, major_step=8, minor_step=2, minsize=-33, maxsize=33)
In [8]:
# following Lesk, Introduction to Bioinformatics, see http://www.bioinformatics.nl/~berndb/aacolour.html
AA_COLOURS = dict()
# small nonpolar
for aa in 'G', 'A', 'S', 'T':
AA_COLOURS[aa] = 'orange'
# hydrophobic
for aa in 'C', 'V', 'I', 'L', 'P', 'F', 'Y', 'M', 'W':
AA_COLOURS[aa] = 'green'
# polar
for aa in 'N', 'Q', 'H':
AA_COLOURS[aa] = 'magenta'
# negatively charged
for aa in 'D', 'E':
AA_COLOURS[aa] = 'red'
# positively charged
for aa in 'K', 'R':
AA_COLOURS[aa] = 'blue'
# other
AA_COLOURS['*'] = 'black'
def plot_insertion_aa(ax, Amino_Acid_Change, n_labels=15):
prog = re.compile(r'\d+')
aa_changes = np.array([prog.split(s) if prog.search(s) is not None else ('.', '.') for s in Amino_Acid_Change])
aa_ins = np.array([after if before != '.' else '' for (before, after) in aa_changes])
c = collections.Counter()
for x in aa_ins:
c.update(x.replace('?', '').replace('-', ''))
aas, values = zip(*c.most_common())
labels = list(aas[:n_labels]) + ([''] * (len(aas)-n_labels))
ax.pie(values, labels=labels, colors=[AA_COLOURS[a] for a in aas], shadow=0, startangle=90)
def plot_deletion_aa(ax, Amino_Acid_Change, n_labels=15):
prog = re.compile(r'\d+')
aa_changes = np.array([prog.split(s) if prog.search(s) is not None else ('.', '.') for s in Amino_Acid_Change])
aa_ins = np.array([before if after != '.' else '' for (before, after) in aa_changes])
c = collections.Counter()
for x in aa_ins:
c.update(x.replace('?', '').replace('-', ''))
aas, values = zip(*c.most_common())
labels = list(aas[:n_labels]) + ([''] * (len(aas)-n_labels))
ax.pie(values, labels=labels, colors=[AA_COLOURS[a] for a in aas], shadow=0, startangle=90)
In [9]:
fig = plt.figure(figsize=(6, 3))
ax = fig.add_subplot(121)
variants = variants_all[(variants_all.num_alleles == 2) & (variants_all.CDSAnnotationID != '.') & (variants_all.svlen[:, 0] > 0)]
plot_insertion_aa(ax, variants.EFF.Amino_Acid_Change)
ax = fig.add_subplot(122)
variants = variants_all[(variants_all.num_alleles == 2) & (variants_all.CDSAnnotationID != '.') & (variants_all.svlen[:, 0] < 0)]
plot_deletion_aa(ax, variants.EFF.Amino_Acid_Change)
fig.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0)
In [10]:
df_core_promoter_diversity_50 = pandas.read_pickle(os.path.join(CACHE_DIR, 'df_core_promoter_diversity_50'))
df_core_promoter_diversity_50.head()
Out[10]:
In [11]:
xlim = -500, 500
ylim = 0, .002
markersize = 20
xlabel = 'distance to core promoter (bp)'
fig = plt.figure(figsize=(4, 7))
fig.suptitle('C', fontsize=12, fontweight='bold', y=.95)
df = df_core_promoter_diversity_50
x = df.site_dist
ax = fig.add_subplot(2, 1, 1)
sns.despine(offset=5, ax=ax)
y = df.indel_str_diversity_all
ax.axhline(np.mean(y), color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize))
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
ax.text(1, .1, 'Indels (STR)', transform=ax.transAxes, ha='right', va='bottom')
ax.set_ylabel('diversity (bp$^{-1}$)')
ax.set_xticklabels([])
ax.set_xlabel('')
ax = fig.add_subplot(2, 1, 2)
sns.despine(offset=5, ax=ax)
y = df.indel_nostr_diversity_all
ax.axhline(np.mean(y), color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize))
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
ax.text(1, .9, 'Indels (non-STR)', transform=ax.transAxes, ha='right', va='top')
ax.set_ylabel('diversity (bp$^{-1}$)')
ax.set_xlabel(xlabel)
#fig.savefig('../artwork/fig_indels_c.jpeg', dpi=1200, jpeg_quality=100, bbox_inches='tight');
In [32]:
fig = plt.figure(figsize=(8, 5))
# plot coding indel size distribution
ax = fig.add_subplot(2, 3, 1)
variants = variants_all[variants_all.CDSAnnotationID != '.']
STR = variants.STR
svlen = variants.svlen[:, 0] - variants.svlen[:, 1]
plot_indel_size_stacked_hist(ax, STR, svlen, major_step=10, minor_step=1, minsize=-33, maxsize=33)
ax.set_ylabel('coding indels')
p = plt.Rectangle((0, 0), 1, 1, fc="k")
q = plt.Rectangle((0, 0), 1, 1, fc="w")
ax.legend([p, q], ['STR', 'non-STR'],
ncol=2, loc='upper left', bbox_to_anchor=(0, -.2))
# plot non-coding indel size distribution
ax = fig.add_subplot(2, 3, 4)
variants = variants_all[variants_all.CDSAnnotationID == '.']
STR = variants.STR
svlen = variants.svlen[:, 0] - variants.svlen[:, 1]
plot_indel_size_stacked_hist(ax, STR, svlen, major_step=10, minor_step=1, minsize=-33, maxsize=33)
ax.set_xlabel('indel size')
ax.set_ylabel('non-coding indels')
# plot coding insertion mutations
ax = fig.add_subplot(2, 3, 2)
variants = variants_all[(variants_all.num_alleles == 2) & (variants_all.CDSAnnotationID != '.') & (variants_all.svlen[:, 0] > 0)]
plot_insertion_aa(ax, variants.EFF.Amino_Acid_Change)
ax.set_title('amino acids inserted')
# plot coding deletion mutations
ax = fig.add_subplot(2, 3, 3)
variants = variants_all[(variants_all.num_alleles == 2) & (variants_all.CDSAnnotationID != '.') & (variants_all.svlen[:, 0] < 0)]
plot_deletion_aa(ax, variants.EFF.Amino_Acid_Change)
ax.set_title('amino acids deleted')
# plot STR indel diversity relative to core promoters
ax = fig.add_subplot(2, 3, 5)
sns.despine(offset=5, ax=ax)
y = df.indel_str_diversity_all
ax.axhline(np.mean(y), color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize), n_boot=1000)
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
ax.set_title('intergenic indels (STR)')
ax.set_ylabel('diversity (bp$^{-1}$)')
ax.set_xlabel('distance to core promoter')
# plot non-STR indel diversity relative to core promoters
ax = fig.add_subplot(2, 3, 6)
sns.despine(offset=5, ax=ax, left=True)
y = df.indel_nostr_diversity_all
ax.axhline(np.mean(y), color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize), n_boot=1000)
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
ax.set_yticks([])
ax.set_ylabel('')
ax.set_title('intergenic indels (non-STR)')
ax.set_xlabel('distance to core promoter')
# tidy up
fig.tight_layout(h_pad=1.8, w_pad=1.8);
ax = fig.add_axes([0, 0, 1, 1])
ax.set_axis_off()
ax.axvline(.28, lw=1, color='k')
ax.axhline(.55, xmin=.28, xmax=1, lw=1, color='k')
ax.text(.25, .97, 'A', fontsize=12, fontweight='bold', ha='right', va='top');
ax.text(.7, .97, 'B', fontsize=12, fontweight='bold', ha='center', va='top');
ax.text(.7, .52, 'C', fontsize=12, fontweight='bold', ha='center', va='top');
fig.savefig('../artwork/main/fig_indels_lores.jpeg', jpeg_quality=100, dpi=200)
In [ ]: