In [1]:
from __future__ import division, print_function
%run _shared_setup.ipynb
%run _plotting_setup.ipynb

In [2]:
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, variant_filter='FILTER_PASS & ~is_snp')


2014-11-06 11:05:46.903128 :: loading /home/aliman/src/github/malariagen/pf-crosses/data/public/20141022/3d7_hb3.combined.final.npz
2014-11-06 11:05:47.457728 :: filter variants: excluding 15545 (36.8%) retaining 26699 (63.2%) of 42244 variants
2014-11-06 11:05:47.575347 :: loading /home/aliman/src/github/malariagen/pf-crosses/data/public/20141022/hb3_dd2.combined.final.npz
2014-11-06 11:05:48.134760 :: filter variants: excluding 15335 (41.5%) retaining 21576 (58.5%) of 36911 variants
2014-11-06 11:05:48.308804 :: loading /home/aliman/src/github/malariagen/pf-crosses/data/public/20141022/7g8_gb4.combined.final.npz
2014-11-06 11:05:49.010352 :: filter variants: excluding 14696 (42.3%) retaining 20079 (57.7%) of 34775 variants

Indel size distribution


In [3]:
def plot_indel_size_stacked_hist(ax, STR, svlen, major_step, minor_step, minsize=-30, maxsize=30):
    x = [svlen[~STR], svlen[STR]]
    ax.hist(x, 
            bins=np.arange(minsize, maxsize, 1), 
            histtype='bar', 
            stacked=1, 
            color=['w', 'k'], 
            edgecolor='k', 
            linewidth=.5, 
            alpha=1,
            align='left'
            )
#     ax.spines['top'].set_visible(False)
#     ax.spines['left'].set_visible(False)
#     ax.spines['right'].set_visible(False)
    sns.despine(ax=ax, offset=5, left=True)
    ax.xaxis.tick_bottom()
    ax.set_xlim(minsize-1, maxsize+1)
    major_locator = mpl.ticker.MultipleLocator(major_step)
    minor_locator = mpl.ticker.MultipleLocator(minor_step)
    ax.xaxis.set_major_locator(major_locator)
    ax.xaxis.set_minor_locator(minor_locator)
    ax.set_yticks([])

In [4]:
fig, ax = plt.subplots()
variants, _, _ = unpack_callset(callsets['7g8_gb4'])
variants = variants[(variants.num_alleles == 2) & (variants.CDSAnnotationID != '.')]
plot_indel_size_stacked_hist(ax, variants.STR, variants.svlen[:, 0], major_step=6, minor_step=3)



In [5]:
fig, ax = plt.subplots()
variants, _, _ = unpack_callset(callsets['7g8_gb4'])
variants = variants[(variants.CDSAnnotationID != '.')]
plot_indel_size_stacked_hist(ax, variants.STR, variants.svlen[:, 0] - variants.svlen[:, 1], major_step=6, minor_step=3)



In [6]:
variants_all = np.concatenate([callsets[cross]['variants'] for cross in CROSSES]).view(np.recarray)
variants_all.size


Out[6]:
68354

In [7]:
fig = plt.figure(figsize=(4, 7))

# coding
ax = fig.add_subplot(2, 1, 1)
variants = variants_all[variants_all.CDSAnnotationID != '.']
STR = variants.STR
svlen = variants.svlen[:, 0] - variants.svlen[:, 1]
plot_indel_size_stacked_hist(ax, STR, svlen, major_step=6, minor_step=3, minsize=-33, maxsize=33)

# non-coding
ax = fig.add_subplot(2, 1, 2)
variants = variants_all[variants_all.CDSAnnotationID == '.']
STR = variants.STR
svlen = variants.svlen[:, 0] - variants.svlen[:, 1]
plot_indel_size_stacked_hist(ax, STR, svlen, major_step=8, minor_step=2, minsize=-33, maxsize=33)


Coding indel mutations


In [8]:
# following Lesk, Introduction to Bioinformatics, see http://www.bioinformatics.nl/~berndb/aacolour.html
AA_COLOURS = dict()
# small nonpolar
for aa in 'G', 'A', 'S', 'T':
    AA_COLOURS[aa] = 'orange'
# hydrophobic
for aa in 'C', 'V', 'I', 'L', 'P', 'F', 'Y', 'M', 'W':
    AA_COLOURS[aa] = 'green'
# polar
for aa in 'N', 'Q', 'H':
    AA_COLOURS[aa] = 'magenta'
# negatively charged
for aa in 'D', 'E':
    AA_COLOURS[aa] = 'red'
# positively charged
for aa in 'K', 'R':
    AA_COLOURS[aa] = 'blue'
# other
AA_COLOURS['*'] = 'black'


def plot_insertion_aa(ax, Amino_Acid_Change, n_labels=15):
    prog = re.compile(r'\d+')
    aa_changes = np.array([prog.split(s) if prog.search(s) is not None else ('.', '.') for s in Amino_Acid_Change])
    aa_ins = np.array([after if before != '.' else '' for (before, after) in aa_changes])
    c = collections.Counter()
    for x in aa_ins:
        c.update(x.replace('?', '').replace('-', ''))
    aas, values = zip(*c.most_common())
    labels = list(aas[:n_labels]) + ([''] * (len(aas)-n_labels))
    ax.pie(values, labels=labels, colors=[AA_COLOURS[a] for a in aas], shadow=0, startangle=90)
    
    
def plot_deletion_aa(ax, Amino_Acid_Change, n_labels=15):
    prog = re.compile(r'\d+')
    aa_changes = np.array([prog.split(s) if prog.search(s) is not None else ('.', '.') for s in Amino_Acid_Change])
    aa_ins = np.array([before if after != '.' else '' for (before, after) in aa_changes])
    c = collections.Counter()
    for x in aa_ins:
        c.update(x.replace('?', '').replace('-', ''))
    aas, values = zip(*c.most_common())
    labels = list(aas[:n_labels]) + ([''] * (len(aas)-n_labels))
    ax.pie(values, labels=labels, colors=[AA_COLOURS[a] for a in aas], shadow=0, startangle=90)

In [9]:
fig = plt.figure(figsize=(6, 3))
ax = fig.add_subplot(121)
variants = variants_all[(variants_all.num_alleles == 2) & (variants_all.CDSAnnotationID != '.') & (variants_all.svlen[:, 0] > 0)]
plot_insertion_aa(ax, variants.EFF.Amino_Acid_Change)

ax = fig.add_subplot(122)
variants = variants_all[(variants_all.num_alleles == 2) & (variants_all.CDSAnnotationID != '.') & (variants_all.svlen[:, 0] < 0)]
plot_deletion_aa(ax, variants.EFF.Amino_Acid_Change)
fig.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0)


Intergenic indels & core promoters


In [10]:
df_core_promoter_diversity_50 = pandas.read_pickle(os.path.join(CACHE_DIR, 'df_core_promoter_diversity_50'))
df_core_promoter_diversity_50.head()


Out[10]:
feature_chrom feature_start feature_stop feature_length site_dist snp_diversity_3d7_hb3 indel_str_diversity_3d7_hb3 indel_nostr_diversity_3d7_hb3 snp_diversity_hb3_dd2 indel_str_diversity_hb3_dd2 indel_nostr_diversity_hb3_dd2 snp_diversity_7g8_gb4 indel_str_diversity_7g8_gb4 indel_nostr_diversity_7g8_gb4 snp_diversity_all indel_str_diversity_all indel_nostr_diversity_all
0 Pf3D7_01_v3 95941 95991 50 -2575 0 0 0 0 0 0 0 0 0 0 0 0
1 Pf3D7_01_v3 95948 95998 50 2425 0 0 0 0 0 0 0 0 0 0 0 0
2 Pf3D7_01_v3 95991 96041 50 -2525 0 0 0 0 0 0 0 0 0 0 0 0
3 Pf3D7_01_v3 95998 96048 50 2375 0 0 0 0 0 0 0 0 0 0 0 0
4 Pf3D7_01_v3 96041 96091 50 -2475 0 0 0 0 0 0 0 0 0 0 0 0

5 rows × 17 columns


In [11]:
xlim = -500, 500
ylim = 0, .002
markersize = 20
xlabel = 'distance to core promoter (bp)'

fig = plt.figure(figsize=(4, 7))
fig.suptitle('C', fontsize=12, fontweight='bold', y=.95)

df = df_core_promoter_diversity_50
x = df.site_dist

ax = fig.add_subplot(2, 1, 1)
sns.despine(offset=5, ax=ax)
y = df.indel_str_diversity_all
ax.axhline(np.mean(y), color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize))
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
ax.text(1, .1, 'Indels (STR)', transform=ax.transAxes, ha='right', va='bottom')
ax.set_ylabel('diversity (bp$^{-1}$)')
ax.set_xticklabels([])
ax.set_xlabel('')

ax = fig.add_subplot(2, 1, 2)
sns.despine(offset=5, ax=ax)
y = df.indel_nostr_diversity_all
ax.axhline(np.mean(y), color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize))
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
ax.text(1, .9, 'Indels (non-STR)', transform=ax.transAxes, ha='right', va='top')
ax.set_ylabel('diversity (bp$^{-1}$)')
ax.set_xlabel(xlabel)

#fig.savefig('../artwork/fig_indels_c.jpeg', dpi=1200, jpeg_quality=100, bbox_inches='tight');


Assemble figure


In [32]:
fig = plt.figure(figsize=(8, 5))

# plot coding indel size distribution
ax = fig.add_subplot(2, 3, 1)
variants = variants_all[variants_all.CDSAnnotationID != '.']
STR = variants.STR
svlen = variants.svlen[:, 0] - variants.svlen[:, 1]
plot_indel_size_stacked_hist(ax, STR, svlen, major_step=10, minor_step=1, minsize=-33, maxsize=33)
ax.set_ylabel('coding indels')

p = plt.Rectangle((0, 0), 1, 1, fc="k")
q = plt.Rectangle((0, 0), 1, 1, fc="w")
ax.legend([p, q], ['STR', 'non-STR'], 
          ncol=2, loc='upper left', bbox_to_anchor=(0, -.2))

# plot non-coding indel size distribution
ax = fig.add_subplot(2, 3, 4)
variants = variants_all[variants_all.CDSAnnotationID == '.']
STR = variants.STR
svlen = variants.svlen[:, 0] - variants.svlen[:, 1]
plot_indel_size_stacked_hist(ax, STR, svlen, major_step=10, minor_step=1, minsize=-33, maxsize=33)
ax.set_xlabel('indel size')
ax.set_ylabel('non-coding indels')

# plot coding insertion mutations
ax = fig.add_subplot(2, 3, 2)
variants = variants_all[(variants_all.num_alleles == 2) & (variants_all.CDSAnnotationID != '.') & (variants_all.svlen[:, 0] > 0)]
plot_insertion_aa(ax, variants.EFF.Amino_Acid_Change)
ax.set_title('amino acids inserted')

# plot coding deletion mutations
ax = fig.add_subplot(2, 3, 3)
variants = variants_all[(variants_all.num_alleles == 2) & (variants_all.CDSAnnotationID != '.') & (variants_all.svlen[:, 0] < 0)]
plot_deletion_aa(ax, variants.EFF.Amino_Acid_Change)
ax.set_title('amino acids deleted')

# plot STR indel diversity relative to core promoters
ax = fig.add_subplot(2, 3, 5)
sns.despine(offset=5, ax=ax)
y = df.indel_str_diversity_all
ax.axhline(np.mean(y), color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize), n_boot=1000)
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
ax.set_title('intergenic indels (STR)')
ax.set_ylabel('diversity (bp$^{-1}$)')
ax.set_xlabel('distance to core promoter')

# plot non-STR indel diversity relative to core promoters
ax = fig.add_subplot(2, 3, 6)
sns.despine(offset=5, ax=ax, left=True)
y = df.indel_nostr_diversity_all
ax.axhline(np.mean(y), color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize), n_boot=1000)
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
ax.set_yticks([])
ax.set_ylabel('')
ax.set_title('intergenic indels (non-STR)')
ax.set_xlabel('distance to core promoter')

# tidy up
fig.tight_layout(h_pad=1.8, w_pad=1.8);

ax = fig.add_axes([0, 0, 1, 1])
ax.set_axis_off()
ax.axvline(.28, lw=1, color='k')
ax.axhline(.55, xmin=.28, xmax=1, lw=1, color='k')
ax.text(.25, .97, 'A', fontsize=12, fontweight='bold', ha='right', va='top');
ax.text(.7, .97, 'B', fontsize=12, fontweight='bold', ha='center', va='top');
ax.text(.7, .52, 'C', fontsize=12, fontweight='bold', ha='center', va='top');

fig.savefig('../artwork/main/fig_indels_lores.jpeg', jpeg_quality=100, dpi=200)



In [ ]: