In [1]:
    
%run ../../shared_setup.ipynb
    
    
    
In [2]:
    
tbl_regions_1b.display(5, caption='Genome regions classification')
    
    
In [3]:
    
tbl_regions_total_size = (
    tbl_regions_1b
    .aggregate('region_type', sum, 'region_size')
    .rename('value', 'total_size')
)
lkp_regions_total_size = tbl_regions_total_size.lookupone('region_type', 'total_size') 
tbl_regions_total_size.displayall(caption='Total size of region types')
    
    
In [4]:
    
max_chr_size = max(len(fasta[str(chrom, 'ascii')]) for chrom in CHROMOSOMES)
fig = plt.figure(figsize=(7, 9))
gs = mpl.gridspec.GridSpec(4*14, 1, height_ratios=[1, 1, .5, .5]*14)
gs.update(hspace=0)
for i, chrom in enumerate(CHROMOSOMES):
    chrom = str(chrom, 'ascii')
    
    ax = fig.add_subplot(gs[i*4+0])
    plot_hp1(ax, chrom)
    ax.set_xlim(-5000, max_chr_size)
    
    ax = fig.add_subplot(gs[i*4+1])
    plot_accessibility(ax, chrom, linewidth=.5)
    ax.set_xlim(-5000, max_chr_size)
    chrnum = int(chrom[6:8])
    ax.set_ylabel(chrnum, ha='right', va='center', rotation=0)
    
    ax = fig.add_subplot(gs[i*4+2])
    plot_genes(ax, chrom, hypervariable_color='red', normal_color='#bbbbbb', width=.5, divider_linewidth=.5)
    ax.spines['bottom'].set_visible(False)
    ax.set_xlim(-5000, max_chr_size)
    ax.set_xticks([])
    
fig.tight_layout()
# legend
ax = fig.add_axes([0, 0, 1, 1])
ax.set_axis_off()
ax.set_xticks([])
ax.set_yticks([])
region_types = 'Core', 'InternalHypervariable', 'SubtelomericHypervariable', 'SubtelomericRepeat', 'Centromere'
leg_artists = [plt.Rectangle((0, 0), 1, 1, fc=accessibility_colors[r]) for r in region_types]
leg_labels = ['%s (%.1fMb)' % (r, lkp_regions_total_size[r]*1./1e6) for r in region_types]
ax.legend(leg_artists, leg_labels, bbox_to_anchor=(1, 1), loc='upper right', borderaxespad=0.1)
fn = '../../artwork/supp/genome_regions_map.{dpi}.{fmt}'
for fmt in 'jpeg', 'png':
    for dpi in 120, 300:
        fig.savefig(fn.format(dpi=dpi, fmt=fmt), dpi=dpi, jpeg_quality=100)
    
    
In [5]:
    
tbl_coverage_summary_by_region = etl.fromtsv(os.path.join(PUBLIC_DIR, 'tbl_coverage_summary_by_region.wg.txt')).convertnumbers()
tbl_coverage_summary_by_region.eq('region', 'Core').displayall()
    
    
In [6]:
    
tbl_mq0_summary_by_region = etl.fromtsv(os.path.join(PUBLIC_DIR, 'tbl_mq0_summary_by_region.wg.txt')).convertnumbers()
tbl_mq0_summary_by_region.eq('region', 'Core').displayall()
    
    
In [7]:
    
LABELS
    
    Out[7]:
In [8]:
    
tbl1 = tbl_coverage_summary_by_region.convert('run', LABELS).rename('pc_not_covered', 'pc_positions').addfield('state', 'no_coverage')
tbl2 = tbl_mq0_summary_by_region.convert('run', LABELS).rename('pc_mq0', 'pc_positions').addfield('state', 'mq0')
df_summary_by_region = tbl1.cat(tbl2).rename('run', 'sample').todataframe()
df_summary_by_region.head()
    
    Out[8]:
In [9]:
    
region_colours = [accessibility_colors[r] for r in region_types]
region_types = 'Core', 'InternalHypervariable', 'SubtelomericHypervariable', 'SubtelomericRepeat'
parent_samples = ['3D7', 'HB3(1)', 'HB3(2)', 'Dd2', '7G8', 'GB4']
fg = sns.factorplot(x='sample', y='pc_positions', hue='region', data=df_summary_by_region, kind='bar',
                    hue_order=region_types, col='state',
                    x_order=parent_samples,
                    palette=region_colours, legend=False)
fg.fig.set_size_inches(7, 3)
fg.fig.tight_layout(w_pad=3)
fg.fig.subplots_adjust(bottom=.4)
# legend
ax = fg.fig.add_subplot(111)
#ax = plt.gca()
ax.set_axis_off()
ax.set_xticks([])
ax.set_yticks([])
leg_artists = [plt.Rectangle((0, 0), 1, 1, fc=accessibility_colors[r]) for r in region_types]
leg_labels = ['%s (%.1fMb)' % (r, lkp_regions_total_size[r]*1./1e6) for r in region_types]
ax.legend(leg_artists, leg_labels, bbox_to_anchor=(0.5, -0.4), loc='upper center', borderaxespad=0, fontsize=8, ncol=2)
fn = '../../artwork/supp/genome_regions_summary.{dpi}.{fmt}'
for fmt in 'jpeg', 'png':
    for dpi in 120, 300:
        fg.fig.savefig(fn.format(dpi=dpi, fmt=fmt), dpi=dpi, jpeg_quality=100)
    
    
    
In [10]:
    
bam_fn_template = '/data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/bam/{run}.realigned.bam'
stats_fn_template = '/data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/cache/{run}.{chrom}.{stats}.npy'
    
In [11]:
    
def load_stats(run, chrom, stats, requires_fasta=False):
    stats_fn = stats_fn_template.format(run=run, chrom=chrom, stats=stats)
    if os.path.exists(stats_fn):
        a = np.load(stats_fn, mmap_mode='r').view(np.recarray)
    else:
        f = getattr(pysamstats, 'load_' + stats)
        bam_fn = bam_fn_template.format(run=run)
        bam = pysam.AlignmentFile(bam_fn)
        if requires_fasta:
            fasta = pysam.FastaFile(FASTA_FN)
            a = f(bam, fasta, chrom=chrom)
        else:
            a = f(bam, chrom=chrom)
        np.save(stats_fn, a)
    return a
    
In [12]:
    
load_stats('ERR019054', 'Pf3D7_01_v3', 'mapq')
    
    Out[12]:
In [13]:
    
load_stats('ERR019054', 'Pf3D7_01_v3', 'variation', requires_fasta=True)
    
    Out[13]:
In [14]:
    
# tbl_samples = (etl
#     .fromtsv('/data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/samples.txt')
#     .addfield('ID', lambda row: '%s/%s/%s' % (row.clone, row.sample, row.run))
# )
# tbl_samples
    
In [15]:
    
# run2id = tbl_samples.lookupone('run', 'ID')
    
In [16]:
    
def plot_accessibility_diagnostics(run, chrom, figsize=(7, 7)):
    
    chrlen = len(fasta[chrom])
    
    coverage_ext = load_stats(run, chrom, 'coverage_ext')
    mapq = load_stats(run, chrom, 'mapq')
    variation = load_stats(run, chrom, 'variation', requires_fasta=True)
    
    fig = plt.figure(figsize=figsize)
    
    rows = 6
    i = 0
    
    i += 1
    ax = fig.add_subplot(rows, 1, i)
    sns.despine(ax=ax, bottom=True, top=False, offset=5)
    x = coverage_ext.pos
    y = coverage_ext.reads_all 
#     ax.fill_between(x, 0, y, lw=.1, color='k')
    ax.plot(x, y, lw=.2, color='k')
    ax.set_xlim(0, chrlen)
    ax.set_ylim(0, 400)
    ax.set_ylabel('DP', rotation=0, ha='right')
    ax.set_yticks([0, 100, 200, 300])
    ax.set_yticklabels(['0', '100X', '200X', '300X'])
    ax.set_xlabel('position (bp)')
    ax.xaxis.set_label_position('top')
    
    i += 1
    ax = fig.add_subplot(rows, 1, i)
    sns.despine(ax=ax, bottom=True, offset=5)
    ax.set_xticks([])
    x = coverage_ext.pos
    y = coverage_ext.reads_pp * 100 / coverage_ext.reads_all
#     ax.fill_between(x, 100, y, lw=.1, color='k')
    ax.plot(x, y, lw=.2, color='k')
    ax.set_xlim(0, chrlen)
    ax.set_ylim(0, 102)
    ax.set_yticks([0, 100])
    ax.set_ylabel('PP', rotation=0, ha='right')
    
    i += 1
    ax = fig.add_subplot(rows, 1, i)
    sns.despine(ax=ax, bottom=True, offset=5)
    ax.set_xticks([])
    x = mapq.pos
    y = mapq.rms_mapq
#     ax.fill_between(x, 0, y, lw=.1, color='k')
    ax.plot(x, y, lw=.2, color='k')
    ax.set_xlim(0, chrlen)
    ax.set_ylim(0, 62)
    ax.set_yticks([0, 60])
    ax.set_ylabel('MQ', rotation=0, ha='right')
    i += 1
    ax = fig.add_subplot(rows, 1, i)
    x = mapq.pos
    y = mapq.reads_mapq0 * 100 / mapq.reads_all
#     ax.fill_between(x, 0, y, lw=.1, color='k')
    ax.plot(x, y, lw=.2, color='k')
    ax.set_xlim(0, chrlen)
    ax.set_ylim(0, 102)
    ax.set_yticks([0, 100])
    ax.set_ylabel('MQ0', rotation=0, ha='right')
    sns.despine(ax=ax, bottom=True, offset=5)
    ax.set_xticks([])
    
    i += 1
    ax = fig.add_subplot(rows, 1, i)
    x = variation.pos
    y = variation.mismatches * 100 / variation.reads_all
#     ax.fill_between(x, 0, y, lw=.1, color='k')
    ax.plot(x, y, lw=.05, color='k')
    ax.set_xlim(0, chrlen)
    ax.set_ylim(0, 102)
    ax.set_yticks([0, 100])
    ax.set_ylabel('MIS', rotation=0, ha='right')
    sns.despine(ax=ax, bottom=True, offset=5)
    ax.set_xticks([])
    
    i += 1
    ax = fig.add_subplot(2*rows, 1, 2*i-1)
    sns.despine(ax=ax, bottom=True, offset=5)
    ax.set_xticks([])
    plot_genes(ax, chrom, hypervariable_color='red', normal_color='#bbbbbb', width=.5, divider_linewidth=.5)
    ax.set_xlim(0, chrlen)
    ax.set_ylabel('genes', rotation=0, ha='right')
    ax = fig.add_subplot(2*rows, 1, 2*i)
    sns.despine(ax=ax, bottom=False, offset=5)
    plot_accessibility(ax, chrom, linewidth=.5)
    ax.set_xlim(0, chrlen)
    ax.set_ylabel('classification', rotation=0, ha='right')
    fig.suptitle('sample %s, chromosome %s' % (run2id[run], chrom), fontsize=12, fontweight='bold', y=.98, va='top')
    
    # legend
    region_types = 'Core', 'InternalHypervariable', 'SubtelomericHypervariable', 'SubtelomericRepeat', 'Centromere'
    leg_artists = [plt.Rectangle((0, 0), 1, 1, fc=accessibility_colors[r]) for r in region_types]
    leg_labels = ['%s (%.1fMb)' % (r, lkp_regions_total_size[r]*1./1e6) for r in region_types]
    ax.legend(leg_artists, leg_labels, bbox_to_anchor=(0.5, -0.3), loc='upper center', borderaxespad=0, fontsize=8, ncol=2)
    fig.tight_layout()
    fig.subplots_adjust(hspace=.2, top=.83, bottom=.15)
    
    fn = '../../artwork/supp/genome_regions_example.{dpi}.{fmt}'
    for fmt in 'jpeg', 'png':
        for dpi in 120, 300:
            fig.savefig(fn.format(dpi=dpi, fmt=fmt), dpi=dpi, jpeg_quality=100)
    
plot_accessibility_diagnostics('ERR019054', 'Pf3D7_04_v3', figsize=(7, 6))
    
    
In [17]:
    
fig, ax = plt.subplots(figsize=(8, .5))
plot_accessibility(ax, 'Pf3D7_04_v3')
    
    
In [18]:
    
fig, ax = plt.subplots(figsize=(8, .5))
plot_hp1(ax, 'Pf3D7_04_v3')
    
    
In [19]:
    
fig, ax = plt.subplots(figsize=(8, .5))
chrom = 'Pf3D7_04_v3'
plot_genes(ax, chrom, hypervariable_color='r', normal_color='#bbbbbb')
    
    
In [ ]: