Study diversity in relation to feature types, core promoter position, and other phenomena.


In [1]:
%run ../../shared_setup.ipynb


docker image cggh/biipy:v1.6.0

Diversity by feature type


In [2]:
tbl_regions_1b


Out[2]:
0|region_chrom 1|region_start 2|region_stop 3|region_type 4|region_size
Pf3D7_01_v3 1 27336 SubtelomericRepeat 27336
Pf3D7_01_v3 27337 92900 SubtelomericHypervariable 65564
Pf3D7_01_v3 92901 457931 Core 365031
Pf3D7_01_v3 457932 460311 Centromere 2380
Pf3D7_01_v3 460312 575900 Core 115589

...


In [3]:
tbl_features


Out[3]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_strand 6|feature_id 7|feature_parent_id 8|feature_name 9|feature_previous_id 10|feature_region_type 11|feature_region_size
Pf3D7_01_v3 repeat_region 1 360 359 + Pfalciparum_REP_20 None None None SubtelomericRepeat 27336
Pf3D7_01_v3 repeat_region 361 1418 1057 + Pfalciparum_REP_15 None None None SubtelomericRepeat 27336
Pf3D7_01_v3 repeat_region 2160 3858 1698 + Pfalciparum_REP_35 None None None SubtelomericRepeat 27336
Pf3D7_01_v3 repeat_region 8856 9021 165 + Pfalciparum_REP_5 None None None SubtelomericRepeat 27336
Pf3D7_01_v3 repeat_region 9313 9529 216 + Pfalciparum_REP_25 None None None SubtelomericRepeat 27336

...


In [4]:
tbl_features.valuecounts('feature_type').displayall()


0|feature_type 1|count 2|frequency
CDS 14334 0.4532776776396926
polypeptide 5525 0.17471460645732537
gene 5449 0.17231129241374948
mRNA 5367 0.16971824305094393
pseudogenic_exon 389 0.012301173196723903
pseudogenic_transcript 139 0.0043955348954874615
pseudogene 139 0.0043955348954874615
repeat_region 58 0.0018341080858868546
ncRNA 58 0.0018341080858868546
tRNA 45 0.0014230148942225595
polypeptide_motif 39 0.001233279574992885
snoRNA 38 0.0012016570217879391
rRNA 26 0.0008221863833285899
centromere 13 0.00041109319166429497
snRNA 4 0.00012649021281978308

In [5]:
tbl_genes


Out[5]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_strand 6|feature_id 7|feature_name 8|feature_previous_id 9|feature_region_type 10|feature_region_size
Pf3D7_01_v3 gene 29510 37126 7616 + PF3D7_0100100 VAR PFA0005w SubtelomericHypervariable 65564
Pf3D7_01_v3 gene 38982 40207 1225 - PF3D7_0100200 RIF PFA0010c SubtelomericHypervariable 65564
Pf3D7_01_v3 gene 42367 46507 4140 - PF3D7_0100300 None PFA0015c SubtelomericHypervariable 65564
Pf3D7_01_v3 gene 50363 51636 1273 + PF3D7_0100400 RIF PFA0020w SubtelomericHypervariable 65564
Pf3D7_01_v3 pseudogene 53169 53280 111 - PF3D7_0100500 var pseudogene, exon 1 PFA0025c SubtelomericHypervariable 65564

...


In [6]:
df = tbl_genes.cut('feature_strand', 'feature_length').todataframe()
df.boxplot('feature_length', 'feature_strand', grid=False)
plt.suptitle('Genes');



In [7]:
tbl_intergenic


Out[7]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_id 6|feature_region_size 7|feature_region_type 8|region_size
Pf3D7_01_v3 intergenic_0 37126 38982 1856 PF3D7_0100100_PF3D7_0100200 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 40207 42367 2160 PF3D7_0100200_PF3D7_0100300 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_2 46507 50363 3856 PF3D7_0100300_PF3D7_0100400 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_0 51636 53169 1533 PF3D7_0100400_PF3D7_0100500 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 53280 53778 498 PF3D7_0100500_PF3D7_0100600 65564 SubtelomericHypervariable 65564

...


In [8]:
df = tbl_intergenic.cut('feature_length', 'feature_type').todataframe()
df.boxplot('feature_length', 'feature_type', grid=False)
plt.ylim(0, 8000)
plt.suptitle('Intergenic');



In [9]:
tbl_exons


Out[9]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_strand 6|feature_id 7|feature_parent_id 8|feature_name 9|feature_previous_id 10|feature_region_type 11|feature_region_size
Pf3D7_01_v3 CDS 29510 34762 5252 + PF3D7_0100100.1:exon:1 PF3D7_0100100.1 None None SubtelomericHypervariable 65564
Pf3D7_01_v3 CDS 35888 37126 1238 + PF3D7_0100100.1:exon:2 PF3D7_0100100.1 None None SubtelomericHypervariable 65564
Pf3D7_01_v3 CDS 38982 39923 941 - PF3D7_0100200.1:exon:1 PF3D7_0100200.1 None None SubtelomericHypervariable 65564
Pf3D7_01_v3 CDS 40154 40207 53 - PF3D7_0100200.1:exon:2 PF3D7_0100200.1 None None SubtelomericHypervariable 65564
Pf3D7_01_v3 CDS 42367 43617 1250 - PF3D7_0100300.1:exon:1 PF3D7_0100300.1 None None SubtelomericHypervariable 65564

...


In [10]:
df = tbl_exons.cut('feature_strand', 'feature_length').todataframe()
df.boxplot('feature_length', 'feature_strand', grid=False)
plt.suptitle('Exons');



In [11]:
tbl_introns


Out[11]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_strand 6|feature_id 7|feature_parent_id 8|feature_name 9|feature_previous_id 10|feature_region_size 11|feature_region_type 12|region_size
Pf3D7_01_v3 intron 34762 35888 1126 + PF3D7_0100100.1:exon:1_PF3D7_0100100.1:exon:2 PF3D7_0100100.1 None None 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intron 39923 40154 231 - PF3D7_0100200.1:exon:1_PF3D7_0100200.1:exon:2 PF3D7_0100200.1 None None 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intron 43617 43775 158 - PF3D7_0100300.1:exon:1_PF3D7_0100300.1:exon:2 PF3D7_0100300.1 None None 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intron 50416 50572 156 + PF3D7_0100400.1:exon:1_PF3D7_0100400.1:exon:2 PF3D7_0100400.1 None None 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intron 54788 54938 150 - PF3D7_0100600.1:exon:1_PF3D7_0100600.1:exon:2 PF3D7_0100600.1 None None 65564 SubtelomericHypervariable 65564

...


In [12]:
df = tbl_introns.cut('feature_strand', 'feature_length').todataframe()
df.boxplot('feature_length', 'feature_strand', grid=False)
plt.suptitle('Introns');



In [13]:
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, variant_filter='FILTER_PASS')
callsets_snp = filter_callsets(callsets, variant_filter='is_snp')
callsets_indel = filter_callsets(callsets, variant_filter='~is_snp')
callsets_indel_str = filter_callsets(callsets, variant_filter='~is_snp & STR')
callsets_indel_nostr = filter_callsets(callsets, variant_filter='~is_snp & ~STR')


2016-03-08 21:11:05.683271 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/3d7_hb3.combined.final.npz
2016-03-08 21:11:06.039263 :: filter variants: excluding 157 (0.4%) retaining 42087 (99.6%) of 42244 variants
2016-03-08 21:11:06.058451 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/hb3_dd2.combined.final.npz
2016-03-08 21:11:06.473692 :: filter variants: excluding 450 (1.2%) retaining 36461 (98.8%) of 36911 variants
2016-03-08 21:11:06.491738 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/7g8_gb4.combined.final.npz
2016-03-08 21:11:06.976072 :: filter variants: excluding 304 (0.9%) retaining 34471 (99.1%) of 34775 variants
2016-03-08 21:11:07.001920 :: filter variants: excluding 26699 (63.4%) retaining 15388 (36.6%) of 42087 variants
2016-03-08 21:11:07.010012 :: filter variants: excluding 20079 (58.2%) retaining 14392 (41.8%) of 34471 variants
2016-03-08 21:11:07.025142 :: filter variants: excluding 21576 (59.2%) retaining 14885 (40.8%) of 36461 variants
2016-03-08 21:11:07.041564 :: filter variants: excluding 15388 (36.6%) retaining 26699 (63.4%) of 42087 variants
2016-03-08 21:11:07.055913 :: filter variants: excluding 14392 (41.8%) retaining 20079 (58.2%) of 34471 variants
2016-03-08 21:11:07.072491 :: filter variants: excluding 14885 (40.8%) retaining 21576 (59.2%) of 36461 variants
2016-03-08 21:11:07.090631 :: filter variants: excluding 19736 (46.9%) retaining 22351 (53.1%) of 42087 variants
2016-03-08 21:11:07.107698 :: filter variants: excluding 18167 (52.7%) retaining 16304 (47.3%) of 34471 variants
2016-03-08 21:11:07.125817 :: filter variants: excluding 18816 (51.6%) retaining 17645 (48.4%) of 36461 variants
2016-03-08 21:11:07.142836 :: filter variants: excluding 37739 (89.7%) retaining 4348 (10.3%) of 42087 variants
2016-03-08 21:11:07.146887 :: filter variants: excluding 30696 (89.0%) retaining 3775 (11.0%) of 34471 variants
2016-03-08 21:11:07.153330 :: filter variants: excluding 32530 (89.2%) retaining 3931 (10.8%) of 36461 variants

In [14]:
def tabulate_diversity(tbl, callset, field='diversity'):
    
    V, _, _ = unpack_callset(callset)
    CHROM = V.CHROM
    POS = V.POS
    
    def add_diversity(row):
        start_index, stop_index = locate_variants(CHROM, POS, row.feature_chrom, row.feature_start, row.feature_stop)
        n = stop_index - start_index
        return n / row.feature_length
    
    return tbl.addfield(field, add_diversity)
    
    
etl.Table.tabulate_diversity = tabulate_diversity


def add_diversity_fields(tbl):
    return (tbl
        .tabulate_diversity(
            callsets_snp['3d7_hb3'],
            'snp_diversity_3d7_hb3'
        )
        .tabulate_diversity(
            callsets_indel['3d7_hb3'],
            'indel_diversity_3d7_hb3'
        )
        .tabulate_diversity(
            callsets_indel_str['3d7_hb3'],
            'indel_str_diversity_3d7_hb3'
        )
        .tabulate_diversity(
            callsets_indel_nostr['3d7_hb3'],
            'indel_nostr_diversity_3d7_hb3'
        )
        .tabulate_diversity(
            callsets_snp['hb3_dd2'],
            'snp_diversity_hb3_dd2'
        )
        .tabulate_diversity(
            callsets_indel['hb3_dd2'],
            'indel_diversity_hb3_dd2'
        )
        .tabulate_diversity(
            callsets_indel_str['hb3_dd2'],
            'indel_str_diversity_hb3_dd2'
        )
        .tabulate_diversity(
            callsets_indel_nostr['hb3_dd2'],
            'indel_nostr_diversity_hb3_dd2'
        )
        .tabulate_diversity(
            callsets_snp['7g8_gb4'],
            'snp_diversity_7g8_gb4'
        )
        .tabulate_diversity(
            callsets_indel['7g8_gb4'],
            'indel_diversity_7g8_gb4'
        )
        .tabulate_diversity(
            callsets_indel_str['7g8_gb4'],
            'indel_str_diversity_7g8_gb4'
        )
        .tabulate_diversity(
            callsets_indel_nostr['7g8_gb4'],
            'indel_nostr_diversity_7g8_gb4'
        )
        .addfield('snp_diversity_all', lambda row: sum(row['snp_diversity_%s' % cross] for cross in CROSSES) / 3)
        .addfield('indel_diversity_all', lambda row: sum(row['indel_diversity_%s' % cross] for cross in CROSSES) / 3)
        .addfield('indel_str_diversity_all', lambda row: sum(row['indel_str_diversity_%s' % cross] for cross in CROSSES) / 3)
        .addfield('indel_nostr_diversity_all', lambda row: sum(row['indel_nostr_diversity_%s' % cross] for cross in CROSSES) / 3)
    )


etl.Table.add_diversity_fields = add_diversity_fields

In [15]:
@etlcache
def tbl_feature_diversity():
    return (etl
        .cat(tbl_exons, tbl_introns, tbl_intergenic)
        .eq('feature_region_type', 'Core')
        .cutout('feature_region_type')
        .sort(key=('feature_chrom', 'feature_start', 'feature_type'))
        .add_diversity_fields()
    )

tbl_feature_diversity


Out[15]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_strand 6|feature_id 7|feature_parent_id 8|feature_name 9|feature_previous_id 10|feature_region_size 11|region_size 12|snp_diversity_3d7_hb3 13|indel_diversity_3d7_hb3 14|indel_str_diversity_3d7_hb3 15|indel_nostr_diversity_3d7_hb3 16|snp_diversity_hb3_dd2 17|indel_diversity_hb3_dd2 18|indel_str_diversity_hb3_dd2 19|indel_nostr_diversity_hb3_dd2 20|snp_diversity_7g8_gb4 21|indel_diversity_7g8_gb4 22|indel_str_diversity_7g8_gb4 23|indel_nostr_diversity_7g8_gb4 24|snp_diversity_all 25|indel_diversity_all 26|indel_str_diversity_all 27|indel_nostr_diversity_all
Pf3D7_01_v3 intergenic_1 91420 93113 1693 None PF3D7_0101900_PF3D7_0102000 None None None 65564 365031 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 intergenic_2 93875 95050 1175 None PF3D7_0102000_PF3D7_0102100 None None None 365031 365031 0.0 0.002553191489361702 0.001702127659574468 0.000851063829787234 0.0 0.001702127659574468 0.000851063829787234 0.000851063829787234 0.002553191489361702 0.003404255319148936 0.002553191489361702 0.000851063829787234 0.000851063829787234 0.002553191489361702 0.001702127659574468 0.000851063829787234
Pf3D7_01_v3 intergenic_1 95899 98819 2920 None PF3D7_0102100_PF3D7_0102200 None None None 365031 365031 0.0 0.0006849315068493151 0.0006849315068493151 0.0 0.0 0.0006849315068493151 0.00034246575342465754 0.00034246575342465754 0.0 0.0006849315068493151 0.0006849315068493151 0.0 0.0 0.0006849315068493152 0.0005707762557077626 0.00011415525114155251
Pf3D7_01_v3 CDS 98819 99013 194 + PF3D7_0102200.1:exon:1 PF3D7_0102200.1 None None 365031 None 0.005154639175257732 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.005154639175257732 0.0 0.0 0.0 0.003436426116838488 0.0 0.0 0.0
Pf3D7_01_v3 intron 99013 99220 207 + PF3D7_0102200.1:exon:1_PF3D7_0102200.1:exon:2 PF3D7_0102200.1 None None 365031 365031 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

...


In [16]:
df_feature_diversity = tbl_feature_diversity.cutout('feature_id', 'feature_parent_id').todataframe()
feature_types = sorted(df_feature_diversity.feature_type.unique())
feature_types


Out[16]:
['CDS', 'intergenic_0', 'intergenic_1', 'intergenic_2', 'intron']

In [17]:
def multi_boxplot(df, by, variables, figsize=(10, 2.5), suptitle='', axtitles=None, ylim=None):
    
    # setup figure
    fig = plt.figure(figsize=figsize)
    fig.suptitle(suptitle, fontsize=11, y=1.05)
    n = len(variables)

    # make subplots
    for i, v in enumerate(variables):
        
        # setup subplot
        ax = fig.add_subplot(1, n, i+1)
        sns.despine(offset=5, ax=ax)
        
        # obtain subplot data
        grouped = df.groupby(by)[v]
        xticklabels = [g[0] for g in grouped]
        x = [g[1] for g in grouped]
        
        # plotting
        ax.boxplot(x, showmeans=True, notch=True);
        ax.set_xticklabels(xticklabels, rotation=90)
        ax.set_ylabel('')
        if ylim:
            ax.set_ylim(*ylim)
        if axtitles:
            ax.set_title(axtitles[i])
        else:
            ax.set_title(v)
        if i == 0:
            ax.set_ylabel('diversity (bp$^{-1}$)')
        else:
            ax.set_yticks([])
            ax.spines['left'].set_visible(False)
    
    return fig

In [18]:
by = 'feature_type'
for cross in CROSSES + ('all',):
    variables = [v % cross for v in ('snp_diversity_%s', 'indel_diversity_%s', 'indel_str_diversity_%s', 'indel_nostr_diversity_%s')]
    suptitle = LABELS.get(cross, 'All crosses')
    axtitles = 'SNPs', 'INDELs', 'INDELs (STR)', 'INDELs (non-STR)'
    fig = multi_boxplot(df_feature_diversity, by, variables, ylim=(0, 0.005), suptitle=suptitle, axtitles=axtitles)
#     fig.savefig('../../artwork/suppfig_feature_diversity_%s.jpeg' % cross, jpeg_quality=100, dpi=200, bbox_inches='tight')



In [19]:
import scikits.bootstrap as bootstrap

In [20]:
variable = 'snp_diversity_all'
condition = (df_feature_diversity.feature_type == 'CDS') & (df_feature_diversity.feature_length > 50) 
x = df_feature_diversity[condition][variable] * (10**3)
x.describe(percentiles=[.05, .25, .5, .75, .95]), bootstrap.ci(x)


Out[20]:
(count    12323.000000
 mean         0.555371
 std          2.384540
 min          0.000000
 5%           0.000000
 25%          0.000000
 50%          0.000000
 75%          0.395935
 95%          2.655808
 max        106.354126
 Name: snp_diversity_all, dtype: float64, array([ 0.51836525,  0.60427006]))

In [21]:
variable = 'snp_diversity_all'
condition = (
    (df_feature_diversity.feature_length > 50)
    & (
        (df_feature_diversity.feature_type == 'intergenic_0') 
        | (df_feature_diversity.feature_type == 'intergenic_1') 
        | (df_feature_diversity.feature_type == 'intergenic_2') 
        | (df_feature_diversity.feature_type == 'intron') 
    )
)
x = df_feature_diversity[condition][variable] * (10**3)
x.describe(percentiles=[.05, .25, .5, .75, .95]), bootstrap.ci(x)


Out[21]:
(count    12762.000000
 mean         0.694238
 std          2.019858
 min          0.000000
 5%           0.000000
 25%          0.000000
 50%          0.000000
 75%          0.579165
 95%          3.375101
 max         68.518519
 Name: snp_diversity_all, dtype: float64, array([ 0.66080506,  0.73170366]))

In [22]:
variable = 'indel_diversity_all'
condition = (df_feature_diversity.feature_type == 'CDS') & (df_feature_diversity.feature_length > 50) 
x = df_feature_diversity[condition][variable] * (10**3)
x.describe(percentiles=[.05, .25, .5, .75, .95]), bootstrap.ci(x)


Out[22]:
(count    12323.000000
 mean         0.105829
 std          0.376915
 min          0.000000
 5%           0.000000
 25%          0.000000
 50%          0.000000
 75%          0.000000
 95%          0.677804
 max         13.888889
 Name: indel_diversity_all, dtype: float64, array([ 0.09961339,  0.11287913]))

In [23]:
variable = 'indel_diversity_all'
condition = (
    (df_feature_diversity.feature_length > 50)
    & (
        (df_feature_diversity.feature_type == 'intergenic_0') 
        | (df_feature_diversity.feature_type == 'intergenic_1') 
        | (df_feature_diversity.feature_type == 'intergenic_2') 
        | (df_feature_diversity.feature_type == 'intron') 
    )
)
x = df_feature_diversity[condition][variable] * (10**3)
x.describe(percentiles=[.05, .25, .5, .75, .95]), bootstrap.ci(x)


Out[23]:
(count    12762.000000
 mean         2.975352
 std          2.876797
 min          0.000000
 5%           0.000000
 25%          1.063764
 50%          2.258992
 75%          4.184100
 95%          8.620690
 max         45.454545
 Name: indel_diversity_all, dtype: float64, array([ 2.92409911,  3.02580347]))

Diversity by distance to core promoter


In [24]:
cp_v3_coords_fn = os.path.join(DATA_DIR, 'reference/ponts_2011/cp_v3_coords.bed')

@etlcache
def tbl_core_promoters():
    return (etl
        .fromtsv(cp_v3_coords_fn)
        .cut(0, 1, 3, 4)
        .pushheader(['chrom', 'pos', 'strand', 'gene'])
        .convert('pos', int)
        .intervalleftjoin(tbl_intergenic.prefixheader('parent_'), 
                          lkey='chrom', lstart='pos', lstop='pos',
                          rkey='parent_feature_chrom', rstart='parent_feature_start', rstop='parent_feature_stop', 
                          include_stop=True)
        .eq('parent_feature_region_type', 'Core')
        .cutout('parent_feature_chrom', 'parent_feature_region_type')
        .sort(key=('chrom', 'pos'))
    )

tbl_core_promoters.display(10)
tbl_core_promoters.distinct(key=('parent_feature_id', 'strand')).display(10)


0|chrom 1|pos 2|strand 3|gene 4|parent_feature_type 5|parent_feature_start 6|parent_feature_stop 7|parent_feature_length 8|parent_feature_id 9|parent_feature_region_size 10|parent_region_size
Pf3D7_01_v3 98398 - PFA0100c intergenic_1 95899 98819 2920 PF3D7_0102100_PF3D7_0102200 365031 365031
Pf3D7_01_v3 98541 + PFA0110w intergenic_1 95899 98819 2920 PF3D7_0102100_PF3D7_0102200 365031 365031
Pf3D7_01_v3 116896 - PFA0125c intergenic_1 115799 119041 3242 PF3D7_0102500_PF3D7_0102600 365031 365031
Pf3D7_01_v3 124035 - PFA0130c intergenic_2 121249 124517 3268 PF3D7_0102600_PF3D7_0102700 365031 365031
Pf3D7_01_v3 124100 + PFA0135w intergenic_2 121249 124517 3268 PF3D7_0102600_PF3D7_0102700 365031 365031
Pf3D7_01_v3 128798 - PFA0140c intergenic_1 128141 128960 819 PF3D7_0102800_PF3D7_0102900 365031 365031
Pf3D7_01_v3 134146 - PFA0150c intergenic_1 133624 134353 729 PF3D7_0103000_PF3D7_0103100 365031 365031
Pf3D7_01_v3 134173 + PFA0175w intergenic_1 133624 134353 729 PF3D7_0103000_PF3D7_0103100 365031 365031
Pf3D7_01_v3 139306 - PFA0155c intergenic_1 139257 140662 1405 PF3D7_0103100_PF3D7_0103200 365031 365031
Pf3D7_01_v3 140472 - PFA0155c intergenic_1 139257 140662 1405 PF3D7_0103100_PF3D7_0103200 365031 365031

...

0|chrom 1|pos 2|strand 3|gene 4|parent_feature_type 5|parent_feature_start 6|parent_feature_stop 7|parent_feature_length 8|parent_feature_id 9|parent_feature_region_size 10|parent_region_size
Pf3D7_01_v3 98541 + PFA0110w intergenic_1 95899 98819 2920 PF3D7_0102100_PF3D7_0102200 365031 365031
Pf3D7_01_v3 98398 - PFA0100c intergenic_1 95899 98819 2920 PF3D7_0102100_PF3D7_0102200 365031 365031
Pf3D7_01_v3 116896 - PFA0125c intergenic_1 115799 119041 3242 PF3D7_0102500_PF3D7_0102600 365031 365031
Pf3D7_01_v3 124100 + PFA0135w intergenic_2 121249 124517 3268 PF3D7_0102600_PF3D7_0102700 365031 365031
Pf3D7_01_v3 124035 - PFA0130c intergenic_2 121249 124517 3268 PF3D7_0102600_PF3D7_0102700 365031 365031
Pf3D7_01_v3 128798 - PFA0140c intergenic_1 128141 128960 819 PF3D7_0102800_PF3D7_0102900 365031 365031
Pf3D7_01_v3 134173 + PFA0175w intergenic_1 133624 134353 729 PF3D7_0103000_PF3D7_0103100 365031 365031
Pf3D7_01_v3 134146 - PFA0150c intergenic_1 133624 134353 729 PF3D7_0103000_PF3D7_0103100 365031 365031
Pf3D7_01_v3 140525 + PFA0175w intergenic_1 139257 140662 1405 PF3D7_0103100_PF3D7_0103200 365031 365031
Pf3D7_01_v3 139306 - PFA0155c intergenic_1 139257 140662 1405 PF3D7_0103100_PF3D7_0103200 365031 365031

...


In [25]:
def tabulate_site_proximity_regions(tbl_sites, window_size=100):
    tbl_out = [['feature_chrom', 'feature_start', 'feature_stop', 'feature_length', 'site_pos', 'site_strand', 'site_dist', 'parent_feature_type', 'parent_feature_id']]
    for rec in tbl_sites.records():
        if rec.strand == '+':
            # upstream bins
            # setup first bin, then work upstream
            window_stop = rec.pos
            window_start = window_stop - window_size
            window_center = (window_start + window_stop) // 2
            dist = window_center - rec.pos
            while window_start > rec.parent_feature_start:
                row = [rec.chrom, window_start, window_stop, window_size, rec.pos, rec.strand, dist, rec.parent_feature_type, rec.parent_feature_id]
                tbl_out += [row]
                window_stop = window_start
                window_start = window_stop - window_size
                window_center = (window_start + window_stop) // 2
                dist = window_center - rec.pos
            # downstream bins
            # setup first bin, then work downstream
            window_start = rec.pos
            window_stop = window_start + window_size
            window_center = (window_start + window_stop) // 2
            dist = window_center - rec.pos
            while window_stop < rec.parent_feature_stop:
                row = [rec.chrom, window_start, window_stop, window_size, rec.pos, rec.strand, dist, rec.parent_feature_type, rec.parent_feature_id]
                tbl_out += [row]
                window_start = window_stop
                window_stop = window_start + window_size
                window_center = (window_start + window_stop) // 2
                dist = window_center - rec.pos
        else:
            # upstream bins
            # setup first bin, then work upstream
            window_start = rec.pos
            window_stop = window_start + window_size
            window_center = (window_start + window_stop) // 2
            dist = rec.pos - window_center
            while window_stop < rec.parent_feature_stop:
                row = [rec.chrom, window_start, window_stop, window_size, rec.pos, rec.strand, dist, rec.parent_feature_type, rec.parent_feature_id]
                tbl_out += [row]
                window_start = window_stop
                window_stop = window_start + window_size
                window_center = (window_start + window_stop) // 2
                dist = rec.pos - window_center
            # downstream bins
            # setup first bin, then work downstream
            window_stop = rec.pos
            window_start = window_stop - window_size
            window_center = (window_start + window_stop) // 2
            dist = rec.pos - window_center
            while window_start > rec.parent_feature_start:
                row = [rec.chrom, window_start, window_stop, window_size, rec.pos, rec.strand, dist, rec.parent_feature_type, rec.parent_feature_id]
                tbl_out += [row]
                window_stop = window_start
                window_start = window_stop - window_size
                window_center = (window_start + window_stop) // 2
                dist = rec.pos - window_center
            
    return etl.wrap(tbl_out).sort(key=('feature_chrom', 'feature_start'))

In [26]:
@etlcache
def tbl_core_promoter_diversity_100():
    return (
        tabulate_site_proximity_regions(tbl_core_promoters, window_size=100)
        .add_diversity_fields()
    )

tbl_core_promoter_diversity_100


Out[26]:
0|feature_chrom 1|feature_start 2|feature_stop 3|feature_length 4|site_pos 5|site_strand 6|site_dist 7|parent_feature_type 8|parent_feature_id 9|snp_diversity_3d7_hb3 10|indel_diversity_3d7_hb3 11|indel_str_diversity_3d7_hb3 12|indel_nostr_diversity_3d7_hb3 13|snp_diversity_hb3_dd2 14|indel_diversity_hb3_dd2 15|indel_str_diversity_hb3_dd2 16|indel_nostr_diversity_hb3_dd2 17|snp_diversity_7g8_gb4 18|indel_diversity_7g8_gb4 19|indel_str_diversity_7g8_gb4 20|indel_nostr_diversity_7g8_gb4 21|snp_diversity_all 22|indel_diversity_all 23|indel_str_diversity_all 24|indel_nostr_diversity_all
Pf3D7_01_v3 95941 96041 100 98541 + -2550 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 95998 96098 100 98398 - 2350 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96041 96141 100 98541 + -2450 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96098 96198 100 98398 - 2250 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96141 96241 100 98541 + -2350 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

...


In [27]:
@pdcache
def df_core_promoter_diversity_100():
    return (
        tbl_core_promoter_diversity_100
        .cutout('site_pos', 'site_strand', 'parent_feature_type', 'parent_feature_id')
        .todataframe()
    )

df_core_promoter_diversity_100.head()


Out[27]:
feature_chrom feature_start feature_stop feature_length site_dist snp_diversity_3d7_hb3 indel_diversity_3d7_hb3 indel_str_diversity_3d7_hb3 indel_nostr_diversity_3d7_hb3 snp_diversity_hb3_dd2 ... indel_str_diversity_hb3_dd2 indel_nostr_diversity_hb3_dd2 snp_diversity_7g8_gb4 indel_diversity_7g8_gb4 indel_str_diversity_7g8_gb4 indel_nostr_diversity_7g8_gb4 snp_diversity_all indel_diversity_all indel_str_diversity_all indel_nostr_diversity_all
0 Pf3D7_01_v3 95941 96041 100 -2550 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 Pf3D7_01_v3 95998 96098 100 2350 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 Pf3D7_01_v3 96041 96141 100 -2450 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 Pf3D7_01_v3 96098 96198 100 2250 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 Pf3D7_01_v3 96141 96241 100 -2350 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 21 columns


In [28]:
@etlcache
def tbl_core_promoter_diversity_50():
    return (
        tabulate_site_proximity_regions(tbl_core_promoters, window_size=50)
        .add_diversity_fields()
    )

tbl_core_promoter_diversity_50


Out[28]:
0|feature_chrom 1|feature_start 2|feature_stop 3|feature_length 4|site_pos 5|site_strand 6|site_dist 7|parent_feature_type 8|parent_feature_id 9|snp_diversity_3d7_hb3 10|indel_diversity_3d7_hb3 11|indel_str_diversity_3d7_hb3 12|indel_nostr_diversity_3d7_hb3 13|snp_diversity_hb3_dd2 14|indel_diversity_hb3_dd2 15|indel_str_diversity_hb3_dd2 16|indel_nostr_diversity_hb3_dd2 17|snp_diversity_7g8_gb4 18|indel_diversity_7g8_gb4 19|indel_str_diversity_7g8_gb4 20|indel_nostr_diversity_7g8_gb4 21|snp_diversity_all 22|indel_diversity_all 23|indel_str_diversity_all 24|indel_nostr_diversity_all
Pf3D7_01_v3 95941 95991 50 98541 + -2575 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 95948 95998 50 98398 - 2425 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 95991 96041 50 98541 + -2525 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 95998 96048 50 98398 - 2375 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96041 96091 50 98541 + -2475 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

...


In [29]:
@pdcache
def df_core_promoter_diversity_50():
    return (
        tbl_core_promoter_diversity_50
        .cutout('site_pos', 'site_strand', 'parent_feature_type', 'parent_feature_id')
        .todataframe()
    )

df_core_promoter_diversity_50.head()


Out[29]:
feature_chrom feature_start feature_stop feature_length site_dist snp_diversity_3d7_hb3 indel_diversity_3d7_hb3 indel_str_diversity_3d7_hb3 indel_nostr_diversity_3d7_hb3 snp_diversity_hb3_dd2 ... indel_str_diversity_hb3_dd2 indel_nostr_diversity_hb3_dd2 snp_diversity_7g8_gb4 indel_diversity_7g8_gb4 indel_str_diversity_7g8_gb4 indel_nostr_diversity_7g8_gb4 snp_diversity_all indel_diversity_all indel_str_diversity_all indel_nostr_diversity_all
0 Pf3D7_01_v3 95941 95991 50 -2575 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 Pf3D7_01_v3 95948 95998 50 2425 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 Pf3D7_01_v3 95991 96041 50 -2525 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 Pf3D7_01_v3 95998 96048 50 2375 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 Pf3D7_01_v3 96041 96091 50 -2475 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 21 columns


In [30]:
def multi_regplot(df, x, variables, figsize=(10, 2.5), suptitle='', axtitles=None, xlim=None, ylim=None, markersize=20, xlabel=None):
    
    # setup figure
    fig = plt.figure(figsize=figsize)
    fig.suptitle(suptitle, fontsize=12, y=1.05)
    n = len(variables)
    x = df[x]

    # make subplots
    for i, v in enumerate(variables):
        
        # setup subplot
        ax = fig.add_subplot(1, n, i+1)
        sns.despine(offset=5, ax=ax)
        
        # obtain subplot data
        y = df[v]
        
        # plotting
        ax.axhline(np.mean(y), color='k', linestyle=':')
#         ax.axvline(0, color='k', linestyle=':')
        sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize))
        if xlim:
            ax.set_xlim(*xlim)
        if ylim:
            ax.set_ylim(*ylim)
        if axtitles:
            ax.set_title(axtitles[i])
        else:
            ax.set_title(v)
        if i == 0:
            ax.set_ylabel('diversity (bp$^{-1}$)')
        else:
            ax.set_yticks([])
            ax.spines['left'].set_visible(False)
            ax.set_ylabel('')
        if i == 1 and xlabel:
            ax.set_xlabel(xlabel)
        else:
            ax.set_xlabel('')
            
    return fig

In [31]:
x = 'site_dist'
for cross in CROSSES + ('all',):
    variables = [v % cross for v in ('snp_diversity_%s', 'indel_nostr_diversity_%s', 'indel_str_diversity_%s')]
    suptitle = LABELS.get(cross, 'All crosses')
    axtitles = 'SNPs', 'Indels (non-STR)', 'Indels (STR)'
    xlabel = 'distance to core promoter (bp)'
    fig = multi_regplot(df_core_promoter_diversity_100, x, variables, xlim=(-600, 600), ylim=(0, 0.0025), suptitle=suptitle, axtitles=axtitles, xlabel=xlabel)
#     fig.savefig('../artwork/suppfig_core_promoter_diversity_%s.jpeg' % cross, jpeg_quality=100, dpi=200, bbox_inches='tight')
    plt.show()



In [32]:
x = 'site_dist'
for cross in CROSSES + ('all',):
    variables = [v % cross for v in ('snp_diversity_%s', 'indel_nostr_diversity_%s', 'indel_str_diversity_%s')]
    suptitle = LABELS.get(cross, 'All crosses')
    axtitles = 'SNPs', 'Indels (non-STR)', 'Indels (STR)'
    xlabel = 'distance to core promoter (bp)'
    fig = multi_regplot(df_core_promoter_diversity_50, x, variables, xlim=(-500, 500), ylim=(0, 0.0025), suptitle=suptitle, axtitles=axtitles, xlabel=xlabel)
#     fig.savefig('../../artwork/suppfig_core_promoter_diversity_%s.jpeg' % cross, jpeg_quality=100, dpi=200, bbox_inches='tight')
    plt.show()



In [33]:
xlim = -500, 500
ylim = 0, .002
markersize = 20
xlabel = 'distance to core promoter (bp)'

fig = plt.figure(figsize=(4, 7))
fig.suptitle('C', fontsize=12, fontweight='bold', y=.95)

df = df_core_promoter_diversity_50
x = df.site_dist

ax = fig.add_subplot(2, 1, 1)
sns.despine(offset=5, ax=ax)
y = df.indel_str_diversity_all
ax.axhline(np.mean(y), color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize))
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
ax.text(1, .1, 'Indels (STR)', transform=ax.transAxes, ha='right', va='bottom')
ax.set_ylabel('diversity (bp$^{-1}$)')
ax.set_xticklabels([])
ax.set_xlabel('')

ax = fig.add_subplot(2, 1, 2)
sns.despine(offset=5, ax=ax)
y = df.indel_nostr_diversity_all
ax.axhline(np.mean(y), color='k', linestyle=':')
sns.regplot(x, y, x_estimator=np.mean, fit_reg=False, ax=ax, color='k', scatter_kws=dict(s=markersize))
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
ax.text(1, .9, 'Indels (non-STR)', transform=ax.transAxes, ha='right', va='top')
ax.set_ylabel('diversity (bp$^{-1}$)')
ax.set_xlabel(xlabel)

#fig.savefig('../artwork/fig_indels_c.jpeg', dpi=1200, jpeg_quality=100, bbox_inches='tight');


Out[33]:
<matplotlib.text.Text at 0x7f30f7f5fc88>

Diversity by distance to transcription start site (TSS)


In [44]:
tbl_intergenic.display(20)


0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_id 6|feature_region_size 7|feature_region_type 8|region_size
Pf3D7_01_v3 intergenic_0 37126 38982 1856 PF3D7_0100100_PF3D7_0100200 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 40207 42367 2160 PF3D7_0100200_PF3D7_0100300 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_2 46507 50363 3856 PF3D7_0100300_PF3D7_0100400 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_0 51636 53169 1533 PF3D7_0100400_PF3D7_0100500 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 53280 53778 498 PF3D7_0100500_PF3D7_0100600 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 55006 56690 1684 PF3D7_0100600_PF3D7_0100700 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_2 56893 59772 2879 PF3D7_0100700_PF3D7_0100800 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_0 61003 62187 1184 PF3D7_0100800_PF3D7_0100900 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 63400 65817 2417 PF3D7_0100900_PF3D7_0101000 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 66989 69304 2315 PF3D7_0101000_PF3D7_0101100 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_2 70417 71624 1207 PF3D7_0101100_PF3D7_0101200 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 72426 74563 2137 PF3D7_0101200_PF3D7_0101300 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_0 75366 75982 616 PF3D7_0101300_PF3D7_0101400 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_2 76809 78241 1432 PF3D7_0101400_PF3D7_0101500 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_0 79891 81765 1874 PF3D7_0101500_PF3D7_0101600 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 83106 84791 1685 PF3D7_0101600_PF3D7_0101700 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 86152 87203 1051 PF3D7_0101700_PF3D7_0101800 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 88177 90242 2065 PF3D7_0101800_PF3D7_0101900 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 91420 93113 1693 PF3D7_0101900_PF3D7_0102000 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 91420 93113 1693 PF3D7_0101900_PF3D7_0102000 65564 Core 365031

...


In [48]:
tss_v3_coords_fn = os.path.join(DATA_DIR, 'reference/ponts_2011/tss_v3_coords.bed')

tbl_tss_v3 = (etl
    .fromtsv(tss_v3_coords_fn)
    .cut(0, 1, 3, 4)
    .pushheader(['chrom', 'pos', 'strand', 'gene'])
    .convert('pos', int)
    .intervalleftjoin(tbl_intergenic.prefixheader('parent_'), 
                      lkey='chrom', lstart='pos', lstop='pos',
                      rkey='parent_feature_chrom', rstart='parent_feature_start', rstop='parent_feature_stop',
                      include_stop=True)
    .eq('parent_feature_region_type', 'Core')
    .cutout('parent_feature_chrom', 'parent_feature_region_type')
    .sort(key=('chrom', 'pos'))
)
tbl_tss_v3.display(20)


0|chrom 1|pos 2|strand 3|gene 4|parent_feature_type 5|parent_feature_start 6|parent_feature_stop 7|parent_feature_length 8|parent_feature_id 9|parent_feature_region_size 10|parent_region_size
Pf3D7_01_v3 98403 + PFA0110w intergenic_1 95899 98819 2920 PF3D7_0102100_PF3D7_0102200 365031 365031
Pf3D7_01_v3 104334 + PFA0115w intergenic_1 102282 104704 2422 PF3D7_0102200_PF3D7_0102300 365031 365031
Pf3D7_01_v3 109759 - PFA0120c intergenic_1 108348 110750 2402 PF3D7_0102400_PF3D7_0102500 365031 365031
Pf3D7_01_v3 116792 - PFA0125c intergenic_1 115799 119041 3242 PF3D7_0102500_PF3D7_0102600 365031 365031
Pf3D7_01_v3 116829 - PFA0125c intergenic_1 115799 119041 3242 PF3D7_0102500_PF3D7_0102600 365031 365031
Pf3D7_01_v3 121431 - PFA0130c intergenic_2 121249 124517 3268 PF3D7_0102600_PF3D7_0102700 365031 365031
Pf3D7_01_v3 133966 - PFA0150c intergenic_1 133624 134353 729 PF3D7_0103000_PF3D7_0103100 365031 365031
Pf3D7_01_v3 143091 - PFA0160c intergenic_1 142219 143640 1421 PF3D7_0103200_PF3D7_0103300 365031 365031
Pf3D7_01_v3 171700 - PFA0190c intergenic_2 171273 172864 1591 PF3D7_0103800_PF3D7_0103900 365031 365031
Pf3D7_01_v3 184556 - PFA0210c intergenic_2 184222 190269 6047 PF3D7_0104200_PF3D7_0104300 365031 365031
Pf3D7_01_v3 189591 + PFA0215w intergenic_2 184222 190269 6047 PF3D7_0104200_PF3D7_0104300 365031 365031
Pf3D7_01_v3 201692 + PFA0225w intergenic_1 201230 202536 1306 PF3D7_0104300_PF3D7_0104400 365031 365031
Pf3D7_01_v3 205594 - PFA0230c intergenic_2 205389 206170 781 PF3D7_0104500_PF3D7_0104600 365031 365031
Pf3D7_01_v3 224347 - PFA0255c intergenic_1 223960 225110 1150 PF3D7_0105200_PF3D7_0105300 365031 365031
Pf3D7_01_v3 224363 - PFA0255c intergenic_1 223960 225110 1150 PF3D7_0105200_PF3D7_0105300 365031 365031
Pf3D7_01_v3 225919 - PFA0260c intergenic_1 225779 226681 902 PF3D7_0105300_PF3D7_0105400 365031 365031
Pf3D7_01_v3 227286 - PFA0265c intergenic_1 227285 228485 1200 PF3D7_0105400_PF3D7_0105500 365031 365031
Pf3D7_01_v3 253183 + PFA0290w intergenic_2 252010 253584 1574 PF3D7_0105800_PF3D7_0105900 365031 365031
Pf3D7_01_v3 253571 + PFA0290w intergenic_2 252010 253584 1574 PF3D7_0105800_PF3D7_0105900 365031 365031
Pf3D7_01_v3 258551 - PFA0295c intergenic_1 258543 260658 2115 PF3D7_0106000_PF3D7_0106100 365031 365031

...


In [52]:
tbl_tss_diversity_50 = (
    tabulate_site_proximity_regions(
        tbl_tss_v3
#             # only use intergenic regions where you expect one core promoter
#             .eq('type_intergenic', 'intergenic_1')
#             # make sure there is only one
#             .distinct(key='label_intergenic')
        ,
        window_size=50,
    )
    .tabulate_diversity(
        callsets_snp['3d7_hb3'],
        'snp_diversity_3d7_hb3'
    )
    .tabulate_diversity(
        callsets_indel_str['3d7_hb3'],
        'indel_str_diversity_3d7_hb3'
    )
    .tabulate_diversity(
        callsets_indel_nostr['3d7_hb3'],
        'indel_nostr_diversity_3d7_hb3'
    )
    .tabulate_diversity(
        callsets_snp['hb3_dd2'],
        'snp_diversity_hb3_dd2'
    )
    .tabulate_diversity(
        callsets_indel_str['hb3_dd2'],
        'indel_str_diversity_hb3_dd2'
    )
    .tabulate_diversity(
        callsets_indel_nostr['hb3_dd2'],
        'indel_nostr_diversity_hb3_dd2'
    )
    .tabulate_diversity(
        callsets_snp['7g8_gb4'],
        'snp_diversity_7g8_gb4'
    )
    .tabulate_diversity(
        callsets_indel_str['7g8_gb4'],
        'indel_str_diversity_7g8_gb4'
    )
    .tabulate_diversity(
        callsets_indel_nostr['7g8_gb4'],
        'indel_nostr_diversity_7g8_gb4'
    )
    .addfield('snp_diversity_all', lambda row: sum(row['snp_diversity_%s' % cross] for cross in CROSSES) / 3)
    .addfield('indel_str_diversity_all', lambda row: sum(row['indel_str_diversity_%s' % cross] for cross in CROSSES) / 3)
    .addfield('indel_nostr_diversity_all', lambda row: sum(row['indel_nostr_diversity_%s' % cross] for cross in CROSSES) / 3)
)
df_tss_diversity_50 = tbl_tss_diversity_50.cutout('feature_chrom', 'feature_start', 'feature_stop', 'feature_length', 'site_pos', 'site_strand', 'parent_feature_type').todataframe()
tbl_tss_diversity_50.display(20)


0|feature_chrom 1|feature_start 2|feature_stop 3|feature_length 4|site_pos 5|site_strand 6|site_dist 7|parent_feature_type 8|parent_feature_id 9|snp_diversity_3d7_hb3 10|indel_str_diversity_3d7_hb3 11|indel_nostr_diversity_3d7_hb3 12|snp_diversity_hb3_dd2 13|indel_str_diversity_hb3_dd2 14|indel_nostr_diversity_hb3_dd2 15|snp_diversity_7g8_gb4 16|indel_str_diversity_7g8_gb4 17|indel_nostr_diversity_7g8_gb4 18|snp_diversity_all 19|indel_str_diversity_all 20|indel_nostr_diversity_all
Pf3D7_01_v3 95903 95953 50 98403 + -2475 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 95953 96003 50 98403 + -2425 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96003 96053 50 98403 + -2375 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96053 96103 50 98403 + -2325 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96103 96153 50 98403 + -2275 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96153 96203 50 98403 + -2225 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96203 96253 50 98403 + -2175 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96253 96303 50 98403 + -2125 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96303 96353 50 98403 + -2075 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96353 96403 50 98403 + -2025 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96403 96453 50 98403 + -1975 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96453 96503 50 98403 + -1925 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96503 96553 50 98403 + -1875 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96553 96603 50 98403 + -1825 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96603 96653 50 98403 + -1775 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96653 96703 50 98403 + -1725 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.02 0.0 0.0 0.0 0.0 0.0 0.006666666666666667 0.0
Pf3D7_01_v3 96703 96753 50 98403 + -1675 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96753 96803 50 98403 + -1625 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Pf3D7_01_v3 96803 96853 50 98403 + -1575 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.02 0.0 0.0 0.006666666666666667 0.0
Pf3D7_01_v3 96853 96903 50 98403 + -1525 intergenic_1 PF3D7_0102100_PF3D7_0102200 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

...


In [53]:
x = 'site_dist'
for cross in CROSSES + ('all',):
    variables = [v % cross for v in ('snp_diversity_%s', 'indel_nostr_diversity_%s', 'indel_str_diversity_%s')]
    suptitle = LABELS.get(cross, 'All crosses')
    axtitles = 'SNPs', 'Indels (non-STR)', 'Indels (STR)'
    xlabel = 'distance to transcription start site (bp)'
    fig = multi_regplot(df_tss_diversity_50, x, variables, xlim=(-500, 500), ylim=(0, 0.003), suptitle=suptitle, axtitles=axtitles, xlabel=xlabel)
#     fig.savefig('../artwork/suppfig_tss_diversity_%s.jpeg' % cross, jpeg_quality=100, dpi=200, bbox_inches='tight')
    plt.show()



In [ ]: