In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
# Build gdf
def addon_H3K36me3_gene(df_regl_arg):
    random.seed(1)
    def H3K36me3_gene_id_(prom_fwd_gene_id, prom_rev_gene_id, associated_gene_id):
        if prom_fwd_gene_id != '.' and prom_rev_gene_id != '.':
            return random.choice([prom_fwd_gene_id, prom_rev_gene_id])
        elif prom_fwd_gene_id != '.':
            return prom_fwd_gene_id
        elif prom_rev_gene_id != '.':
            return prom_rev_gene_id
        else:
            return random.choice(associated_gene_id.split(','))

    df_regl_ = df_regl_arg.copy()
    df_regl_['H3K36me3_gene_id'] = list(map(H3K36me3_gene_id_, 
                                           df_regl_arg['promoter_gene_id_fwd'],
                                           df_regl_arg['promoter_gene_id_rev'],
                                           df_regl_arg['associated_gene_id']))

    col_ = ['chrom', 'start', 'end', 'strand', 'gene_id']
    df_genes = pd.read_csv('WS260_ce10/WS260_ce10.genes_by_CV.tsv', sep='\t')[col_]\
        .drop_duplicates(subset=['gene_id']) # CV table has two duplicate gene records where a gene is annotated in *two* distinct operons

    df_genes.columns = [
        'H3K36me3_gene_chrom',
        'H3K36me3_gene_start',
        'H3K36me3_gene_end',
        'H3K36me3_gene_strand',
        'H3K36me3_gene_id',
    ]

    df_regl = df_regl_.merge(df_genes, how='left', left_on='H3K36me3_gene_id', right_on='H3K36me3_gene_id')
    m_ = ~(df_regl['H3K36me3_gene_strand'] == '-')
    df_regl['H3K36me3_gene_start'] -= 1
    df_regl.loc[m_, 'H3K36me3_gene_end'] = df_regl.loc[m_, 'H3K36me3_gene_start'] + 1
    df_regl.loc[~m_, 'H3K36me3_gene_start'] = df_regl.loc[~m_, 'H3K36me3_gene_end'] - 1
    return df_regl

df_regl = regl_Apr27()
addon_H3K4me3(df_regl)
addon_H3K27me3(df_regl)
df_regl_H3K36me3 = addon_H3K36me3_gene(df_regl)
gdf = yp.GenomicDataFrame(df_regl_H3K36me3, pos_column='atac_mode')

#df_regl.query('annot == "coding_promoter"')[[
#    'H3K27me3_mean_mean',
#    'H3K27me3_mean_max',
#    'H3K27me3_max_mean',
#    'H3K27me3_max_max',
#    'CV'
#]].corr(method='spearman')

df_regl.query('annot == "coding_promoter"')[[
    'H3K4me3_mean_mean',
    'H3K4me3_mean_max',
    'H3K4me3_max_mean',
    'H3K4me3_max_max',
    'CV'
]].corr(method='spearman')

#df_regl.query('annot == "putative_enhancer"')[[
#    'H3K27me3_mean_mean',
#    'H3K27me3_mean_max',
#   'H3K27me3_max_mean',
#    'H3K27me3_max_max',
#    'CV'
#]].corr(method='spearman')

#df_regl.query('annot == "putative_enhancer"')[[
#    'H3K4me3_mean_mean',
#    'H3K4me3_mean_max',
#    'H3K4me3_max_mean',
#    'H3K4me3_max_max',
#    'CV'
#]].corr(method='spearman')


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/ipykernel_launcher.py:64: RuntimeWarning: Mean of empty slice
13054 of 42245 sites with CV values via promoter annotation
26764 of 42245 sites with CV values via "associated gene"
H3K4me3 wt_emb True
/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/ipykernel_launcher.py:12: RuntimeWarning: Mean of empty slice
  if sys.path[0] == '':
/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/ipykernel_launcher.py:14: RuntimeWarning: All-NaN slice encountered
  
H3K4me3 wt_l1 True
H3K4me3 wt_l2 True
H3K4me3 wt_l3 True
H3K4me3 wt_l4 True
H3K4me3 wt_ya True
H3K27me3 wt_emb True
H3K27me3 wt_l1 True
H3K27me3 wt_l2 True
H3K27me3 wt_l3 True
H3K27me3 wt_l4 True
H3K27me3 wt_ya True
gdf: pos_column unspecified, reverting to mean(start, end)
Out[2]:
H3K4me3_mean_mean H3K4me3_mean_max H3K4me3_max_mean H3K4me3_max_max CV
H3K4me3_mean_mean 1.000000 0.955091 0.975581 0.928832 -0.636029
H3K4me3_mean_max 0.955091 1.000000 0.926829 0.961140 -0.623913
H3K4me3_max_mean 0.975581 0.926829 1.000000 0.953522 -0.614114
H3K4me3_max_max 0.928832 0.961140 0.953522 1.000000 -0.596413
CV -0.636029 -0.623913 -0.614114 -0.596413 1.000000

In [3]:
df_regl.query('annot == "coding_promoter"')[[
    'H3K4me3_mean_mean',
    'CV'
]].corr(method='spearman')


Out[3]:
H3K4me3_mean_mean CV
H3K4me3_mean_mean 1.000000 -0.636029
CV -0.636029 1.000000

In [4]:
# Histone marks -- read signal

# H3K4me3, H3K4me1 at sites
for hmod in ['H3K4me3', 'H3K4me1']:
    for stage in config['stages_wt']:
        fp_ = 'hmod_geo/tracks/%s_%s.bw' % (hmod, stage)
        label = '%s\n%s' % (hmod, stage)
        gdf.add_track(label, fp_, flank_len=1000, bin_size=25)

# H3K27me3 at sites
for hmod in ['H3K27me3']:
    for stage in config['stages_wt']:
        fp_ = 'hmod_geo/tracks/%s_%s.bw' % (hmod, stage)
        label = '%s\n%s' % (hmod, stage)
        gdf.add_track(label, fp_, flank_len=1000, bin_size=25)


mread_regions True H3K4me3_wt_emb.bw.mread_regions_68c5afb1864752a35288eb157782bed4.tmp
mread_regions True H3K4me3_wt_l1.bw.mread_regions_8567b9aeb08e90642dab74a5dc648e3c.tmp
mread_regions True H3K4me3_wt_l2.bw.mread_regions_c49e8b0e1acd9ea9daa2b7850cad9a43.tmp
mread_regions True H3K4me3_wt_l3.bw.mread_regions_3cb99fdb8d17774374ac60f8326cd832.tmp
mread_regions True H3K4me3_wt_l4.bw.mread_regions_fcf080c44808fa792e7cf174e002dbe7.tmp
mread_regions True H3K4me3_wt_ya.bw.mread_regions_26b4f576ef8ceb5542ec3b4c3851a4fc.tmp
mread_regions True H3K4me1_wt_emb.bw.mread_regions_3ffc6c69b9acd6eba368435b24044c95.tmp
mread_regions True H3K4me1_wt_l1.bw.mread_regions_098d4c9b55c4647d7e0b9b6c8908d8bd.tmp
mread_regions True H3K4me1_wt_l2.bw.mread_regions_511bb68fb19fe5f1d9090deaf47def49.tmp
mread_regions True H3K4me1_wt_l3.bw.mread_regions_8fa365b13c0a4ac33bb64823bcda77bb.tmp
mread_regions True H3K4me1_wt_l4.bw.mread_regions_fa5b7c60316f5d562a2c4b6fa5cd4369.tmp
mread_regions True H3K4me1_wt_ya.bw.mread_regions_6f6a36ac7b78096c372797ecc58442da.tmp
mread_regions True H3K27me3_wt_emb.bw.mread_regions_964d5ef1caac7a491783d421d1a28280.tmp
mread_regions True H3K27me3_wt_l1.bw.mread_regions_6dd3dc069f299668e01b8a039d5c1967.tmp
mread_regions True H3K27me3_wt_l2.bw.mread_regions_db4d9534b5c26696a1224e7848a38dbf.tmp
mread_regions True H3K27me3_wt_l3.bw.mread_regions_301e68b99b5c6ab2c4c0a19204b1dc28.tmp
mread_regions True H3K27me3_wt_l4.bw.mread_regions_839062b64521ff6559e844a24e984a29.tmp
mread_regions True H3K27me3_wt_ya.bw.mread_regions_dc13aa098f21d8c5ccd4685020e40aa1.tmp

In [5]:
# Histone marks -- plot settings
hmod = 'H3K4me3'
for stage in config['stages_wt']:
    label = '%s\n%s' % (hmod, stage)
    gdf.t[label].imshow_kwargs['vmin'] = -5
    gdf.t[label].imshow_kwargs['vmax'] = +5
    gdf.t[label].smooth = None#lambda m: sp.ndimage.gaussian_filter(m, sigma=3)

hmod = 'H3K4me1'
for stage in config['stages_wt']:
    label = '%s\n%s' % (hmod, stage)
    gdf.t[label].imshow_kwargs['vmin'] = -3
    gdf.t[label].imshow_kwargs['vmax'] = +3
    gdf.t[label].smooth = None#lambda m: sp.ndimage.gaussian_filter(m, sigma=3)

hmod = 'H3K27me3'
for stage in config['stages_wt']:
    label = '%s\n%s' % (hmod, stage)
    gdf.t[label].imshow_kwargs['vmin'] = -1
    gdf.t[label].imshow_kwargs['vmax'] = +1
    gdf.t[label].smooth = None#lambda m: sp.ndimage.gaussian_filter(m, sigma=3)

In [6]:
# associated gene CV
CV_ = gdf.r['CV'].rank(pct=True, na_option='keep')
label_ = 'CV\nrank'
gdf.t[label_] = yp.GenomicDataFrameScalarTrack(CV_, flank_len=1000, bin_size=25)
gdf.t[label_].imshow_kwargs['cmap'] = matplotlib.cm.get_cmap('inferno_r')
gdf.t[label_].imshow_kwargs['vmin'] = 0
gdf.t[label_].imshow_kwargs['vmax'] = +1

In [8]:
# plot
row_labels_ = [
    'coding_promoter (%s)' % (yp.f_uk(len(gdf.r.query('(annot == "coding_promoter")'))),), 
    'putative_enhancer (%s)' % (yp.f_uk(len(gdf.r.query('(annot == "putative_enhancer")'))),),
]

cb = gdf.query('((annot == "coding_promoter") | (annot == "putative_enhancer"))')\
.sort(['annot']).sort(['H3K4me3_mean_mean'], ascending=False)\
.imshow(
    figsize=(8, 4),
    savefig=False,
    kwargs_grid = {'cbar_size': '5%', 'cbar_pad': 0.02, 'axes_pad': 0.02,},
    axis_off=True,
    cluster_col='annot',
    nsquashed=200,
    vextent=1,
    suptitle='',
    row_labels=row_labels_,
    fontsize=8,
    figB_magic=True,
    l_title_cbar=['H3K4me3\nwt_emb', 'H3K4me1\nwt_emb', 'H3K27me3\nwt_emb','CV\nrank'],
)

fp_ = 'annot_Apr27/Fig3A_HMsortbyH3K4me3.png'
plt.savefig(fp_, dpi=600, transparent=True, bbox_inches='tight')



In [ ]: