In [1]:
%run ~/relmapping/annot/notebooks/annot__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
fp_ = 'WS260_ce10/WS260_ce10.genes.protein_coding.gtf.gz'
df_genes = yp.read_wbgtf(fp_, parse_attr=True, coords_adj=True)
print(len(df_genes))


20210

In [3]:
# df_genes['tss_wormbase'] <- 5' end of the annotation in WS260_ce10
def tss_wormbase_(start, end, strand):
    if strand != '-':
        return start
    else:
        return end - 1

df_genes['tss_wormbase'] = [*map(tss_wormbase_, df_genes['start'], df_genes['end'], df_genes['strand'])]

In [4]:
# df_jaenes2018_fwd, df_jaenes2018_rev <- representative promoters by short cap counts at the mode
fp_ = 'annot_eLife_revised/Figure 2-source data 1. Regulatory annotation.txt'
df_regl = pd.read_csv(fp_, sep='\t')

df_scap_fwd = pd.read_csv('annot_eLife_full/metrics_scap/scap_fwd.tsv', sep='\t')
df_scap_rev = pd.read_csv('annot_eLife_full/metrics_scap/scap_rev.tsv', sep='\t')

df_regl['tss_count_fwd_ce10'] = df_scap_fwd['scap_mode_count']
df_regl['tss_count_rev_ce10'] = df_scap_rev['scap_mode_count']

def fwd_(df_):
    return df_.sort_values(['tss_count_fwd_ce10'], ascending=False).iloc[0]

def rev_(df_):
    return df_.sort_values(['tss_count_rev_ce10'], ascending=False).iloc[0]

df_jaenes2018_fwd = df_regl.query('annot_fwd == "coding_promoter"').groupby(['promoter_gene_id_fwd']).apply(fwd_)
df_jaenes2018_rev = df_regl.query('annot_rev == "coding_promoter"').groupby(['promoter_gene_id_rev']).apply(rev_)

write_gffbed('TSSs_one_per_gene/df_jaenes2018_fwd.bed',
    chrom = df_jaenes2018_fwd['chrom_ce10'],
    start = df_jaenes2018_fwd['start_ce10'],
    end = df_jaenes2018_fwd['end_ce10'],
)

write_gffbed('TSSs_one_per_gene/df_jaenes2018_rev.bed',
    chrom = df_jaenes2018_rev['chrom_ce10'],
    start = df_jaenes2018_rev['start_ce10'],
    end = df_jaenes2018_rev['end_ce10'],
)

In [5]:
df_fwd_ = df_jaenes2018_fwd[['tss_fwd_ce10', 'tss_count_fwd_ce10']].copy()
df_rev_ = df_jaenes2018_rev[['tss_rev_ce10', 'tss_count_rev_ce10']].copy()

df_fwd_.index.name = 'gene_id'
df_rev_.index.name = 'gene_id'

df_fwd_.columns = ['tss_jaenes2018', 'tss_jaenes2018_count']
df_rev_.columns = ['tss_jaenes2018', 'tss_jaenes2018_count']

df_ = pd.concat([df_fwd_, df_rev_], axis=0)
df_tss = df_genes.merge(df_, how='left', left_on='gene_id', right_index=True)

In [16]:
def tss_(tss_wormbase, tss_jaenes2018, tss_jaenes2018_count):
    if tss_jaenes2018 == tss_jaenes2018:
        return tss_jaenes2018
    else:
        return tss_wormbase
    
def col_(tss_wormbase, tss_jaenes2018, tss_jaenes2018_count):
    if tss_jaenes2018 == tss_jaenes2018:
        return yp.RED if tss_jaenes2018_count >= 2 else yp.YELLOW
    else:
        return yp.BLUE

df_tss['tss'] = [*map(tss_, df_tss['tss_wormbase'], df_tss['tss_jaenes2018'], df_tss['tss_jaenes2018_count'])]
df_tss['col'] = [*map(col_, df_tss['tss_wormbase'], df_tss['tss_jaenes2018'], df_tss['tss_jaenes2018_count'])]

In [19]:
df_tss['feature_start'] = [*map(min, df_tss['start'], df_tss['tss'])]
df_tss['feature_end'] = [*map(max, df_tss['tss'] + 1, df_tss['end'])]

write_gffbed('TSSs_one_per_gene/TSSs_one_per_gene.bed',
    chrom = df_tss['chrom'],
    start = df_tss['feature_start'].map(int),
    end = df_tss['feature_end'].map(int),
    name = df_tss['gene_id'],
    strand = df_tss['strand'],
    itemRgb = df_tss['col'],
    attr = df_tss[['tss_jaenes2018', 'tss_jaenes2018_count', 'tss_wormbase']]
)

In [ ]: