In [1]:
# Set annotation to annot_ce10
%run ~/relmapping/annot/notebooks/annot__init__.ipynb
annot_ = 'annot_ce10_eLife_full'

df_atac = pd.read_csv(os.path.join(annot_, 'accessible_sites.tsv'), sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
df_lcap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_fwd.tsv'), sep='\t', low_memory=False)
df_lcap_rev = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_rev.tsv'), sep='\t', low_memory=False)
df_exon_fwd = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t', low_memory=False)
df_exon_rev = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t', low_memory=False)
df_maxgap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_fwd.tsv'), sep='\t')
df_maxgap_rev = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_rev.tsv'), sep='\t')
df_scap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_fwd.tsv'), sep='\t')
df_scap_rev = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_rev.tsv'), sep='\t')

df_prom_fwd = pd.read_csv(os.path.join(annot_, 'metrics_type', 'prom_fwd.tsv'), sep='\t')
df_prom_rev = pd.read_csv(os.path.join(annot_, 'metrics_type', 'prom_rev.tsv'), sep='\t')
df_regl = pd.read_csv(os.path.join(annot_, 'metrics_type', 'regl.tsv'), sep='\t')

def mp(fp, annot_=annot_):
    """Return the path of `fp` inside the 'Source Data' folder of an annotation directory.

    Parameters
    ----------
    fp : str
        File name (or relative path) to place under 'Source Data'.
    annot_ : str
        Annotation directory. The default is the value `annot_` had when this
        function was defined; later reassignments of the global do not change it.
    """
    return os.path.join(annot_, 'Source Data', fp)


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
# Fig 2 - source data 1: compact regulatory annotation table
# (core annotation columns plus the per-stage long-cap jump/incr test flags).
fp_ = os.path.join(annot_, 'Source Data/Fig 2 - source data 1. Regulatory annotation.txt')

l_cols_ = [
    'chrom', 'start', 'end',
    'annot', 'annot_fwd', 'annot_rev',
    'promoter_gene_id_fwd', 'promoter_locus_id_fwd', 'promoter_gene_biotype_fwd',
    'promoter_gene_id_rev', 'promoter_locus_id_rev', 'promoter_gene_biotype_rev',
    'associated_gene_id', 'associated_locus_id',
    'tss_fwd', 'tss_rev', 'scap_pass_fwd', 'scap_pass_rev',
]

# Column order: all stages for the "jump" test first, then all stages for "incr".
l_cols_fwd_ = [
    tmpl_ % (stage,)
    for tmpl_ in ('lcap_%s_fwd_passed_jump', 'lcap_%s_fwd_passed_incr')
    for stage in config['stages']
]
l_cols_rev_ = [
    tmpl_ % (stage,)
    for tmpl_ in ('lcap_%s_rev_passed_jump', 'lcap_%s_rev_passed_incr')
    for stage in config['stages']
]

# Column-wise concatenation: rows are matched on the index of each frame.
df_ = (
    pd.concat(
        [df_regl[l_cols_], df_lcap_fwd[l_cols_fwd_], df_lcap_rev[l_cols_rev_]],
        axis=1,
    )
    .rename(columns={'scap_pass_fwd': 'scap_fwd_passed', 'scap_pass_rev': 'scap_rev_passed'})
)

# Rows whose strand annotation is one of these keep their promoter gene columns.
l_promoter_annots_ = ['coding_promoter', 'pseudogene_promoter', 'non-coding_RNA']
p_fwd = df_['annot_fwd'].isin(l_promoter_annots_)
p_rev = df_['annot_rev'].isin(l_promoter_annots_)
# Rows with one of these overall annotations keep their associated-gene columns.
assoc = df_['annot'].isin(['unknown_promoter', 'putative_enhancer', 'other_element'])

# Blank out (set to '.') the gene columns on every row that does not qualify.
l_blanking_ = [
    (p_fwd, ['promoter_gene_id_fwd', 'promoter_locus_id_fwd', 'promoter_gene_biotype_fwd']),
    (p_rev, ['promoter_gene_id_rev', 'promoter_locus_id_rev', 'promoter_gene_biotype_rev']),
    (assoc, ['associated_gene_id', 'associated_locus_id']),
]
for p_keep_, l_blank_cols_ in l_blanking_:
    for col_blank_ in l_blank_cols_:
        df_.loc[~p_keep_, col_blank_] = '.'

df_.to_csv(fp_, sep='\t', header=True, index=False)

In [3]:
# Load the table stored under metrics_atac_dynamics/ and preview its first rows.
df_clust = pd.read_csv(
    os.path.join(annot_, 'metrics_atac_dynamics/20180530.supp-table.txt'),
    sep='\t',
)
df_clust.head(n=5)


Out[3]:
chr start stop annot associated_gene_id associated_locus_id ATACdc.dev ATACdc.age clustersATAC.dev clustersATAC.age
0 chrI 1900 2051 putative_enhancer . . True False NaN NaN
1 chrI 3826 3977 non-coding_RNA WBGene00023193 Y74C9A.6 True False NaN NaN
2 chrI 4276 4427 putative_enhancer WBGene00022277 homt-1 True False NaN NaN
3 chrI 11272 11423 coding_promoter WBGene00022276,WBGene00022277 homt-1,nlp-40 True False I1 stable
4 chrI 13070 13221 putative_enhancer WBGene00022276 nlp-40 True False NaN NaN

In [4]:
# Sanity check: df_clust must list the same elements as df_regl, in the same
# row order. A later cell concatenates the cluster columns with df_regl
# column-wise, which is only valid if the rows line up one-to-one.
# The number of matching rows is printed per coordinate column (as before),
# and the cell now fails loudly if any row disagrees.
assert len(df_regl) == len(df_clust), \
    'df_regl has %d rows but df_clust has %d' % (len(df_regl), len(df_clust))

l_mismatched_ = []
for col_regl_, col_clust_ in [('chrom', 'chr'), ('start', 'start'), ('end', 'stop')]:
    # Vectorised count of matching rows (the builtin sum() iterates element by element).
    n_match_ = (df_regl[col_regl_] == df_clust[col_clust_]).sum()
    print(n_match_)
    if n_match_ != len(df_regl):
        l_mismatched_.append((col_regl_, col_clust_))

assert not l_mismatched_, \
    'df_regl and df_clust disagree on coordinate columns: %s' % (l_mismatched_,)


42245
42245
42245

In [5]:
# Number of elements per developmental cluster label; rows without a label (NaN) are not counted.
df_clust['clustersATAC.dev'].value_counts(dropna=True)


Out[5]:
stable    3397
I2        1056
Mix3       938
Mix8       893
Mix1       822
Mix6       802
I1         722
G3         642
G4         626
G1         585
Mix2       574
N+M        563
H          525
Mix4       465
Mix7       450
G2         276
Mix5       260
Name: clustersATAC.dev, dtype: int64

In [6]:
# Number of elements per ageing cluster label; rows without a label (NaN) are not counted.
df_clust['clustersATAC.age'].value_counts(dropna=True)


Out[6]:
stable     11796
I+H [2]      409
I            304
Mix3         273
I+H [1]      252
Mix1         210
Mix5         142
Mix4         112
Mix2          98
Name: clustersATAC.age, dtype: int64

In [7]:
# Annotation type vs. developmental cluster label.
# In the recorded output only `coding_promoter` appears as a row.
pd.crosstab(index=df_clust['annot'], columns=df_clust['clustersATAC.dev'])


Out[7]:
clustersATAC.dev G1 G2 G3 G4 H I1 I2 Mix1 Mix2 Mix3 Mix4 Mix5 Mix6 Mix7 Mix8 N+M stable
annot
coding_promoter 585 276 642 626 525 722 1056 822 574 938 465 260 802 450 893 563 3397

In [8]:
# Annotation type vs. ageing cluster label.
# In the recorded output only `coding_promoter` appears as a row.
pd.crosstab(index=df_clust['annot'], columns=df_clust['clustersATAC.age'])


Out[8]:
clustersATAC.age I I+H [1] I+H [2] Mix1 Mix2 Mix3 Mix4 Mix5 stable
annot
coding_promoter 304 252 409 210 98 273 112 142 11796

In [9]:
# Compact view of the cluster table: coordinates, the two "is dynamic" flags and
# the two cluster labels, renamed to the column names used in the exported files.
#
# Missing cluster labels are replaced with '.' inside the chain via
# DataFrame.fillna({...}). The previous form,
#     df_clust_compact[col].fillna('.', inplace=True)
# is an in-place call on a column selection (chained assignment): it is
# deprecated in pandas 2.x and leaves the frame unchanged under Copy-on-Write,
# which would silently leave NaN labels in the exported tables.
df_clust_compact = (
    df_clust[['chr', 'start', 'stop', 'ATACdc.dev', 'ATACdc.age', 'clustersATAC.dev', 'clustersATAC.age']]
    .rename(columns={
        'chr': 'chrom',
        'stop': 'end',
        'ATACdc.dev': 'devel_is_dynamic',
        'ATACdc.age': 'ageing_is_dynamic',
        'clustersATAC.dev': 'devel_cluster_label',
        'clustersATAC.age': 'ageing_cluster_label',
    })
    # Only the two label columns are filled; all other columns are left untouched.
    .fillna({'devel_cluster_label': '.', 'ageing_cluster_label': '.'})
    .reset_index(drop=True)
)
print(len(df_clust_compact))
df_clust_compact.head(20)


42245
Out[9]:
chrom start end devel_is_dynamic ageing_is_dynamic devel_cluster_label ageing_cluster_label
0 chrI 1900 2051 True False . .
1 chrI 3826 3977 True False . .
2 chrI 4276 4427 True False . .
3 chrI 11272 11423 True False I1 stable
4 chrI 13070 13221 True False . .
5 chrI 15430 15581 True False . .
6 chrI 15722 15873 True False . .
7 chrI 16958 17109 True False . .
8 chrI 22266 22417 True False . .
9 chrI 22895 23046 False False . .
10 chrI 24452 24603 False True . .
11 chrI 26902 27053 True False I2 stable
12 chrI 27082 27233 True False . .
13 chrI 28163 28314 True False . .
14 chrI 30271 30422 False False . .
15 chrI 31958 32109 False False . .
16 chrI 33340 33491 False False . .
17 chrI 33764 33915 True False . .
18 chrI 34239 34390 False False . .
19 chrI 34626 34777 False False . .

In [10]:
# Fig 4 - source data 1: write the compact cluster table (tab-separated, with header, no index).
df_clust_compact.to_csv(
    mp('Fig 4 - source data 1. Promoter accessibility.txt'),
    sep='\t',
    header=True,
    index=False,
)

In [37]:
# Expanded regulatory-element table, written to
# reg_elements_eLife_full_review_expanded.tsv: the annotation columns (including
# ce11 coordinates and the detailed per-strand annotation), per-stage ATAC
# heights, per-stage long-cap jump/incr test flags, all columns of the two
# maxgap tables, and the cluster columns from df_clust_compact.
fp_ = os.path.join(annot_, 'reg_elements_eLife_full_review_expanded.tsv')

# Copy the detailed annotation summaries onto df_regl.
# Note: this adds two columns to df_regl itself, not to a copy.
df_regl['annot_detailed_fwd'] = df_prom_fwd['annot_detailed_summary']
df_regl['annot_detailed_rev'] = df_prom_rev['annot_detailed_summary']

l_cols_ = [
    'chrom', 'start', 'end', 'chrom_ce11', 'start_ce11', 'end_ce11',
    'annot', 'annot_fwd', 'annot_rev', 'annot_detailed_fwd', 'annot_detailed_rev',
    'promoter_gene_id_fwd', 'promoter_locus_id_fwd', 'promoter_gene_biotype_fwd',
    'promoter_gene_id_rev', 'promoter_locus_id_rev', 'promoter_gene_biotype_rev',
    'associated_gene_id', 'associated_locus_id',
    'tss_fwd', 'tss_rev', 'scap_pass_fwd', 'scap_pass_rev',
]

# Column order: all stages for the "jump" test first, then all stages for "incr".
l_cols_lcap_fwd_ = [
    tmpl_ % (stage,)
    for tmpl_ in ('lcap_%s_fwd_passed_jump', 'lcap_%s_fwd_passed_incr')
    for stage in config['stages']
]
l_cols_lcap_rev_ = [
    tmpl_ % (stage,)
    for tmpl_ in ('lcap_%s_rev_passed_jump', 'lcap_%s_rev_passed_incr')
    for stage in config['stages']
]

l_cols_clust_ = ['devel_is_dynamic', 'ageing_is_dynamic', 'devel_cluster_label', 'ageing_cluster_label']

# Column-wise concatenation: rows are matched on the index of each frame.
df_ = pd.concat(
    [
        df_regl[l_cols_].rename(columns={
            'scap_pass_fwd': 'scap_fwd_passed',
            'scap_pass_rev': 'scap_rev_passed',
        }),
        df_atac[['atac_%s_height' % stage_ for stage_ in config['stages']] + ['atac_source']],
        df_lcap_fwd[l_cols_lcap_fwd_],
        df_lcap_rev[l_cols_lcap_rev_],
        df_maxgap_fwd,
        df_maxgap_rev,
        df_clust_compact[l_cols_clust_],
    ],
    axis=1,
)

# Rows whose strand annotation is one of these keep their promoter gene columns.
l_promoter_annots_ = ['coding_promoter', 'pseudogene_promoter', 'non-coding_RNA']
p_fwd = df_['annot_fwd'].isin(l_promoter_annots_)
p_rev = df_['annot_rev'].isin(l_promoter_annots_)
# Rows with one of these overall annotations keep their associated-gene columns.
assoc = df_['annot'].isin(['unknown_promoter', 'putative_enhancer', 'other_element'])

# Blank out (set to '.') the gene columns on every row that does not qualify.
l_blanking_ = [
    (p_fwd, ['promoter_gene_id_fwd', 'promoter_locus_id_fwd', 'promoter_gene_biotype_fwd']),
    (p_rev, ['promoter_gene_id_rev', 'promoter_locus_id_rev', 'promoter_gene_biotype_rev']),
    (assoc, ['associated_gene_id', 'associated_locus_id']),
]
for p_keep_, l_blank_cols_ in l_blanking_:
    for col_blank_ in l_blank_cols_:
        df_.loc[~p_keep_, col_blank_] = '.'

df_.to_csv(fp_, sep='\t', header=True, index=False)