notebook.community

Edit and run



In [1]:

    
# Set annotation to annot_ce10
%run ~/relmapping/annot/notebooks/annot__init__.ipynb
annot_ = 'annot_ce10_eLife_full'

df_atac = pd.read_csv(os.path.join(annot_, 'accessible_sites.tsv'), sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
df_lcap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_fwd.tsv'), sep='\t', low_memory=False)
df_lcap_rev = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_rev.tsv'), sep='\t', low_memory=False)
df_exon_fwd = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t', low_memory=False)
df_exon_rev = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t', low_memory=False)
df_maxgap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_fwd.tsv'), sep='\t')
df_maxgap_rev = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_rev.tsv'), sep='\t')
df_scap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_fwd.tsv'), sep='\t')
df_scap_rev = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_rev.tsv'), sep='\t')

df_prom_fwd = pd.read_csv(os.path.join(annot_, 'metrics_type', 'prom_fwd.tsv'), sep='\t')
df_prom_rev = pd.read_csv(os.path.join(annot_, 'metrics_type', 'prom_rev.tsv'), sep='\t')
df_regl = pd.read_csv(os.path.join(annot_, 'metrics_type', 'regl.tsv'), sep='\t')

def mp(fp, annot_=annot_):
    """Return the path of `fp` inside the 'Source Data' folder of an annotation directory.

    fp     -- file name (or relative path) to place under 'Source Data'
    annot_ -- annotation directory; defaults to the value the global `annot_`
              holds when this function is defined
    """
    source_data_dir = os.path.join(annot_, 'Source Data')
    return os.path.join(source_data_dir, fp)









    



/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools






    



os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping



In [2]:

    
# Fig 2 - source data 1 (regulatory annotation): stricter subset of the
# (most informative) annotation metrics, written as a tab-separated table.
fp_ = os.path.join(annot_, 'Source Data/Fig 2 - source data 1. Regulatory annotation.txt')

l_cols_ = [
    'chrom', 'start', 'end',
    'annot', 'annot_fwd', 'annot_rev',
    'promoter_gene_id_fwd', 'promoter_locus_id_fwd', 'promoter_gene_biotype_fwd',
    'promoter_gene_id_rev', 'promoter_locus_id_rev', 'promoter_gene_biotype_rev',
    'associated_gene_id', 'associated_locus_id',
    'tss_fwd', 'tss_rev', 'scap_pass_fwd', 'scap_pass_rev',
]

# Per-stage lcap flags: every "jump" column first, then every "incr" column.
l_cols_fwd_ = [
    'lcap_%s_fwd_passed_%s' % (stage, test)
    for test in ('jump', 'incr')
    for stage in config['stages']
]
l_cols_rev_ = [
    'lcap_%s_rev_passed_%s' % (stage, test)
    for test in ('jump', 'incr')
    for stage in config['stages']
]

# Column-wise join of the three tables (aligned on their row index), then
# rename the scap flags to the "<assay>_<strand>_passed" form.
df_ = pd.concat([df_regl[l_cols_], df_lcap_fwd[l_cols_fwd_], df_lcap_rev[l_cols_rev_]], axis=1)
df_ = df_.rename(columns={'scap_pass_fwd': 'scap_fwd_passed', 'scap_pass_rev': 'scap_rev_passed'})

# Rows whose strand-specific annotation is not one of these types get their
# promoter gene fields for that strand replaced by '.'.
l_promoter_annots_ = ['coding_promoter', 'pseudogene_promoter', 'non-coding_RNA']
p_fwd = df_['annot_fwd'].isin(l_promoter_annots_)
p_rev = df_['annot_rev'].isin(l_promoter_annots_)

for strand_, p_strand_ in (('fwd', p_fwd), ('rev', p_rev)):
    for field_ in ('gene_id', 'locus_id', 'gene_biotype'):
        df_.loc[~p_strand_, 'promoter_%s_%s' % (field_, strand_)] = '.'

# Associated-gene fields are kept only for these three annotation types.
assoc = df_['annot'].isin(['unknown_promoter', 'putative_enhancer', 'other_element'])
for col_ in ('associated_gene_id', 'associated_locus_id'):
    df_.loc[~assoc, col_] = '.'

df_.to_csv(fp_, sep='\t', index=False, header=True)



In [3]:

    
# Load the ATAC dynamics supplementary table and preview its first rows.
fp_clust_ = os.path.join(annot_, 'metrics_atac_dynamics/20180530.supp-table.txt')
df_clust = pd.read_csv(fp_clust_, sep='\t')
df_clust.head()









    Out[3]:







  
    
      
      chr
      start
      stop
      annot
      associated_gene_id
      associated_locus_id
      ATACdc.dev
      ATACdc.age
      clustersATAC.dev
      clustersATAC.age
    
  
  
    
      0
      chrI
      1900
      2051
      putative_enhancer
      .
      .
      True
      False
      NaN
      NaN
    
    
      1
      chrI
      3826
      3977
      non-coding_RNA
      WBGene00023193
      Y74C9A.6
      True
      False
      NaN
      NaN
    
    
      2
      chrI
      4276
      4427
      putative_enhancer
      WBGene00022277
      homt-1
      True
      False
      NaN
      NaN
    
    
      3
      chrI
      11272
      11423
      coding_promoter
      WBGene00022276,WBGene00022277
      homt-1,nlp-40
      True
      False
      I1
      stable
    
    
      4
      chrI
      13070
      13221
      putative_enhancer
      WBGene00022276
      nlp-40
      True
      False
      NaN
      NaN



In [4]:

    
# Count, per coordinate column, the rows on which df_regl and df_clust agree
# (presumably each count should equal the number of sites -- verify by eye).
for col_regl_, col_clust_ in (('chrom', 'chr'), ('start', 'start'), ('end', 'stop')):
    print((df_regl[col_regl_] == df_clust[col_clust_]).sum())



In [5]:

    
# Number of sites per 'clustersATAC.dev' label (value_counts skips NaN by default).
df_clust.loc[:, 'clustersATAC.dev'].value_counts()









    Out[5]:





stable    3397
I2        1056
Mix3       938
Mix8       893
Mix1       822
Mix6       802
I1         722
G3         642
G4         626
G1         585
Mix2       574
N+M        563
H          525
Mix4       465
Mix7       450
G2         276
Mix5       260
Name: clustersATAC.dev, dtype: int64



In [6]:

    
# Number of sites per 'clustersATAC.age' label (value_counts skips NaN by default).
df_clust.loc[:, 'clustersATAC.age'].value_counts()









    Out[6]:





stable     11796
I+H [2]      409
I            304
Mix3         273
I+H [1]      252
Mix1         210
Mix5         142
Mix4         112
Mix2          98
Name: clustersATAC.age, dtype: int64



In [7]:

    
# Annotation type (rows) against 'clustersATAC.dev' label (columns).
pd.crosstab(index=df_clust['annot'], columns=df_clust['clustersATAC.dev'])









    Out[7]:







  
    
      clustersATAC.dev
      G1
      G2
      G3
      G4
      H
      I1
      I2
      Mix1
      Mix2
      Mix3
      Mix4
      Mix5
      Mix6
      Mix7
      Mix8
      N+M
      stable
    
    
      annot
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      coding_promoter
      585
      276
      642
      626
      525
      722
      1056
      822
      574
      938
      465
      260
      802
      450
      893
      563
      3397



In [8]:

    
# Annotation type (rows) against 'clustersATAC.age' label (columns).
pd.crosstab(index=df_clust['annot'], columns=df_clust['clustersATAC.age'])









    Out[8]:







  
    
      clustersATAC.age
      I
      I+H [1]
      I+H [2]
      Mix1
      Mix2
      Mix3
      Mix4
      Mix5
      stable
    
    
      annot
      
      
      
      
      
      
      
      
      
    
  
  
    
      coding_promoter
      304
      252
      409
      210
      98
      273
      112
      142
      11796



In [9]:

    
# Compact view of the cluster table: coordinates, the two "is dynamic" flags and
# the two cluster labels, renamed to the column names used in the exported tables.
# Missing cluster labels are written as '.'.
#
# The fill is done with DataFrame.fillna inside the chain rather than with
# `df[col].fillna(..., inplace=True)`: the latter is a chained in-place call on a
# column selection and, with pandas Copy-on-Write, leaves the parent frame
# unchanged, so NaN labels would leak into the exported files.
df_clust_compact = (
    df_clust[['chr', 'start', 'stop', 'ATACdc.dev', 'ATACdc.age', 'clustersATAC.dev', 'clustersATAC.age']]
    .rename(columns={
        'chr': 'chrom',
        'stop': 'end',
        'ATACdc.dev': 'devel_is_dynamic',
        'ATACdc.age': 'ageing_is_dynamic',
        'clustersATAC.dev': 'devel_cluster_label',
        'clustersATAC.age': 'ageing_cluster_label',
    })
    .reset_index(drop=True)
    .fillna({'devel_cluster_label': '.', 'ageing_cluster_label': '.'})
)
print(len(df_clust_compact))
df_clust_compact.head(20)









    



42245






    Out[9]:







  
    
      
      chrom
      start
      end
      devel_is_dynamic
      ageing_is_dynamic
      devel_cluster_label
      ageing_cluster_label
    
  
  
    
      0
      chrI
      1900
      2051
      True
      False
      .
      .
    
    
      1
      chrI
      3826
      3977
      True
      False
      .
      .
    
    
      2
      chrI
      4276
      4427
      True
      False
      .
      .
    
    
      3
      chrI
      11272
      11423
      True
      False
      I1
      stable
    
    
      4
      chrI
      13070
      13221
      True
      False
      .
      .
    
    
      5
      chrI
      15430
      15581
      True
      False
      .
      .
    
    
      6
      chrI
      15722
      15873
      True
      False
      .
      .
    
    
      7
      chrI
      16958
      17109
      True
      False
      .
      .
    
    
      8
      chrI
      22266
      22417
      True
      False
      .
      .
    
    
      9
      chrI
      22895
      23046
      False
      False
      .
      .
    
    
      10
      chrI
      24452
      24603
      False
      True
      .
      .
    
    
      11
      chrI
      26902
      27053
      True
      False
      I2
      stable
    
    
      12
      chrI
      27082
      27233
      True
      False
      .
      .
    
    
      13
      chrI
      28163
      28314
      True
      False
      .
      .
    
    
      14
      chrI
      30271
      30422
      False
      False
      .
      .
    
    
      15
      chrI
      31958
      32109
      False
      False
      .
      .
    
    
      16
      chrI
      33340
      33491
      False
      False
      .
      .
    
    
      17
      chrI
      33764
      33915
      True
      False
      .
      .
    
    
      18
      chrI
      34239
      34390
      False
      False
      .
      .
    
    
      19
      chrI
      34626
      34777
      False
      False
      .
      .



In [10]:

    
# Write the compact cluster table as the tab-separated Fig 4 source data file.
fp_fig4_ = mp('Fig 4 - source data 1. Promoter accessibility.txt')
df_clust_compact.to_csv(fp_fig4_, sep='\t', header=True, index=False)



In [37]:

    
# reg_elements_eLife_full_review_expanded.tsv: expanded regulatory element table.
# On top of the core annotation columns it carries the *_ce11 coordinate columns,
# the detailed per-strand annotation, per-stage ATAC heights, lcap pass flags,
# all maxgap columns and the devel/ageing dynamics labels.
fp_ = os.path.join(annot_, 'reg_elements_eLife_full_review_expanded.tsv')

# Every table below is combined row-by-row via index alignment. pandas does not
# fail on a row-count mismatch (it pads with NaN), so check the counts explicitly
# before anything is assigned or concatenated.
n_sites_ = len(df_regl)
for name_, frame_ in (
    ('df_prom_fwd', df_prom_fwd),
    ('df_prom_rev', df_prom_rev),
    ('df_atac', df_atac),
    ('df_lcap_fwd', df_lcap_fwd),
    ('df_lcap_rev', df_lcap_rev),
    ('df_maxgap_fwd', df_maxgap_fwd),
    ('df_maxgap_rev', df_maxgap_rev),
    ('df_clust_compact', df_clust_compact),
):
    assert len(frame_) == n_sites_, \
        '%s has %d rows but df_regl has %d' % (name_, len(frame_), n_sites_)

# NOTE: this adds two columns to the shared df_regl frame in place.
df_regl['annot_detailed_fwd'] = df_prom_fwd['annot_detailed_summary']
df_regl['annot_detailed_rev'] = df_prom_rev['annot_detailed_summary']

l_cols_ = [
    'chrom', 'start', 'end', 'chrom_ce11', 'start_ce11', 'end_ce11',
    'annot', 'annot_fwd', 'annot_rev', 'annot_detailed_fwd', 'annot_detailed_rev',
    'promoter_gene_id_fwd', 'promoter_locus_id_fwd', 'promoter_gene_biotype_fwd',
    'promoter_gene_id_rev', 'promoter_locus_id_rev', 'promoter_gene_biotype_rev',
    'associated_gene_id', 'associated_locus_id',
    'tss_fwd', 'tss_rev', 'scap_pass_fwd', 'scap_pass_rev',
]

# Per-stage lcap flags: every "jump" column first, then every "incr" column.
l_cols_lcap_fwd_ = \
    ['lcap_%s_fwd_passed_jump' % (stage,) for stage in config['stages']] +\
    ['lcap_%s_fwd_passed_incr' % (stage,) for stage in config['stages']]

l_cols_lcap_rev_ = \
    ['lcap_%s_rev_passed_jump' % (stage,) for stage in config['stages']] +\
    ['lcap_%s_rev_passed_incr' % (stage,) for stage in config['stages']]

l_cols_clust_ = ['devel_is_dynamic', 'ageing_is_dynamic', 'devel_cluster_label', 'ageing_cluster_label']

df_ = pd.concat([
    df_regl[l_cols_].rename(columns={'scap_pass_fwd': 'scap_fwd_passed', 'scap_pass_rev': 'scap_rev_passed'}),
    df_atac[['atac_%s_height' % stage_ for stage_ in config['stages']] + ['atac_source']],
    df_lcap_fwd[l_cols_lcap_fwd_],
    df_lcap_rev[l_cols_lcap_rev_],
    df_maxgap_fwd,
    df_maxgap_rev,
    df_clust_compact[l_cols_clust_]], axis=1)

# Equal row counts with differing index labels would still grow the result.
assert len(df_) == n_sites_, \
    'column-wise concat produced %d rows, expected %d: input indexes differ' % (len(df_), n_sites_)

# Rows whose strand-specific annotation is not one of these types get their
# promoter gene fields for that strand replaced by '.'.
l_promoter_annots_ = ['coding_promoter', 'pseudogene_promoter', 'non-coding_RNA']
p_fwd = df_['annot_fwd'].isin(l_promoter_annots_)
p_rev = df_['annot_rev'].isin(l_promoter_annots_)

for strand_, p_strand_ in (('fwd', p_fwd), ('rev', p_rev)):
    for field_ in ('gene_id', 'locus_id', 'gene_biotype'):
        df_.loc[~p_strand_, 'promoter_%s_%s' % (field_, strand_)] = '.'

# Associated-gene fields are kept only for these three annotation types.
assoc = df_['annot'].isin(['unknown_promoter', 'putative_enhancer', 'other_element'])
for col_ in ('associated_gene_id', 'associated_locus_id'):
    df_.loc[~assoc, col_] = '.'

df_.to_csv(fp_, header=True, index=False, sep='\t')

	chr	start	stop	annot	associated_gene_id	associated_locus_id	ATACdc.dev	ATACdc.age	clustersATAC.dev	clustersATAC.age
0	chrI	1900	2051	putative_enhancer	.	.	True	False	NaN	NaN
1	chrI	3826	3977	non-coding_RNA	WBGene00023193	Y74C9A.6	True	False	NaN	NaN
2	chrI	4276	4427	putative_enhancer	WBGene00022277	homt-1	True	False	NaN	NaN
3	chrI	11272	11423	coding_promoter	WBGene00022276,WBGene00022277	homt-1,nlp-40	True	False	I1	stable
4	chrI	13070	13221	putative_enhancer	WBGene00022276	nlp-40	True	False	NaN	NaN

clustersATAC.dev	G1	G2	G3	G4	H	I1	I2	Mix1	Mix2	Mix3	Mix4	Mix5	Mix6	Mix7	Mix8	N+M	stable
annot
coding_promoter	585	276	642	626	525	722	1056	822	574	938	465	260	802	450	893	563	3397