In [1]:
%run ~/relmapping/annot/notebooks/annot__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
# Read the regulatory-element table. The coordinate/TSS columns that carry no
# assembly suffix in the file are tagged with `_ce10`; the `*_ce11` columns
# used below are taken from the file as-is.
fp_ = 'annot_eLife_full/reg_elements_eLife_full_review_expanded.tsv'
ce10_renames = {name: name + '_ce10' for name in ('chrom', 'start', 'end', 'tss_fwd', 'tss_rev')}
df_regl = pd.read_csv(fp_, sep='\t').rename(columns=ce10_renames)

# Integer-truncated midpoint of every element's interval, once per assembly.
l_atac_peak_pos_ce10 = df_regl[['start_ce10', 'end_ce10']].mean(axis=1).map(int)
l_atac_peak_pos_ce11 = df_regl[['start_ce11', 'end_ce11']].mean(axis=1).map(int)

# Derive the ce11 TSS columns: express each ce10 TSS relative to the ce10
# midpoint, then re-anchor it on the ce11 midpoint of the same element.
for strand in ('fwd', 'rev'):
    tss_ce10 = df_regl['tss_%s_ce10' % strand]
    df_regl['tss_%s_ce11' % strand] = tss_ce10 - l_atac_peak_pos_ce10 + l_atac_peak_pos_ce11

In [3]:
# Load the per-element TF assignment table and copy three of its columns onto
# df_regl under the names used by the exports further down.
fp_ = 'annot_eLife_revised/20180223_modERN_modENCODE_peak_assignment/regulatory_elements.mid_flk200bp.mod_TF_assignment.with_HOTness.corrected.txt'
df_tf = pd.read_csv(fp_, sep='\t')

# The columns are copied by index label, so both tables must at least have
# the same number of rows.
assert len(df_regl) == len(df_tf)

tf_column_sources = (
    ('HOTness', 'HOTness'),
    ('factor_count', 'cnt'),
    ('factor_names', 'TF'),
)
for target_col, source_col in tf_column_sources:
    df_regl[target_col] = df_tf[source_col]

# Sanity view: how factor counts are distributed over the HOTness classes.
pd.crosstab(df_regl['factor_count'], df_regl['HOTness'])


Out[3]:
HOTness HOT cold none
factor_count
0 0 0 12854
1 0 6301 0
2 0 3801 0
3 0 2473 0
4 0 1825 0
5 0 1519 0
6 0 1275 0
7 0 1118 0
8 0 866 0
9 0 733 0
10 0 644 0
11 0 511 0
12 0 455 0
13 0 434 0
14 0 359 0
15 0 322 0
16 0 318 0
17 0 280 0
18 0 263 0
19 247 0 0
20 227 0 0
21 215 0 0
22 206 0 0
23 206 0 0
24 175 0 0
25 157 0 0
26 159 0 0
27 146 0 0
28 125 0 0
29 130 0 0
... ... ... ...
138 6 0 0
139 8 0 0
140 10 0 0
141 4 0 0
142 6 0 0
143 8 0 0
144 9 0 0
145 4 0 0
146 7 0 0
147 6 0 0
148 3 0 0
149 5 0 0
150 1 0 0
151 2 0 0
152 7 0 0
153 5 0 0
154 3 0 0
155 4 0 0
156 1 0 0
157 2 0 0
158 1 0 0
159 1 0 0
160 1 0 0
162 1 0 0
164 2 0 0
165 3 0 0
166 2 0 0
169 1 0 0
170 1 0 0
173 1 0 0

168 rows × 3 columns


In [4]:
# Figure 1 source data: ce10 coordinates, one ATAC height column per stage
# listed in config['stages'], and the atac_source column.
fp_ = 'annot_eLife_revised/Figure 1-source data 1. Accessible sites.txt'
col_ = ['chrom_ce10', 'start_ce10', 'end_ce10']
col_.extend('atac_%s_height' % (stage,) for stage in config['stages'])
col_.append('atac_source')
df_regl[col_].to_csv(fp_, sep='\t', index=False, header=True)

In [5]:
# Figure 2 source data: coordinates in both assemblies, annotation and gene
# assignment columns, TSS positions, and the scap/lcap pass flags.
fp_ = 'annot_eLife_revised/Figure 2-source data 1. Regulatory annotation.txt'

coordinate_cols = ['chrom_ce10', 'start_ce10', 'end_ce10', 'chrom_ce11', 'start_ce11', 'end_ce11']
annotation_cols = ['annot', 'annot_fwd', 'annot_rev']
gene_cols = [
    'promoter_gene_id_fwd', 'promoter_locus_id_fwd', 'promoter_gene_biotype_fwd',
    'promoter_gene_id_rev', 'promoter_locus_id_rev', 'promoter_gene_biotype_rev',
    'associated_gene_id', 'associated_locus_id',
]
tss_cols = ['tss_fwd_ce10', 'tss_rev_ce10', 'tss_fwd_ce11', 'tss_rev_ce11']
scap_cols = ['scap_fwd_passed', 'scap_rev_passed']

# Column order: all stages for fwd/jump, then fwd/incr, rev/jump, rev/incr.
lcap_cols = [
    'lcap_%s_%s_passed_%s' % (stage, strand, test)
    for strand in ('fwd', 'rev')
    for test in ('jump', 'incr')
    for stage in config['stages']
]

col_ = coordinate_cols + annotation_cols + gene_cols + tss_cols + scap_cols + lcap_cols
df_regl[col_].to_csv(fp_, sep='\t', index=False, header=True)

In [6]:
# Figure 4 source data: ce10 coordinates, the dynamics/cluster columns for
# development and ageing, and the TF columns attached from df_tf.
fp_ = 'annot_eLife_revised/Figure 4-source data 1. Promoter accessibility.txt'

coordinate_cols = ['chrom_ce10', 'start_ce10', 'end_ce10']
dynamics_cols = ['devel_is_dynamic', 'ageing_is_dynamic', 'devel_cluster_label', 'ageing_cluster_label']
factor_cols = ['HOTness', 'factor_count', 'factor_names']

col_ = coordinate_cols + dynamics_cols + factor_cols
df_regl[col_].to_csv(fp_, sep='\t', index=False, header=True)

In [7]:
# Figure 5 source data: re-export the factor/dataset listing. Column names
# are supplied on read, and written out as a header row on export.
tf_dataset_columns = ['factor', 'dataset_name', 'dataset_id']
fp_ = 'annot_eLife_revised/20180223_modERN_modENCODE_peak_assignment/ce10.mod_factors_datasets.txt'
df_ = pd.read_csv(fp_, names=tf_dataset_columns, sep='\t')

fp_ = 'annot_eLife_revised/Figure 5-source data 1. TF datasets.txt'
df_.to_csv(fp_, sep='\t', index=False, header=True)