In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [5]:
# Load the regulatory-region table via the project helper and report its size.
df_regl = regl_Apr27()
print('%d regions loaded' % (len(df_regl),))

# Export only the BED3 columns (yp.NAMES_BED3) as a headerless, tab-separated file.
df_regl[yp.NAMES_BED3].to_csv(
    'annot_Apr27/Fig1D1_accessible_sites.bed',
    header=False,
    index=False,
    sep='\t',
)


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/ipykernel_launcher.py:64: RuntimeWarning: Mean of empty slice
13054 of 42245 sites with CV values via promoter annotation
26764 of 42245 sites with CV values via "associated gene"
42245 regions loaded

In [20]:
# annot_Apr27/Fig2D2_regulatory_annotation_Apr27.tsv <= add ce11 coordinates by lifting over the peak position
# NOTE(review): no liftover step is present in this cell as saved, although the
# captured output below shows one ran -- the table is re-read and re-written as-is.
fp_regl = 'annot/Fig2D2_regulatory_annotation_Apr27/Fig2D2_regulatory_annotation_Apr27.tsv'
df_regl = pd.read_csv(fp_regl, sep='\t')
print('%d regions loaded' % (len(df_regl),))

# Save a tab-separated copy (without the index) under annot_Apr27/.
df_regl.to_csv(
    'annot_Apr27/Fig2D2_regulatory_annotation_Apr27.tsv',
    index=False,
    sep='\t',
)


42245 regions loaded
Input file  = /mnt/home1/ahringer/jj374/lab/HTSProcessing/.liftover/fp_inp.tsv 
Output file = /mnt/home1/ahringer/jj374/lab/HTSProcessing/.liftover/fp_out.tsv 
Lines processed from input file              = 42245
Comment / blank lines written to output file = 0
Data lines written to output file            = 42245
Number of records changed                    = 40852

In [21]:
# Re-read the Fig2D2 BED file and write it back to the same path with strand,
# colour and annotation attributes attached.
# NOTE(review): df_regl_ (trailing underscore) is not defined in any visible cell;
# it presumably holds the annotation table row-aligned with df_ -- confirm before re-running.
fp_ = 'annot/Fig2D2_regulatory_annotation_Apr27/Fig2D2_regulatory_annotation_Apr27.bed'
df_ = read_gffbed(fp_)
write_gffbed(
    fp_,
    chrom = df_['chrom'],
    start = df_['start'],
    end = df_['end'],
    name = df_['Name'],
    strand = df_regl_['strand'],
    # One colour per row, looked up from the annotation -> colour legend.
    itemRgb = [d_annot_legend[annot] for annot in df_regl_['annot']],
    attr = df_regl_[['annot', 'annot_fwd', 'annot_rev', 'annot_detailed_fwd', 'annot_detailed_rev']],
)


/mnt/home1/ahringer/jj374/relmapping/scripts/yarp/yarp.py:400: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  df_name = df_name.convert_objects(convert_numeric=True)
Out[21]:
chrom start end score strand thickStart thickEnd itemRgb Name annot annot_detailed_fwd annot_detailed_rev annot_fwd annot_rev
0 chrI 1900 2051 0 . 1900 2051 NaN putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
1 chrI 3826 3977 0 - 3826 3977 NaN Y74C9A.6 non-coding_RNA unknown_promoter_jump_scap non-coding_RNA unknown_promoter non-coding_RNA
2 chrI 4276 4427 0 . 4276 4427 NaN (homt-1) putative_enhancer no_transcription transcription_initiation no_transcription transcription_initiation
3 chrI 11272 11423 0 . 11272 11423 NaN homt-1%20/%20nlp-40 coding_promoter coding_promoter_jump_scap coding_promoter_incr_scap coding_promoter coding_promoter
4 chrI 13070 13221 0 . 13070 13221 NaN (nlp-40) putative_enhancer no_transcription transcription_initiation no_transcription transcription_initiation
5 chrI 15430 15581 0 - 15430 15581 NaN (nlp-40) unknown_promoter transcription_initiation unknown_promoter_jump_scap transcription_initiation unknown_promoter
6 chrI 15722 15873 0 . 15722 15873 NaN (nlp-40) putative_enhancer transcription_initiation no_transcription transcription_initiation no_transcription
7 chrI 16958 17109 0 - 16958 17109 NaN unknown_promoter transcription_initiation unknown_promoter_jump_scap transcription_initiation unknown_promoter
8 chrI 22266 22417 0 . 22266 22417 NaN (rcor-1) putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
9 chrI 22895 23046 0 . 22895 23046 NaN (Y74C9A.9,rcor-1) other_element no_transcription no_transcription no_transcription no_transcription
10 chrI 24452 24603 0 + 24452 24603 NaN (rcor-1) unknown_promoter unknown_promoter_jump_scap no_transcription unknown_promoter no_transcription
11 chrI 26902 27053 0 - 26902 27053 NaN rcor-1 coding_promoter no_transcription coding_promoter_jump_scap no_transcription coding_promoter
12 chrI 27082 27233 0 . 27082 27233 NaN putative_enhancer transcription_initiation no_transcription transcription_initiation no_transcription
13 chrI 28163 28314 0 . 28163 28314 NaN (sesn-1) other_element no_transcription no_transcription no_transcription no_transcription
14 chrI 30271 30422 0 . 30271 30422 NaN (sesn-1) putative_enhancer no_transcription transcription_initiation no_transcription transcription_initiation
15 chrI 31958 32109 0 . 31958 32109 NaN (sesn-1) putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
16 chrI 33340 33491 0 . 33340 33491 NaN (sesn-1) other_element no_transcription no_transcription no_transcription no_transcription
17 chrI 33764 33915 0 . 33764 33915 NaN (sesn-1) putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
18 chrI 34239 34390 0 . 34239 34390 NaN (sesn-1) other_element no_transcription no_transcription no_transcription no_transcription
19 chrI 34626 34777 0 . 34626 34777 NaN (sesn-1) putative_enhancer no_transcription transcription_initiation no_transcription transcription_initiation
20 chrI 35381 35532 0 . 35381 35532 NaN (sesn-1) putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
21 chrI 36018 36169 0 - 36018 36169 NaN sesn-1 coding_promoter transcription_initiation coding_promoter_jump_scap transcription_initiation coding_promoter
22 chrI 36471 36622 0 . 36471 36622 NaN (sesn-1) putative_enhancer no_transcription transcription_initiation no_transcription transcription_initiation
23 chrI 37988 38139 0 . 37988 38139 NaN (sesn-1) other_element no_transcription no_transcription no_transcription no_transcription
24 chrI 38986 39137 0 . 38986 39137 NaN (sesn-1) putative_enhancer transcription_initiation no_transcription transcription_initiation no_transcription
25 chrI 39776 39927 0 . 39776 39927 NaN (sesn-1) unknown_promoter unknown_promoter_jump_scap unknown_promoter_jump_scap unknown_promoter unknown_promoter
26 chrI 40327 40478 0 . 40327 40478 NaN (sesn-1) other_element no_transcription no_transcription no_transcription no_transcription
27 chrI 40869 41020 0 . 40869 41020 NaN (sesn-1) putative_enhancer no_transcription transcription_initiation no_transcription transcription_initiation
28 chrI 41422 41573 0 . 41422 41573 NaN (sesn-1) putative_enhancer transcription_initiation no_transcription transcription_initiation no_transcription
29 chrI 42126 42277 0 - 42126 42277 NaN sesn-1 coding_promoter transcription_initiation coding_promoter_incr_scap transcription_initiation coding_promoter
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
42215 chrX 17638436 17638587 0 . 17638436 17638587 NaN (H18N23.2) putative_enhancer no_transcription transcription_initiation no_transcription transcription_initiation
42216 chrX 17639006 17639157 0 . 17639006 17639157 NaN (H18N23.2) putative_enhancer no_transcription transcription_initiation no_transcription transcription_initiation
42217 chrX 17639434 17639585 0 - 17639434 17639585 NaN H18N23.2 coding_promoter transcription_initiation coding_promoter_jump_scap transcription_initiation coding_promoter
42218 chrX 17639698 17639849 0 . 17639698 17639849 NaN (H18N23.2) putative_enhancer no_transcription transcription_initiation no_transcription transcription_initiation
42219 chrX 17640787 17640938 0 - 17640787 17640938 NaN H18N23.2 coding_promoter no_transcription coding_promoter_jump_scap no_transcription coding_promoter
42220 chrX 17649160 17649311 0 + 17649160 17649311 NaN tmem-218 coding_promoter coding_promoter_lowconf no_transcription coding_promoter no_transcription
42221 chrX 17649969 17650120 0 . 17649969 17650120 NaN (tmem-218) putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
42222 chrX 17656707 17656858 0 . 17656707 17656858 NaN putative_enhancer transcription_initiation no_transcription transcription_initiation no_transcription
42223 chrX 17667273 17667424 0 . 17667273 17667424 NaN putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
42224 chrX 17668550 17668701 0 . 17668550 17668701 NaN putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
42225 chrX 17670419 17670570 0 + 17670419 17670570 NaN T23E7.2 coding_promoter coding_promoter_incr_scap transcription_initiation coding_promoter transcription_initiation
42226 chrX 17672384 17672535 0 . 17672384 17672535 NaN (T23E7.2) putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
42227 chrX 17674332 17674483 0 . 17674332 17674483 NaN (T23E7.2) putative_enhancer transcription_initiation no_transcription transcription_initiation no_transcription
42228 chrX 17677450 17677601 0 . 17677450 17677601 NaN (T23E7.2) putative_enhancer transcription_initiation no_transcription transcription_initiation no_transcription
42229 chrX 17681563 17681714 0 . 17681563 17681714 NaN (T23E7.6) other_element no_transcription no_transcription no_transcription no_transcription
42230 chrX 17682372 17682523 0 . 17682372 17682523 NaN (T23E7.6) putative_enhancer no_transcription transcription_initiation no_transcription transcription_initiation
42231 chrX 17683445 17683596 0 . 17683445 17683596 NaN (T23E7.6) other_element no_transcription no_transcription no_transcription no_transcription
42232 chrX 17683766 17683917 0 . 17683766 17683917 NaN (T23E7.6) putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
42233 chrX 17684817 17684968 0 - 17684817 17684968 NaN T23E7.6 coding_promoter transcription_initiation coding_promoter_jump_scap transcription_initiation coding_promoter
42234 chrX 17685953 17686104 0 - 17685953 17686104 NaN T23E7.6 coding_promoter transcription_initiation coding_promoter_incr_scap transcription_initiation coding_promoter
42235 chrX 17686553 17686704 0 . 17686553 17686704 NaN other_element no_transcription no_transcription no_transcription no_transcription
42236 chrX 17688337 17688488 0 . 17688337 17688488 NaN putative_enhancer no_transcription transcription_initiation no_transcription transcription_initiation
42237 chrX 17694712 17694863 0 + 17694712 17694863 NaN cgt-2 coding_promoter coding_promoter_incr_scap unknown_promoter_jump_scap coding_promoter unknown_promoter
42238 chrX 17695583 17695734 0 . 17695583 17695734 NaN (cgt-2) other_element no_transcription no_transcription no_transcription no_transcription
42239 chrX 17697033 17697184 0 . 17697033 17697184 NaN (cgt-2) other_element no_transcription no_transcription no_transcription no_transcription
42240 chrX 17702041 17702192 0 . 17702041 17702192 NaN putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
42241 chrX 17702679 17702830 0 - 17702679 17702830 NaN unknown_promoter transcription_initiation unknown_promoter_jump_scap transcription_initiation unknown_promoter
42242 chrX 17714406 17714557 0 . 17714406 17714557 NaN (6R55.2) putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
42243 chrX 17714878 17715029 0 . 17714878 17715029 NaN (cTel55X.1) putative_enhancer transcription_initiation transcription_initiation transcription_initiation transcription_initiation
42244 chrX 17718375 17718526 0 . 17718375 17718526 NaN (cTel55X.1) other_element no_transcription no_transcription no_transcription no_transcription

42245 rows × 14 columns


In [19]:
# annot_Apr27/Fig2D2_regulatory_annotation_Apr27_ce10.bed
# annot_Apr27/Fig2D2_regulatory_annotation_Apr27_ce11.bed




# NOTE(review): write_regl_bed9 is defined outside the visible cells; presumably it
# exports df_regl as a BED9 track to the given path -- confirm against its definition.
write_regl_bed9('annot/S2_regulatory_annotation/S2_regulatory_annotation.bed', df_regl)

In [4]:



Out[4]:
chrom start end annot annot_fwd annot_rev promoter_gene_id_fwd promoter_locus_id_fwd promoter_gene_biotype_fwd promoter_gene_id_rev ... lcap_glp1_d13_rev_baseMean lcap_glp1_d13_rev_log2FoldChange lcap_glp1_d13_rev_lfcSE lcap_glp1_d13_rev_stat lcap_glp1_d13_rev_pvalue lcap_glp1_d13_rev_padj lcap_glp1_d13_rev_passed_jump lcap_glp1_d13_rev_passed_incr lcap_glp1_d13_rev_passed lcap_glp1_d13_rev_summary
0 chrI 1900 2051 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 3.178684 -0.744335 1.818434 0.000000 6.588504e-01 0.960937 False False False False / -0.74 / 0.961
1 chrI 3826 3977 non-coding_RNA unknown_promoter non-coding_RNA NaN NaN NaN WBGene00023193 ... 79.875590 9.320055 1.876127 4.967710 3.387408e-07 0.000008 True True True True / 9.32 / 7.63e-06
2 chrI 4276 4427 putative_enhancer no_transcription transcription_initiation NaN NaN NaN NaN ... 0.935771 0.961622 2.531971 0.379792 3.520500e-01 NaN False False False False / 0.96 / nan
3 chrI 11272 11423 coding_promoter coding_promoter coding_promoter WBGene00022276 nlp-40 protein_coding WBGene00022277 ... 1.818028 4.155157 2.536240 1.638314 5.067809e-02 NaN False True True True / 4.16 / nan
4 chrI 13070 13221 putative_enhancer no_transcription transcription_initiation NaN NaN NaN NaN ... 0.939379 0.966986 2.530644 0.382111 3.511897e-01 NaN False False False False / 0.97 / nan
5 chrI 15430 15581 unknown_promoter transcription_initiation unknown_promoter NaN NaN NaN NaN ... 2.065274 2.218425 2.201454 1.007709 1.567971e-01 NaN False False False False / 2.22 / nan
6 chrI 15722 15873 putative_enhancer transcription_initiation no_transcription NaN NaN NaN NaN ... 0.225901 1.724769 2.715061 0.635260 2.626295e-01 NaN False False False False / 1.72 / nan
7 chrI 16958 17109 unknown_promoter transcription_initiation unknown_promoter NaN NaN NaN NaN ... 3.082683 -2.134230 1.939769 0.000000 8.643883e-01 1.000000 False False False False / -2.13 / 1
8 chrI 22266 22417 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 2.676034 -0.381491 1.983369 0.000000 5.762639e-01 0.919400 False False False False / -0.38 / 0.919
9 chrI 22895 23046 other_element no_transcription no_transcription NaN NaN NaN NaN ... 4.948591 -0.978505 1.641441 0.000000 7.244543e-01 0.989684 False False False False / -0.98 / 0.99
10 chrI 24452 24603 unknown_promoter unknown_promoter no_transcription NaN NaN NaN NaN ... 4.583494 0.226853 1.623153 0.139761 4.444246e-01 0.833373 False False False False / 0.23 / 0.833
11 chrI 26902 27053 coding_promoter no_transcription coding_promoter NaN NaN NaN WBGene00022278 ... 8.215405 6.331396 2.195703 2.883539 1.966168e-03 0.014332 True True True True / 6.33 / 0.0143
12 chrI 27082 27233 putative_enhancer transcription_initiation no_transcription NaN NaN NaN NaN ... 0.000000 NaN NaN NaN NaN NaN False False False False / nan / nan
13 chrI 28163 28314 other_element no_transcription no_transcription NaN NaN NaN NaN ... 67.648090 -1.010281 0.550585 0.000000 9.667414e-01 1.000000 False False False False / -1.01 / 1
14 chrI 30271 30422 putative_enhancer no_transcription transcription_initiation NaN NaN NaN NaN ... 99.734660 2.416385 0.508638 4.750699 1.013575e-06 0.000020 True False True True / 2.42 / 2.04e-05
15 chrI 31958 32109 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 21.869460 0.913242 0.823248 1.109316 1.336469e-01 0.405393 False False False False / 0.91 / 0.405
16 chrI 33340 33491 other_element no_transcription no_transcription NaN NaN NaN NaN ... 18.924300 0.507403 0.861093 0.589254 2.778455e-01 0.658279 False False False False / 0.51 / 0.658
17 chrI 33764 33915 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 9.850807 -2.245930 1.207230 0.000000 9.685854e-01 1.000000 False False False False / -2.25 / 1
18 chrI 34239 34390 other_element no_transcription no_transcription NaN NaN NaN NaN ... 21.839100 -0.975375 0.819698 0.000000 8.829612e-01 1.000000 False False False False / -0.98 / 1
19 chrI 34626 34777 putative_enhancer no_transcription transcription_initiation NaN NaN NaN NaN ... 34.023570 -0.852417 0.702671 0.000000 8.874561e-01 1.000000 False False False False / -0.85 / 1
20 chrI 35381 35532 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 20.930300 -0.821336 0.863603 0.000000 8.292124e-01 1.000000 False False False False / -0.82 / 1
21 chrI 36018 36169 coding_promoter transcription_initiation coding_promoter NaN NaN NaN WBGene00022279 ... 18.047870 0.160218 0.908451 0.176364 4.300040e-01 0.822835 False False False False / 0.16 / 0.823
22 chrI 36471 36622 putative_enhancer no_transcription transcription_initiation NaN NaN NaN NaN ... 17.149200 0.791456 0.899785 0.879605 1.895367e-01 0.520811 False False False False / 0.79 / 0.521
23 chrI 37988 38139 other_element no_transcription no_transcription NaN NaN NaN NaN ... 4.417178 1.535945 1.726358 0.889702 1.868130e-01 0.514856 False False False False / 1.54 / 0.515
24 chrI 38986 39137 putative_enhancer transcription_initiation no_transcription NaN NaN NaN NaN ... 5.537305 0.409431 1.423458 0.287631 3.868144e-01 0.784160 False False False False / 0.41 / 0.784
25 chrI 39776 39927 unknown_promoter unknown_promoter unknown_promoter NaN NaN NaN NaN ... 4.915581 1.359471 1.635157 0.831401 2.028735e-01 0.546743 False False False False / 1.36 / 0.547
26 chrI 40327 40478 other_element no_transcription no_transcription NaN NaN NaN NaN ... 4.528258 0.839680 1.545338 0.543364 2.934397e-01 0.682350 False False False False / 0.84 / 0.682
27 chrI 40869 41020 putative_enhancer no_transcription transcription_initiation NaN NaN NaN NaN ... 3.442388 -0.509802 1.842060 0.000000 6.090164e-01 0.941622 False False False False / -0.51 / 0.942
28 chrI 41422 41573 putative_enhancer transcription_initiation no_transcription NaN NaN NaN NaN ... 3.439419 -0.893309 1.855076 0.000000 6.849366e-01 0.971728 False False False False / -0.89 / 0.972
29 chrI 42126 42277 coding_promoter transcription_initiation coding_promoter NaN NaN NaN WBGene00022279 ... 7.576785 3.456194 1.502992 2.299542 1.073709e-02 0.057270 True False True True / 3.46 / 0.0573
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
42215 chrX 17638436 17638587 putative_enhancer no_transcription transcription_initiation NaN NaN NaN NaN ... 0.493851 -0.109219 2.712533 0.000000 5.160589e-01 NaN False False False False / -0.11 / nan
42216 chrX 17639006 17639157 putative_enhancer no_transcription transcription_initiation NaN NaN NaN NaN ... 1.429623 0.637829 2.232717 0.285674 3.875640e-01 NaN False False False False / 0.64 / nan
42217 chrX 17639434 17639585 coding_promoter transcription_initiation coding_promoter NaN NaN NaN WBGene00019211 ... 0.264343 -1.926593 2.714843 0.000000 7.610399e-01 NaN False False False False / -1.93 / nan
42218 chrX 17639698 17639849 putative_enhancer no_transcription transcription_initiation NaN NaN NaN NaN ... 0.773266 -2.927316 2.700890 0.000000 8.607808e-01 NaN False False False False / -2.93 / nan
42219 chrX 17640787 17640938 coding_promoter no_transcription coding_promoter NaN NaN NaN WBGene00019211 ... 2.901296 0.259763 1.806776 0.143771 4.428405e-01 0.832737 False False False False / 0.26 / 0.833
42220 chrX 17649160 17649311 coding_promoter coding_promoter no_transcription WBGene00044771 tmem-218 protein_coding NaN ... 0.455409 2.212343 2.721586 0.812887 2.081413e-01 NaN False False False False / 2.21 / nan
42221 chrX 17649969 17650120 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 0.000000 NaN NaN NaN NaN NaN False False False False / nan / nan
42222 chrX 17656707 17656858 putative_enhancer transcription_initiation no_transcription NaN NaN NaN NaN ... 0.459017 2.216214 2.721642 0.814293 2.077386e-01 NaN False False False False / 2.22 / nan
42223 chrX 17667273 17667424 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 1.276999 -1.594376 2.406250 0.000000 7.462059e-01 NaN False False False False / -1.59 / nan
42224 chrX 17668550 17668701 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 0.264343 -1.926593 2.714843 0.000000 7.610399e-01 NaN False False False False / -1.93 / nan
42225 chrX 17670419 17670570 coding_promoter coding_promoter transcription_initiation WBGene00020732 T23E7.2 protein_coding NaN ... 2.097443 1.326392 2.123742 0.624554 2.661318e-01 NaN False False False False / 1.33 / nan
42226 chrX 17672384 17672535 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 2.279773 -1.635725 2.073125 0.000000 7.849482e-01 NaN False False False False / -1.64 / nan
42227 chrX 17674332 17674483 putative_enhancer transcription_initiation no_transcription NaN NaN NaN NaN ... 0.459017 2.216214 2.721642 0.814293 2.077386e-01 NaN False False False False / 2.22 / nan
42228 chrX 17677450 17677601 putative_enhancer transcription_initiation no_transcription NaN NaN NaN NaN ... 2.932522 -0.171471 1.789776 0.000000 5.381627e-01 0.895613 False False False False / -0.17 / 0.896
42229 chrX 17681563 17681714 other_element no_transcription no_transcription NaN NaN NaN NaN ... 38.616220 -0.858805 0.705525 0.000000 8.882468e-01 1.000000 False False False False / -0.86 / 1
42230 chrX 17682372 17682523 putative_enhancer no_transcription transcription_initiation NaN NaN NaN NaN ... 36.764720 -0.904272 0.724846 0.000000 8.938997e-01 1.000000 False False False False / -0.90 / 1
42231 chrX 17683445 17683596 other_element no_transcription no_transcription NaN NaN NaN NaN ... 109.478200 -0.722793 0.594928 0.000000 8.878025e-01 1.000000 False False False False / -0.72 / 1
42232 chrX 17683766 17683917 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 147.833200 -0.553972 0.479052 0.000000 8.762400e-01 1.000000 False False False False / -0.55 / 1
42233 chrX 17684817 17684968 coding_promoter transcription_initiation coding_promoter NaN NaN NaN WBGene00044772 ... 24.899730 6.062400 1.373521 4.413766 5.079381e-06 0.000084 True False True True / 6.06 / 8.43e-05
42234 chrX 17685953 17686104 coding_promoter transcription_initiation coding_promoter NaN NaN NaN WBGene00044772 ... 0.225901 1.724769 2.715061 0.635260 2.626295e-01 NaN False False False False / 1.72 / nan
42235 chrX 17686553 17686704 other_element no_transcription no_transcription NaN NaN NaN NaN ... 0.225901 1.724769 2.715061 0.635260 2.626295e-01 NaN False False False False / 1.72 / nan
42236 chrX 17688337 17688488 putative_enhancer no_transcription transcription_initiation NaN NaN NaN NaN ... 0.989285 -1.142517 2.587083 0.000000 6.706192e-01 NaN False False False False / -1.14 / nan
42237 chrX 17694712 17694863 coding_promoter coding_promoter unknown_promoter WBGene00017625 cgt-2 protein_coding NaN ... 0.455409 2.212343 2.721586 0.812887 2.081413e-01 NaN False False False False / 2.21 / nan
42238 chrX 17695583 17695734 other_element no_transcription no_transcription NaN NaN NaN NaN ... 0.229509 1.734175 2.715167 0.638699 2.615094e-01 NaN False False False False / 1.73 / nan
42239 chrX 17697033 17697184 other_element no_transcription no_transcription NaN NaN NaN NaN ... 0.000000 NaN NaN NaN NaN NaN False False False False / nan / nan
42240 chrX 17702041 17702192 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 5.325012 -3.852652 1.847568 0.000000 9.814769e-01 1.000000 False False False False / -3.85 / 1
42241 chrX 17702679 17702830 unknown_promoter transcription_initiation unknown_promoter NaN NaN NaN NaN ... 7.917342 6.211495 2.226492 2.789812 2.636929e-03 0.018311 True True True True / 6.21 / 0.0183
42242 chrX 17714406 17714557 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 1.147543 3.241939 2.673416 1.212658 1.126303e-01 NaN False False False False / 3.24 / nan
42243 chrX 17714878 17715029 putative_enhancer transcription_initiation transcription_initiation NaN NaN NaN NaN ... 0.000000 NaN NaN NaN NaN NaN False False False False / nan / nan
42244 chrX 17718375 17718526 other_element no_transcription no_transcription NaN NaN NaN NaN ... 0.528686 -2.361539 2.722619 0.000000 8.071325e-01 NaN False False False False / -2.36 / nan

42245 rows × 335 columns


In [10]:
# Promoter-annotated sites that pass the "incr" test in at least one stage while
# passing the "jump" test in none, written as one BED file per strand.
promoter_annots_ = ['coding_promoter', 'pseudogene_promoter', 'non-coding_RNA', 'unknown_promoter']

for sfx_, strand_ in [('fwd', '+'), ('rev', '-')]:
    # Row filter: any promoter-type annotation on this strand.
    q_ = ' | '.join('annot_%s == "%s"' % (sfx_, annot_) for annot_ in promoter_annots_)

    # Per-stage test columns for this strand.
    incr_cols_ = ['lcap_incr_%s_%s' % (stage, sfx_) for stage in config['lcap_geo_by_stage']]
    jump_cols_ = ['lcap_jump_%s_%s' % (stage, sfx_) for stage in config['lcap_geo_by_stage']]

    # "incr in some stage" AND "jump in no stage".
    q_incr_ = ' | '.join('(%s)' % (col_,) for col_ in incr_cols_)
    q_njump_ = ' & '.join('(~%s)' % (col_,) for col_ in jump_cols_)
    q_incronly_ = '(%s) & (%s)' % (q_incr_, q_njump_)

    df_ = df_regl.query(q_).reset_index(drop=True).query(q_incronly_).reset_index(drop=True)
    write_gffbed(
        fp = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_%s.bed' % (sfx_,),
        chrom = df_['chrom'],
        start = df_['start'],
        end = df_['end'],
        attr = df_[incr_cols_],
        strand = strand_,
        # Red when any stage passes the incr test, blue otherwise.
        itemRgb = [yp.RED if f else yp.BLUE for f in df_[incr_cols_].any(axis=1)],
    )

# Shell escape: print the line count of each incr-only BED file (fwd, then rev).
!wc -l annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_fwd.bed
!wc -l annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_rev.bed


1200 annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_fwd.bed
1150 annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_rev.bed

In [ ]:
# For each strand, select promoter-annotated sites and write two BED files:
# one carrying the per-stage "jump" flags and one carrying the per-stage "incr" flags.
promoter_annots_ = ['coding_promoter', 'pseudogene_promoter', 'non-coding_RNA', 'unknown_promoter']

for sfx_, strand_ in [('fwd', '+'), ('rev', '-')]:
    q_ = ' | '.join('annot_%s == "%s"' % (sfx_, annot_) for annot_ in promoter_annots_)
    df_ = df_regl.query(q_).reset_index(drop=True)

    # Same write order as before: jump first, then incr.
    for test_ in ['jump', 'incr']:
        test_cols_ = ['lcap_%s_%s_%s' % (test_, stage, sfx_) for stage in config['lcap_geo_by_stage']]
        write_gffbed(
            fp = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_%s_%s.bed' % (test_, sfx_),
            chrom = df_['chrom'],
            start = df_['start'],
            end = df_['end'],
            attr = df_[test_cols_],
            strand = strand_,
            # Red when the test passes in any stage, blue otherwise.
            itemRgb = [yp.RED if f else yp.BLUE for f in df_[test_cols_].any(axis=1)],
        )

In [ ]:
# annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_jump_fwd.bed
# annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_jump_rev.bed
# annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_fwd.bed
# annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_rev.bed

# summary results
def itemRgb_(is_jump, is_incr):
    """Choose a colour from two flags.

    Returns RED when is_jump compares equal to True; otherwise YELLOW when
    is_incr is truthy; otherwise BLUE.
    """
    # NOTE(review): other cells reference yp.RED / yp.BLUE -- confirm that bare
    # RED / YELLOW / BLUE are in scope in this notebook.
    if is_jump == True:
        return RED
    return YELLOW if is_incr else BLUE

# Summary track for one strand: padj and log2FoldChange columns of the first six
# stages, coloured by itemRgb_ from the per-region "any jump" / "any incr" flags.
# NOTE(review): df_jump and df_incr are not defined in the visible cells -- confirm
# they exist (and are row-aligned with df_) before running this cell.
strand = 'fwd'
l_stage_ = config['lcap_geo_by_stage'][:6]
write_gffbed(
    fp = 'annot/S2_regulatory_annotation/_lcap_jump/lcap_tests_wt_%(strand)s.bed' % locals(),
    chrom = df_['chrom'],
    start = df_['start'],
    end = df_['end'],
    attr = df_[
        ['lcap_%s_%s_padj' % (stage, strand) for stage in l_stage_] +
        ['lcap_%s_%s_log2FoldChange' % (stage, strand) for stage in l_stage_]
    ],
    strand = '+',
    # Materialise the colours as a list, as the other write_gffbed calls in this
    # notebook do; a bare map() object is a one-shot iterator without a length.
    itemRgb = list(map(itemRgb_,
        df_jump[['lcap_jump_%s_%s' % (stage, strand) for stage in l_stage_]].any(axis=1),
        df_incr[['lcap_incr_%s_%s' % (stage, strand) for stage in l_stage_]].any(axis=1),
    )),
)

In [22]:
# Build one table of coding promoters (both strands) and attach per-gene operon columns.

# Regions annotated as coding promoters on the forward / reverse strand
df_prom_fwd = df_regl.query('(annot_fwd == "coding_promoter")')
df_prom_rev = df_regl.query('(annot_rev == "coding_promoter")')

# Keep the coordinates plus the strand-specific TSS / gene columns
df_prom_fwd = df_prom_fwd[['chrom', 'start', 'end', 'tss_fwd', 'promoter_gene_id_fwd', 'promoter_locus_id_fwd']].copy()
df_prom_rev = df_prom_rev[['chrom', 'start', 'end', 'tss_rev', 'promoter_gene_id_rev', 'promoter_locus_id_rev']].copy()

df_prom_fwd['strand'] = '+'
df_prom_rev['strand'] = '-'

# Give both strands the same strand-agnostic column names so they can be stacked
df_prom_fwd.columns = ['chrom', 'start', 'end', 'tss', 'gene_id', 'locus_id', 'strand']
df_prom_rev.columns = ['chrom', 'start', 'end', 'tss', 'gene_id', 'locus_id', 'strand']

print(len(df_prom_fwd), 'fwd promoters')
print(len(df_prom_rev), 'rev promoters')

df_prom = pd.concat([df_prom_fwd, df_prom_rev], axis=0).reset_index(drop=True)
print(len(df_prom), 'total (coding) promtoters')

df_gene = pd.read_csv('WS260_ce10/WS260_ce10.genes_by_CV.tsv', sep='\t')

# Attach operon columns by gene_id with a single left merge.
# Assigning columns of an inner-merge result back onto df_prom aligns by row index,
# which silently mismatches rows whenever the merge drops or duplicates a promoter.
# A left merge against a gene table that is unique on gene_id returns exactly one row
# per promoter, in the original order; promoters without a match get NaN.
df_operon_ = df_gene[['gene_id', 'operon_rank', 'operon_id']].drop_duplicates(subset='gene_id')
n_prom_ = len(df_prom)
df_prom = df_prom.merge(df_operon_, how='left', on='gene_id')
assert len(df_prom) == n_prom_, 'merge changed the number of promoters'
print(df_prom['operon_rank'].isnull().sum(), 'promoters without a matching gene_id in df_gene')
df_prom


8047 fwd promoters
7871 rev promoters
15918 total (coding) promtoters
Out[22]:
chrom start end tss gene_id locus_id strand operon_rank operon_id
0 chrI 11245 11465 11285 WBGene00022276 nlp-40 + 0 .
1 chrI 43019 43148 43170 WBGene00022275 Y74C9A.1 + 0 .
2 chrI 46648 46828 46772 WBGene00044345 Y48G1C.12 + 1 CEOP1971
3 chrI 47075 47182 47177 WBGene00044345 Y48G1C.12 + 1 CEOP1971
4 chrI 70045 70195 70213 WBGene00000812 csk-1 + 1 CEOP1940
5 chrI 72364 72504 72508 WBGene00000812 csk-1 + 1 CEOP1940
6 chrI 92900 93001 93028 WBGene00021676 Y48G1C.1 + 0 .
7 chrI 110115 110273 110249 WBGene00004418 rpl-7 + 0 .
8 chrI 110343 110467 110450 WBGene00004418 rpl-7 + 0 .
9 chrI 110654 110768 110783 WBGene00004418 rpl-7 + 0 .
10 chrI 122360 122612 122534 WBGene00003229 mex-3 + 0 .
11 chrI 123883 124079 123969 WBGene00003229 mex-3 + 0 .
12 chrI 126640 126775 126790 WBGene00003229 mex-3 + 0 .
13 chrI 128257 128365 128330 WBGene00003229 mex-3 + 0 .
14 chrI 128428 128536 128507 WBGene00003229 mex-3 + 0 .
15 chrI 128712 128793 128687 WBGene00003229 mex-3 + 0 .
16 chrI 169723 169899 169851 WBGene00018958 F56C11.6 + 0 .
17 chrI 171140 171254 171205 WBGene00018958 F56C11.6 + 0 .
18 chrI 172389 172514 172539 WBGene00018958 F56C11.6 + 0 .
19 chrI 173219 173349 173294 WBGene00018958 F56C11.6 + 0 .
20 chrI 177367 177508 177500 WBGene00018957 F56C11.5 + 0 .
21 chrI 182951 183055 183042 WBGene00000227 atm-1 + 0 .
22 chrI 183267 183419 183319 WBGene00000227 atm-1 + 0 .
23 chrI 188554 188732 188715 WBGene00000227 atm-1 + 0 .
24 chrI 215703 215781 215802 WBGene00021662 snpc-3.3 + 0 .
25 chrI 215855 215972 216004 WBGene00021662 snpc-3.3 + 0 .
26 chrI 217734 217909 217875 WBGene00255594 Y48G1BL.8 + 0 .
27 chrI 251772 251868 251880 WBGene00021667 snpc-3.2 + 0 .
28 chrI 251918 252058 252051 WBGene00021667 snpc-3.2 + 0 .
29 chrI 279908 280029 280020 WBGene00016904 C53D5.3 + 0 .
... ... ... ... ... ... ... ... ... ...
15888 chrX 17235818 17235933 17235794 WBGene00000993 dhs-30 - 0 .
15889 chrX 17236276 17236403 17236317 WBGene00000993 dhs-30 - 0 .
15890 chrX 17237710 17237891 17237777 WBGene00020817 T25G12.11 - 0 .
15891 chrX 17239235 17239376 17239219 WBGene00020817 T25G12.11 - 0 .
15892 chrX 17256830 17257010 17256930 WBGene00021500 Y40C7B.4 - 0 .
15893 chrX 17301903 17302020 17301937 WBGene00016350 C33E10.10 - 0 .
15894 chrX 17368899 17369024 17368916 WBGene00017339 F10D7.2 - 0 .
15895 chrX 17369612 17369764 17369607 WBGene00017339 F10D7.2 - 0 .
15896 chrX 17374702 17374964 17374920 WBGene00017340 F10D7.3 - 0 .
15897 chrX 17379600 17379816 17379701 WBGene00271701 F10D7.10 - 0 .
15898 chrX 17390620 17390795 17390650 WBGene00017342 F10D7.5 - 0 .
15899 chrX 17392218 17392339 17392236 WBGene00017342 F10D7.5 - 1 CEOPX161
15900 chrX 17413875 17414007 17413890 WBGene00000792 crb-1 - 1 CEOPX161
15901 chrX 17421356 17421442 17421392 WBGene00003891 osm-11 - 0 .
15902 chrX 17421611 17421960 17421852 WBGene00003891 osm-11 - 0 .
15903 chrX 17428867 17429016 17428881 WBGene00003891 osm-11 - 0 .
15904 chrX 17456691 17456984 17456873 WBGene00003370 mlc-2 - 0 .
15905 chrX 17464011 17464120 17464029 WBGene00022647 slc-17.1 - 0 .
15906 chrX 17464831 17464967 17464837 WBGene00022647 slc-17.1 - 0 .
15907 chrX 17466649 17466780 17466668 WBGene00022648 ZK54.3 - 0 .
15908 chrX 17474518 17474665 17474502 WBGene00006602 tps-1 - 0 .
15909 chrX 17476291 17476403 17476363 WBGene00006602 tps-1 - 0 .
15910 chrX 17476504 17476602 17476493 WBGene00006602 tps-1 - 0 .
15911 chrX 17534135 17534291 17534174 WBGene00017938 F31A3.3 - 0 .
15912 chrX 17606435 17606623 17606544 WBGene00015868 C16H3.3 - 0 .
15913 chrX 17610680 17610802 17610719 WBGene00015868 C16H3.3 - 0 .
15914 chrX 17633162 17633380 17633284 WBGene00019211 H18N23.2 - 0 .
15915 chrX 17640771 17640950 17640770 WBGene00019211 H18N23.2 - 0 .
15916 chrX 17684796 17684998 17684846 WBGene00044772 T23E7.6 - 0 .
15917 chrX 17685924 17686120 17685987 WBGene00044772 T23E7.6 - 0 .

15918 rows × 9 columns


In [23]:
df_hybrid = df_prom.query('operon_rank > 1').sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
fp_ = 'annot/S2_regulatory_annotation/S2d_promoters_within_operons.tsv'
df_hybrid.to_csv(fp_, header=True, index=False, sep='\t')
!wc -l {fp_}

fp_ = 'annot/S2_regulatory_annotation/S2d_promoters_within_operons.bed'
write_gffbed(fp_,
    chrom = df_hybrid['chrom'],
    start = df_hybrid['start'],
    end = df_hybrid['end'],
    name = df_hybrid['locus_id'],
    strand = df_hybrid['strand'],
    attr = df_hybrid[['tss', 'gene_id', 'locus_id', 'operon_rank', 'operon_id']],
)
!wc -l {fp_}


678 annot/S2_regulatory_annotation/S2d_promoters_within_operons.tsv
678 annot/S2_regulatory_annotation/S2d_promoters_within_operons.bed

In [ ]:
# Position-sorted copy of the promoter table, saved as a tab-separated file with a header row
fp_ = 'annot/S2_regulatory_annotation/S2b_promoter_annotation.tsv'
df_ = df_prom.sort_values(by=['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
df_.to_csv(fp_, sep='\t', index=False, header=True)

In [ ]:
# BED export of df_: features are named by locus_id, with tss / gene_id / locus_id
# passed as the attribute columns
fp_ = 'annot/S2_regulatory_annotation/S2b_promoter_annotation.bed'
l_attr_prom_ = ['tss', 'gene_id', 'locus_id']
write_gffbed(fp_,
    chrom = df_['chrom'],
    start = df_['start'],
    end = df_['end'],
    name = df_['locus_id'],
    strand = df_['strand'],
    attr = df_[l_attr_prom_],
)

In [11]:
# Annotate current annotation by date
l_fp_inp = [
    #'annot/S2_regulatory_annotation/S2_regulatory_annotation.bed',
    #'annot/S2_regulatory_annotation/S2_regulatory_annotation.tsv',
    #'annot/S2_regulatory_annotation/S2b_promoter_annotation.bed',
    #'annot/S2_regulatory_annotation/S2b_promoter_annotation.tsv',
    #'annot/S2_regulatory_annotation/S2c_outron-extended_genes.bed',
    #'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_fwd.bed',
    #'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_jump_fwd.bed',
    #'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_rev.bed',
    #'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_jump_rev.bed',
    'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_fwd.bed',
    'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_rev.bed',
]

date_ = "6Dec17"
for fp_inp in l_fp_inp:
    #print(fp_inp)
    #!git rm --cached {fp_inp}

    fp_out = '%s_%s.%s' % (fp_inp[:-4], date_, fp_inp[-3:])
    !cp {fp_inp} {fp_out}
    !git add {fp_out}

In [ ]: