In [1]:
from pathlib import Path
import pybedtools
import pandas as pd
import numpy as np
import json
This notebook documents the process for extending the Dorina regulators database. Dorina depends on the correct naming of both the bed file and the bed records (the rows in the bed file). Dorina filters most bed files by name, and if the name of the file and the bed record do not match, there will be problems. Please see dorina.regulator.py line 80 for implementation details.
Also, webdorina depends on a specific parsing to process the regulator site. This specification is:
'{experiment}#{reg_name}_{assembly}*{reg_name}'
PARCLIP#TAF15_hg19*TAF15
This naming scheme keeps the database consistent with the old data. It may not be optimal for every experiment, but the database contains various types of experiments.
In [2]:
def change_name(rec, new_name=""):
rec.name = new_name
return rec
def change_name_targetscan(rec):
"""Changes the names of each entry in the targetScan bedfile
Default name example:
KLHL17:miR-299-3p
"""
rec.name = rec.name[ rec.name.find(':') + 1 :] + "|TargetScan"
return rec
def change_name_fehlmannetal2017(rec):
    """Copy the record's name into field index 2 of the entry.

    Used with the Fehlmann et al. 2017 GFF3 file, where the third column
    (index 2) is overwritten with the record name so downstream tools see
    the regulator name in that field.

    Note: the original docstring was a copy-paste from the TargetScan
    helper and did not describe this function's behavior.
    """
    rec[2] = rec.name
    return rec
Supplementary material was obtained from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4142047/bin/NIHMS598961-supplement-02.xlsx
In [38]:
# Load the RBFOX2 Bind-n-Seq enrichment table from the Lambert et al.
# supplementary workbook (one sheet per RBP).
# FIX: the `sheetname` keyword was deprecated in pandas 0.21 and removed in
# pandas 1.0; the current keyword is `sheet_name`.
lambert_RBFOX2 = pd.read_excel('/Volumes/prj/dorina2/raw/NIHMS598961-supplement-02.xlsx',
                               header=3, sheet_name='RBFOX2', skiprows=1)
# Other RBP sheets available in the same workbook (unused below):
# lambert_CELF1 = pd.read_excel('/Volumes/prj/dorina2/raw/NIHMS598961-supplement-02.xlsx',
#                               header=3, sheet_name='CELF1')
# lambert_MBNL1 = pd.read_excel('/Volumes/prj/dorina2/raw/NIHMS598961-supplement-02.xlsx',
#                               header=3, sheet_name='MBNL1')
# Index by kmer and drop the leftover unnamed Excel index column.
lambert_RBFOX2 = lambert_RBFOX2\
    .set_index('kmer', drop=True)\
    .drop('Unnamed: 0', axis=1)
In [43]:
# Column labels — presumably the protein concentrations (nM) of the
# Bind-n-Seq assay; TODO confirm against the supplementary legend.
lambert_RBFOX2.columns = '0 1.5 4.5 14 40 121 365 1100 3300 9800'.split()
In [60]:
lambert_RBFOX2.shape
Out[60]:
In [62]:
# Per-kmer enrichment threshold: row-wise mean plus two standard deviations.
mean2sd = lambert_RBFOX2.mean(axis=1) + lambert_RBFOX2.std(axis=1) * 2
In [80]:
# A kmer is significant if it exceeds its own threshold in any column.
sig_motifs = lambert_RBFOX2.gt(mean2sd, axis='rows').any(axis=1)
In [90]:
# Write the background set (all kmers) and the significant RBFOX2 kmers,
# one per line, as input for sites2meme (next cell).
with open('/Volumes/tbrittoborges/bindndseek/bg.txt', 'w') as fou:
    fou.write("\n".join(lambert_RBFOX2.index))
with open('/Volumes/tbrittoborges/bindndseek/RBFOX2.txt', 'w') as fou:
    fou.write("\n".join(sig_motifs[sig_motifs].index))
In [ ]:
%%bash
# Build a MEME motif file from the significant kmer sites and scan the
# GRCh38 genome with FIMO.
cd /Volumes/tbrittoborges/sites2meme
module load meme
sites2meme bindndseek/ > bindndseek/RBFOX2.meme
fimo bindndseek/RBFOX2.meme /biodb/genomes/homo_sapiens/GRCh38_90/GRCh38_90.fa
# Direct pattern match for TGCATG — presumably the core RBFOX motif;
# TODO confirm this is intended as a cross-check of the FIMO scan.
fasta-grep TGCATG -dna -p < /biodb/genomes/homo_sapiens/GRCh38_90/GRCh38_90.fa
In [5]:
%%script false
# Cell intentionally disabled (%%script false): this preprocessing was run
# on the compute cluster and is kept here for provenance only.
# this one was processed in the computer cluster
# NOTE(review): `!cd` runs in a subshell, so the directory change would not
# persist across lines even if this cell were enabled.
!cd /biodb/gtex/
import pandas as pd
rnaseq = 'GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct.gz'
sample = 'GTEx_v7_Annotations_SampleAttributesDS.txt'
samples = pd.read_table(sample)
gtex = pd.read_table(rnaseq, skiprows=2, compression='gzip', header=0, )
# Map sample ids (SAMPID) to tissue names (SMTS).
samples_mapping = dict(zip(samples['SAMPID'], samples['SMTS']))
gtex.set_index('Name', inplace=True)
gtex.drop(columns=['Description'], inplace=True)
gtex = gtex.rename(columns=samples_mapping)
gtex = gtex.T  # transpose: rows become tissues, columns Ensembl gene ids
# Strip the Ensembl version suffix (e.g. ENSG000001.5 -> ENSG000001).
names = [x.split('.')[0] for x in gtex.columns]
# alternative solution for id mapping
import numpy as np
from bioservices import BioDBNet
bioservice = BioDBNet()
input_db = "Ensembl Gene ID"
output_db = ["Gene Symbol"]
input_values = ['ENSG00000214688']
# Query BioDBNet in chunks (taxon 9606 = human) to map Ensembl ids to symbols.
xref = [bioservice.db2db(input_db, output_db, list(x), 9606) for x in np.array_split(names, 562)]
xref = pd.concat(xref)
In [3]:
# Post-processed GTEx v7 expression matrix produced by the (disabled)
# cluster cell above; index is tissue, columns are versioned gene ids.
gtex = pd.read_csv('/Volumes/tbrittoborges-1/gtex_posprocessed2.csv',
                   index_col=0, header=0)
In [4]:
# Collapse replicate rows of the same tissue to the median expression.
gtex = gtex.groupby(level=0).median()
In [5]:
gtex.head()
Out[5]:
In [7]:
# Count of genes expressed above 1 (presumably TPM, per the source file
# name — TODO confirm) in heart tissue.
gtex.loc['Heart'][gtex.loc['Heart'] > 1].shape
Out[7]:
Out[7]:
In [8]:
# Ensembl-to-HGNC identifier mapping table (semicolon-separated CSV).
id_map = pd.read_csv('/Volumes/tbrittoborges-1/gtexv7_id_mapping.csv', header=0, index_col=0,sep=';')
In [9]:
# Series mapping ensembl_id -> hgnc_symbol, for column renaming below.
mapping = id_map.set_index(['ensembl_id'])['hgnc_symbol']
In [10]:
# NOTE(review): `names` is defined in the disabled `%%script false` cluster
# cell above, so this line fails on a fresh Restart & Run All.
mapped_names = mapping[names]
In [11]:
mapped_names.head()
Out[11]:
In [13]:
# Strip the Ensembl version suffix from the column names.
gtex.columns = [x[:x.index('.')] for x in gtex.columns]
gtex.head()
Out[13]:
Out[13]:
In [15]:
# mapped_names.fillna(value={k:k for k in mapped_names.index}, inplace=True)
In [14]:
# Rename Ensembl gene ids to HGNC symbols; ids without a symbol in the
# mapping become NaN column labels.
gtex.rename(columns=mapped_names, inplace=True)
In [15]:
gtex[gtex.columns.dropna()].head()
Out[15]:
In [25]:
# Keep only the columns that mapped to a gene symbol.
gtex_clean = gtex[gtex.columns.dropna()]
In [34]:
# gtex.loc['Heart'][gtex.loc['Heart'] > 1].index.tolist()
In [36]:
# NOTE(review): this displays the entire masked frame; prefer .head() or a
# summary to keep the notebook output small.
gtex_clean[gtex_clean > 1]
Out[36]:
Out[36]:
In [ ]:
In [13]:
!head /Volumes/prj/dorina2/raw/Fehlmann2017etal.gff3
In [54]:
bt = pybedtools.BedTool('/Volumes/prj/dorina2/raw/Fehlmann2017etal.gff3')
bt_new = bt.each(change_name_fehlmannetal2017).saveas()
'/Volumes/prj/dorina2/regulators/h_sapiens/hg38/targetScan_miRNA_hg38.bed'
for entry in bt:
entry[2] = entry.name
In [55]:
bt.head()
Add targetScan prediction from http://www.targetscan.org/vert_72/vert_72_data_download/Predicted_Target_Locations.default_predictions.hg19.bed.zip
Data was lifted over to hg38 with CrossMap:
conda activate crossmap
python2 /home/tbrittoborges/bin/miniconda3/envs/crossmap/bin/CrossMap.py bed /prj/dorina2/crossmap/hg19ToHg38.over.chain Predicted_Target_Locations.default_predictions.hg19.bed targetScan_hg38.bed
In [3]:
!head /Volumes/prj/dorina2/raw/targetScan_hg38.bed
In [4]:
# Rename the TargetScan hg38 records (keep the miRNA part of the name and
# append "|TargetScan"), then move the renamed file into the Dorina
# regulators tree.
bt = pybedtools.BedTool('/Volumes/prj/dorina2/raw/targetScan_hg38.bed')
bt_new = bt.each(change_name_targetscan).saveas()
bt_new.moveto(
    '/Volumes/prj/dorina2/regulators/h_sapiens/hg38/targetScan_miRNA_hg38.bed')
Out[4]:
In [5]:
# Reload the renamed file and collect one name per predicted site; names
# repeat once per site, so they can be counted per regulator below.
bt = pybedtools.BedTool('/Volumes/prj/dorina2/regulators/h_sapiens/hg38/targetScan_miRNA_hg38.bed')
regulators = [entry.name for entry in bt]
In [6]:
len(regulators)
Out[6]:
In [7]:
json_path = '/Volumes/prj/dorina2/regulators/h_sapiens/hg38/targetScan_miRNA_hg38.json'
# Count sites per regulator in one pass; the original called list.count
# inside a loop over set(regulators), which is O(n*m).
from collections import Counter
site_counts = Counter(regulators)
json_ = []
# sorted() makes the JSON output order deterministic between runs.
for regulator, n_sites in sorted(site_counts.items()):
    json_.append(
        {
            # FIX: adjacent string literals concatenate directly; the
            # original had no separator after {regulator}, producing
            # "...regulated by miR-Xpredictions were obtained...".
            "description": f'{n_sites} sites predicted as regulated by {regulator}. '
                           'Predictions were obtained from TargetScan 7.2 from March 2018. '
                           'The original data, mapped to hg19 assembly, was lift over with Crossmap.',
            "experiment": 'TargetScan',
            "id": regulator,
            "references": 'Agarwal, Vikram, et al. Predicting effective microRNA '
                          'target sites in mammalian mRNAs. elife 4 (2015).',
            "summary": regulator
        }
    )
with open(json_path, 'w') as fout:
    json.dump(json_, fout, indent=2)  # was indent=True, i.e. a 1-space indent
In [11]:
bt = pybedtools.BedTool('/Volumes/prj/dorina2/raw/Predicted_Targets.mm10.bed')
bt_new = bt.each(change_name_targetscan).saveas()
regulators = [entry.name for entry in bt_new]
bt_new.moveto(
'/Volumes/prj/dorina2/regulators/m_musculus/mm10/targetScan_miRNA_mm10.bed')
Out[11]:
In [12]:
json_path = '/Volumes/prj/dorina2/regulators/m_musculus/mm10/targetScan_miRNA_mm10.json'
json_ = []
for regulator in set(regulators):
n_sites = regulators.count(regulator)
json_.append(
{
"description": f'{n_sites} sites predicted as regulated by {regulator}'
'predictions were obtained from TargetScan 7.1 from March 2018. '
'The original data is mapped to mm10 assembly.',
"experiment": 'TargetScan',
"id": regulator,
"references": 'Agarwal, Vikram, et al. Predicting effective microRNA '
'target sites in mammalian mRNAs. elife 4 (2015).',
"summary": regulator
}
)
with open(json_path, 'w') as fout:
json.dump(json_, fout, indent=True)
In [5]:
for cell in ('K562', 'HepG2'):
    path = Path(f'/Volumes/biodb/encode/encode_hg38_clip_peaks/{cell}/replicates/')
    bed_files = path.glob('*.bed')
    # FIX: each RBP has one file per replicate (rep01/rep02), so the original
    # list contained every regulator multiple times and processed each pair
    # once per replicate file.  Dedupe (and sort for determinism).
    regulators = sorted({x.stem[: x.stem.find('_')] for x in bed_files})
    for regulator in regulators:
        bt1 = pybedtools.BedTool(str(path / f'{regulator}_{cell}_rep01.bed'))
        bt2 = pybedtools.BedTool(str(path / f'{regulator}_{cell}_rep02.bed'))
        # Reproducible peaks: strand-aware (s=True) intersection of the two
        # replicates, plus the union of both replicates for score lookup.
        bt_inter = bt1.intersect(bt2, s=True).sort()
        bt_merged = bt1.cat(bt2, postmerge=False).sort()
        # Annotate each intersected peak with the median score (column 5)
        # over the overlapping replicate peaks, then coerce to BED6.
        bt_inter = bt_inter.map(b=bt_merged, c='5', s=True, o='median').bed6()
        new_name = f'{cell}-{regulator}|eClip'
        bt_inter.each(change_name, new_name=new_name).saveas().moveto(
            f'/Volumes/tbrittoborges/dorina_eclip/eClip_{cell}_{regulator}_vanNostrand2016_hg38.bed')
In [6]:
# Concatenate all per-RBP bed files into the single Dorina regulator file.
!cat /Volumes/tbrittoborges/dorina_eclip/* > /Volumes/prj/dorina2/regulators/h_sapiens/hg38/eClip_RBP_hg38.bed
In [7]:
# Inspect an existing regulator JSON to match its schema in the next cell.
pd.read_json('/Volumes/prj/dorina2/regulators/h_sapiens/hg38/TargetScanCons_mirna_hg19.json').head()
Out[7]:
In [8]:
eclip_json = []
for cell in ('K562', 'HepG2'):
    path = Path(f'/Volumes/biodb/encode/encode_hg38_clip_peaks/{cell}/replicates/')
    bed_files = path.glob('*.bed')
    # FIX: each RBP has one file per replicate, so the original appended a
    # duplicate JSON entry for every regulator.  Dedupe and sort.
    regulators = sorted({x.stem[: x.stem.find('_')] for x in bed_files})
    for regulator in regulators:
        # Number of reproducible peaks written by the intersection cell above.
        n_sites = len(pybedtools.BedTool(
            f'/Volumes/tbrittoborges/dorina_eclip/eClip_{cell}_{regulator}_vanNostrand2016_hg38.bed'))
        eclip_json.append(
            {
                # FIX: the original description read "sites regulated by
                # {n_sites} obtained from..." and, because adjacent literals
                # concatenate, lacked a space before 'The original data'.
                "description": f'{n_sites} sites regulated by {regulator} obtained from {cell} cells. '
                               'The duplicates were merged with intersect and the median score is presented. '
                               'The original data was mapped to hg38 assembly.',
                "experiment": 'eClip',
                "id": f'{cell}-{regulator}|eClip',
                "references": 'Van Nostrand, Eric L., et al. Robust transcriptome-wide discovery '
                              'of RNA-binding protein binding sites with enhanced CLIP (eCLIP). Nature methods '
                              '13.6 (2016): 508.',
                "summary": f'{cell}-{regulator}|eClip'
            }
        )
In [9]:
# Persist the eClip regulator metadata next to the bed file.
with open('/Volumes/prj/dorina2/regulators/h_sapiens/hg38/eClip_RBP_hg38.json', 'w') as fout:
    json.dump(eclip_json, fout, indent=True)
In [10]:
# Sanity check: the written JSON round-trips through pandas.
pd.read_json('/Volumes/prj/dorina2/regulators/h_sapiens/hg38/eClip_RBP_hg38.json').head()
Out[10]:
In [3]:
bed_files = Path('/Volumes/prj/dorina2/regulators/m_musculus/mm10/').glob('*.bed')
In [5]:
# Rewrite every record name in each mm10 regulator file to the file stem
# plus '*', matching the webdorina parsing convention described at the top.
# NOTE(review): `bed_files` is a generator created in the previous cell; it
# is exhausted after one pass, so re-running only this cell silently does
# nothing.
# NOTE(review): this gives every record in a file the same name (stem + '*'
# with no per-record suffix) — confirm this is the intended format.
for bed_file in bed_files:
    bt = pybedtools.BedTool(str(bed_file))
    new_name = bed_file.stem + '*'
    bt.each(change_name, new_name=new_name).saveas().moveto(str(bed_file))
**TODO**: there are still problems with the PicTar files.
In [ ]: