In [1]:
from pathlib import Path
import pybedtools
import pandas as pd
import numpy as np
import json
This notebook documents the process for extending the Dorina regulators database. Dorina depends on the correct naming of both the bed file and the bed records (the rows in the bed file). Dorina filters most bed files by name, and if the name of the file and the bed record do not match, there will be problems. Please see dorina.regulator.py line 80 for implementation details.
Also, webdorina depends on a specific parsing to process the regulator site. This specification is:
'{experiment}#{reg_name}_{assembly}*{reg_name}'
PARCLIP#TAF15_hg19*TAF15
This naming scheme keeps the database consistent with the old data. It may not be optimal for every experiment, but the database contains various types of experiments.
In [2]:
def change_name(rec, new_name=""):
rec.name = new_name
return rec
def change_name_targetscan(rec):
"""Changes the names of each entry in the targetScan bedfile
Default name example:
KLHL17:miR-299-3p
"""
rec.name = rec.name[ rec.name.find(':') + 1 :] + "|TargetScan"
return rec
def change_name_fehlmannetal2017(rec):
    """Copy the record's name into field index 2 of the entry.

    Used with the Fehlmann et al. 2017 GFF3 file, where the third column
    (index 2) is overwritten with the record name so downstream tools see
    the regulator name in that field.

    Note: the original docstring was a copy-paste from the TargetScan
    helper and did not describe this function's behavior.
    """
    rec[2] = rec.name
    return rec
Supplementary material was obtained from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4142047/bin/NIHMS598961-supplement-02.xlsx
In [38]:
# Load the RBFOX2 Bind-n-Seq enrichment table from the Lambert et al.
# supplementary workbook (one sheet per RBP).
# FIX: the `sheetname` keyword was deprecated in pandas 0.21 and removed in
# pandas 1.0; the current keyword is `sheet_name`.
lambert_RBFOX2 = pd.read_excel('/Volumes/prj/dorina2/raw/NIHMS598961-supplement-02.xlsx',
                               header=3, sheet_name='RBFOX2', skiprows=1)
# Other RBP sheets available in the same workbook (unused below):
# lambert_CELF1 = pd.read_excel('/Volumes/prj/dorina2/raw/NIHMS598961-supplement-02.xlsx',
#                               header=3, sheet_name='CELF1')
# lambert_MBNL1 = pd.read_excel('/Volumes/prj/dorina2/raw/NIHMS598961-supplement-02.xlsx',
#                               header=3, sheet_name='MBNL1')
# Index by kmer and drop the leftover unnamed Excel index column.
lambert_RBFOX2 = lambert_RBFOX2\
    .set_index('kmer', drop=True)\
    .drop('Unnamed: 0', axis=1)
In [43]:
# Column labels — presumably the protein concentrations (nM) of the
# Bind-n-Seq assay; TODO confirm against the supplementary legend.
lambert_RBFOX2.columns = '0 1.5 4.5 14 40 121 365 1100 3300 9800'.split()
In [60]:
lambert_RBFOX2.shape
Out[60]:
In [62]:
# Per-kmer enrichment threshold: row-wise mean plus two standard deviations.
mean2sd = lambert_RBFOX2.mean(axis=1) + lambert_RBFOX2.std(axis=1) * 2
In [80]:
# A kmer is significant if it exceeds its own threshold in any column.
sig_motifs = lambert_RBFOX2.gt(mean2sd, axis='rows').any(axis=1)
In [90]:
# Write the background set (all kmers) and the significant RBFOX2 kmers,
# one per line, as input for sites2meme (next cell).
with open('/Volumes/tbrittoborges/bindndseek/bg.txt', 'w') as fou:
    fou.write("\n".join(lambert_RBFOX2.index))
with open('/Volumes/tbrittoborges/bindndseek/RBFOX2.txt', 'w') as fou:
    fou.write("\n".join(sig_motifs[sig_motifs].index))
In [ ]:
%%bash
# Build a MEME motif file from the significant kmer sites and scan the
# GRCh38 genome with FIMO.
cd /Volumes/tbrittoborges/sites2meme
module load meme
sites2meme bindndseek/ > bindndseek/RBFOX2.meme
fimo bindndseek/RBFOX2.meme /biodb/genomes/homo_sapiens/GRCh38_90/GRCh38_90.fa
# Direct pattern match for TGCATG — presumably the core RBFOX motif;
# TODO confirm this is intended as a cross-check of the FIMO scan.
fasta-grep TGCATG -dna -p < /biodb/genomes/homo_sapiens/GRCh38_90/GRCh38_90.fa
In [5]:
%%script false
# Cell intentionally disabled (%%script false): this preprocessing was run
# on the compute cluster and is kept here for provenance only.
# this one was processed in the computer cluster
# NOTE(review): `!cd` runs in a subshell, so the directory change would not
# persist across lines even if this cell were enabled.
!cd /biodb/gtex/
import pandas as pd
rnaseq = 'GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct.gz'
sample = 'GTEx_v7_Annotations_SampleAttributesDS.txt'
samples = pd.read_table(sample)
gtex = pd.read_table(rnaseq, skiprows=2, compression='gzip', header=0, )
# Map sample ids (SAMPID) to tissue names (SMTS).
samples_mapping = dict(zip(samples['SAMPID'], samples['SMTS']))
gtex.set_index('Name', inplace=True)
gtex.drop(columns=['Description'], inplace=True)
gtex = gtex.rename(columns=samples_mapping)
gtex = gtex.T  # transpose: rows become tissues, columns Ensembl gene ids
# Strip the Ensembl version suffix (e.g. ENSG000001.5 -> ENSG000001).
names = [x.split('.')[0] for x in gtex.columns]
# alternative solution for id mapping
import numpy as np
from bioservices import BioDBNet
bioservice = BioDBNet()
input_db = "Ensembl Gene ID"
output_db = ["Gene Symbol"]
input_values = ['ENSG00000214688']
# Query BioDBNet in chunks (taxon 9606 = human) to map Ensembl ids to symbols.
xref = [bioservice.db2db(input_db, output_db, list(x), 9606) for x in np.array_split(names, 562)]
xref = pd.concat(xref)
In [3]:
# Post-processed GTEx v7 expression matrix produced by the (disabled)
# cluster cell above; index is tissue, columns are versioned gene ids.
gtex = pd.read_csv('/Volumes/tbrittoborges-1/gtex_posprocessed2.csv',
                   index_col=0, header=0)
In [4]:
# Collapse replicate rows of the same tissue to the median expression.
gtex = gtex.groupby(level=0).median()
In [5]:
gtex.head()
Out[5]:
In [7]:
# Count of genes expressed above 1 (presumably TPM, per the source file
# name — TODO confirm) in heart tissue.
gtex.loc['Heart'][gtex.loc['Heart'] > 1].shape
Out[7]:
Out[7]:
In [8]:
# Ensembl-to-HGNC identifier mapping table (semicolon-separated CSV).
id_map = pd.read_csv('/Volumes/tbrittoborges-1/gtexv7_id_mapping.csv', header=0, index_col=0,sep=';')
In [9]:
# Series mapping ensembl_id -> hgnc_symbol, for column renaming below.
mapping = id_map.set_index(['ensembl_id'])['hgnc_symbol']
In [10]:
# NOTE(review): `names` is defined in the disabled `%%script false` cluster
# cell above, so this line fails on a fresh Restart & Run All.
mapped_names = mapping[names]
In [11]:
mapped_names.head()
Out[11]:
In [13]:
# Strip the Ensembl version suffix from the column names.
gtex.columns = [x[:x.index('.')] for x in gtex.columns]
gtex.head()
Out[13]:
Out[13]:
In [15]:
# mapped_names.fillna(value={k:k for k in mapped_names.index}, inplace=True)
In [14]:
# Rename Ensembl gene ids to HGNC symbols; ids without a symbol in the
# mapping become NaN column labels.
gtex.rename(columns=mapped_names, inplace=True)
In [15]:
gtex[gtex.columns.dropna()].head()
Out[15]:
In [25]:
# Keep only the columns that mapped to a gene symbol.
gtex_clean = gtex[gtex.columns.dropna()]
In [34]:
# gtex.loc['Heart'][gtex.loc['Heart'] > 1].index.tolist()
In [36]:
# NOTE(review): this displays the entire masked frame; prefer .head() or a
# summary to keep the notebook output small.
gtex_clean[gtex_clean > 1]
Out[36]:
Out[36]:
In [ ]:
In [13]:
!head /Volumes/prj/dorina2/raw/Fehlmann2017etal.gff3
In [54]:
bt = pybedtools.BedTool('/Volumes/prj/dorina2/raw/Fehlmann2017etal.gff3')
bt_new = bt.each(change_name_fehlmannetal2017).saveas()
'/Volumes/prj/dorina2/regulators/h_sapiens/hg38/targetScan_miRNA_hg38.bed'
for entry in bt:
entry[2] = entry.name
In [55]:
bt.head()
Add targetScan prediction from http://www.targetscan.org/vert_72/vert_72_data_download/Predicted_Target_Locations.default_predictions.hg19.bed.zip
Data was lifted over to hg38 with CrossMap:
conda activate crossmap
python2 /home/tbrittoborges/bin/miniconda3/envs/crossmap/bin/CrossMap.py bed /prj/dorina2/crossmap/hg19ToHg38.over.chain Predicted_Target_Locations.default_predictions.hg19.bed targetScan_hg38.bed
In [3]:
!head /Volumes/prj/dorina2/raw/targetScan_hg38.bed
In [4]:
# Rename the TargetScan hg38 records (keep the miRNA part of the name and
# append "|TargetScan"), then move the renamed file into the Dorina
# regulators tree.
bt = pybedtools.BedTool('/Volumes/prj/dorina2/raw/targetScan_hg38.bed')
bt_new = bt.each(change_name_targetscan).saveas()
bt_new.moveto(
    '/Volumes/prj/dorina2/regulators/h_sapiens/hg38/targetScan_miRNA_hg38.bed')
Out[4]:
In [5]:
# Reload the renamed file and collect one name per predicted site; names
# repeat once per site, so they can be counted per regulator below.
bt = pybedtools.BedTool('/Volumes/prj/dorina2/regulators/h_sapiens/hg38/targetScan_miRNA_hg38.bed')
regulators = [entry.name for entry in bt]
In [6]:
len(regulators)
Out[6]:
In [7]:
json_path = '/Volumes/prj/dorina2/regulators/h_sapiens/hg38/targetScan_miRNA_hg38.json'
# Count sites per regulator in one pass; the original called list.count
# inside a loop over set(regulators), which is O(n*m).
from collections import Counter
site_counts = Counter(regulators)
json_ = []
# sorted() makes the JSON output order deterministic between runs.
for regulator, n_sites in sorted(site_counts.items()):
    json_.append(
        {
            # FIX: adjacent string literals concatenate directly; the
            # original had no separator after {regulator}, producing
            # "...regulated by miR-Xpredictions were obtained...".
            "description": f'{n_sites} sites predicted as regulated by {regulator}. '
                           'Predictions were obtained from TargetScan 7.2 from March 2018. '
                           'The original data, mapped to hg19 assembly, was lift over with Crossmap.',
            "experiment": 'TargetScan',
            "id": regulator,
            "references": 'Agarwal, Vikram, et al. Predicting effective microRNA '
                          'target sites in mammalian mRNAs. elife 4 (2015).',
            "summary": regulator
        }
    )
with open(json_path, 'w') as fout:
    json.dump(json_, fout, indent=2)  # was indent=True, i.e. a 1-space indent
In [11]:
bt = pybedtools.BedTool('/Volumes/prj/dorina2/raw/Predicted_Targets.mm10.bed')
bt_new = bt.each(change_name_targetscan).saveas()
regulators = [entry.name for entry in bt_new]
bt_new.moveto(
'/Volumes/prj/dorina2/regulators/m_musculus/mm10/targetScan_miRNA_mm10.bed')
Out[11]:
In [12]:
json_path = '/Volumes/prj/dorina2/regulators/m_musculus/mm10/targetScan_miRNA_mm10.json'
json_ = []
for regulator in set(regulators):
n_sites = regulators.count(regulator)
json_.append(
{
"description": f'{n_sites} sites predicted as regulated by {regulator}'
'predictions were obtained from TargetScan 7.1 from March 2018. '
'The original data is mapped to mm10 assembly.',
"experiment": 'TargetScan',
"id": regulator,
"references": 'Agarwal, Vikram, et al. Predicting effective microRNA '
'target sites in mammalian mRNAs. elife 4 (2015).',
"summary": regulator
}
)
with open(json_path, 'w') as fout:
json.dump(json_, fout, indent=True)
In [5]:
for cell in ('K562', 'HepG2'):
    path = Path(f'/Volumes/biodb/encode/encode_hg38_clip_peaks/{cell}/replicates/')
    bed_files = path.glob('*.bed')
    # FIX: each RBP has one file per replicate (rep01/rep02), so the original
    # list contained every regulator multiple times and processed each pair
    # once per replicate file.  Dedupe (and sort for determinism).
    regulators = sorted({x.stem[: x.stem.find('_')] for x in bed_files})
    for regulator in regulators:
        bt1 = pybedtools.BedTool(str(path / f'{regulator}_{cell}_rep01.bed'))
        bt2 = pybedtools.BedTool(str(path / f'{regulator}_{cell}_rep02.bed'))
        # Reproducible peaks: strand-aware (s=True) intersection of the two
        # replicates, plus the union of both replicates for score lookup.
        bt_inter = bt1.intersect(bt2, s=True).sort()
        bt_merged = bt1.cat(bt2, postmerge=False).sort()
        # Annotate each intersected peak with the median score (column 5)
        # over the overlapping replicate peaks, then coerce to BED6.
        bt_inter = bt_inter.map(b=bt_merged, c='5', s=True, o='median').bed6()
        new_name = f'{cell}-{regulator}|eClip'
        bt_inter.each(change_name, new_name=new_name).saveas().moveto(
            f'/Volumes/tbrittoborges/dorina_eclip/eClip_{cell}_{regulator}_vanNostrand2016_hg38.bed')
In [6]:
# Concatenate all per-RBP bed files into the single Dorina regulator file.
!cat /Volumes/tbrittoborges/dorina_eclip/* > /Volumes/prj/dorina2/regulators/h_sapiens/hg38/eClip_RBP_hg38.bed
In [7]:
# Inspect an existing regulator JSON to match its schema in the next cell.
pd.read_json('/Volumes/prj/dorina2/regulators/h_sapiens/hg38/TargetScanCons_mirna_hg19.json').head()
Out[7]:
In [8]:
eclip_json = []
for cell in ('K562', 'HepG2'):
    path = Path(f'/Volumes/biodb/encode/encode_hg38_clip_peaks/{cell}/replicates/')
    bed_files = path.glob('*.bed')
    # FIX: each RBP has one file per replicate, so the original appended a
    # duplicate JSON entry for every regulator.  Dedupe and sort.
    regulators = sorted({x.stem[: x.stem.find('_')] for x in bed_files})
    for regulator in regulators:
        # Number of reproducible peaks written by the intersection cell above.
        n_sites = len(pybedtools.BedTool(
            f'/Volumes/tbrittoborges/dorina_eclip/eClip_{cell}_{regulator}_vanNostrand2016_hg38.bed'))
        eclip_json.append(
            {
                # FIX: the original description read "sites regulated by
                # {n_sites} obtained from..." and, because adjacent literals
                # concatenate, lacked a space before 'The original data'.
                "description": f'{n_sites} sites regulated by {regulator} obtained from {cell} cells. '
                               'The duplicates were merged with intersect and the median score is presented. '
                               'The original data was mapped to hg38 assembly.',
                "experiment": 'eClip',
                "id": f'{cell}-{regulator}|eClip',
                "references": 'Van Nostrand, Eric L., et al. Robust transcriptome-wide discovery '
                              'of RNA-binding protein binding sites with enhanced CLIP (eCLIP). Nature methods '
                              '13.6 (2016): 508.',
                "summary": f'{cell}-{regulator}|eClip'
            }
        )
In [9]:
# Persist the eClip regulator metadata next to the bed file.
with open('/Volumes/prj/dorina2/regulators/h_sapiens/hg38/eClip_RBP_hg38.json', 'w') as fout:
    json.dump(eclip_json, fout, indent=True)
In [10]:
# Sanity check: the written JSON round-trips through pandas.
pd.read_json('/Volumes/prj/dorina2/regulators/h_sapiens/hg38/eClip_RBP_hg38.json').head()
Out[10]:
In [3]:
bed_files = Path('/Volumes/prj/dorina2/regulators/m_musculus/mm10/').glob('*.bed')
In [5]:
# Rewrite every record name in each mm10 regulator file to the file stem
# plus '*', matching the webdorina parsing convention described at the top.
# NOTE(review): `bed_files` is a generator created in the previous cell; it
# is exhausted after one pass, so re-running only this cell silently does
# nothing.
# NOTE(review): this gives every record in a file the same name (stem + '*'
# with no per-record suffix) — confirm this is the intended format.
for bed_file in bed_files:
    bt = pybedtools.BedTool(str(bed_file))
    new_name = bed_file.stem + '*'
    bt.each(change_name, new_name=new_name).saveas().moveto(str(bed_file))
**TODO**: there are still problems with the PicTar files.
In [ ]: