In [2]:
# pycl imports
from pycl import *
# Std lib imports
import datetime
from glob import glob
from pprint import pprint as pp
from os import listdir, remove, rename
from os.path import abspath, basename, isdir
from collections import OrderedDict
# Third party imports
import numpy as np
import scipy.stats as stats
import pylab as pl
from Bio import Entrez
# Jupyter specific imports
from IPython.core.display import display, HTML, Markdown, Image
# Pyplot tweaking
%matplotlib inline
pl.rcParams['figure.figsize'] = 30, 10 # that's default image size for this interactive session
# Jupyter display tweaking
toogle_code()
larger_display(75)
# Simplify warning reporting to lighten the notebook style
import warnings
warnings.formatwarning=(lambda message, *args: "{}\n".format(message))
# Allow to use R directly
%load_ext rpy2.ipython
In [8]:
# Specific helper functions
def generate_header (PMID, cell, modification, method):
    h = "# Data cleaned, converted to BED6, coordinates converted to hg38 using liftOver\n"
    h+= "# Maurits Evers (maurits.evers@anu.edu.au)\n"
    h+= "# Data cleaned and standardized. {}\n".format(str(datetime.datetime.today()))
    h+= "# Adrien Leger (aleg@ebi.ac.uk)\n"
    h+= "# RNA_modification={}|Cell_type={}|Analysis_method={}|Pubmed_ID={}\n".format(modification, cell, method, PMID)
    h+= "# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
    return h

def file_summary(file, separator=["\t", "|"], max_items=10):
    n_line = fastcount(file)
    print("Filename:\t{}".format(file))
    print("Total lines:\t{}\n".format(n_line))
    # Print the first 10 and the last 5 lines of the file
    linerange(file, range_list=[[0,9],[n_line-5, n_line-1]])
    print(colsum(file, header=False, ignore_hashtag_line=True, separator=separator, max_items=max_items))

def distrib_peak_len (file, range=None, bins=50, normed=True):
    # Collect the peak lengths (|end - start|) and plot their distribution
    h = []
    for line in open(file):
        if line[0] != "#":
            ls = line.split("\t")
            delta = abs(int(ls[1])-int(ls[2]))
            h.append(delta)
    h.sort()
    pl.hist(h, normed=normed, range=range, bins=bins)
    pl.show()

def pubmed_fetch(pmid):
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
    return Entrez.read(handle)[0]

def pmid_to_info(pmid):
    results = pubmed_fetch(pmid)
    try:
        title = results['MedlineCitation']['Article']['ArticleTitle']
    except (KeyError, IndexError):
        title = "NA"
    try:
        # Last name of the first author
        first_name = results['MedlineCitation']['Article']['AuthorList'][0]['LastName']
    except (KeyError, IndexError):
        first_name = "NA"
    try:
        Year = results['MedlineCitation']['Article']['ArticleDate'][0]['Year']
    except (KeyError, IndexError):
        Year = "NA"
    try:
        Month = results['MedlineCitation']['Article']['ArticleDate'][0]['Month']
    except (KeyError, IndexError):
        Month = "NA"
    try:
        Day = results['MedlineCitation']['Article']['ArticleDate'][0]['Day']
    except (KeyError, IndexError):
        Day = "NA"
    return {"title":title, "first_name":first_name, "Year":Year, "Month":Month, "Day":Day}
I collected datasets from different sources: the two big databases of inosine editing sites, RADAR and DARNED, as well as all the datasets cited in the recent review about lncRNAs and epitranscriptomics from Shafik et al. In addition, I also have two recent datasets for m1A and m6A/m6Am. All the datasets need to be carefully reviewed, reformatted to the BED6 format, converted to the hg38 reference genome and reannotated with a recent GENCODE annotation. See the overview of the datasets in the table below.
Modification | Article | Initial number of peaks (Shafik et al) | Final number of peaks | Number of peaks in lncRNA (Shafik et al) | Number of unique lncRNAs (Shafik et al)
---|---|---|---|---|---
Inosine | Z Peng et al. Nat Biotechnol 30, 253-60 (2012) | 22686 (22686) | 21,111 | 3382 (4425) | 505 (846) |
Inosine | M Sakurai et al. Genome Res 24, 522-34 (2014) | 20482 (20482) | 20,482 | 2550 (3050) | 319 (400) |
m5C | S Hussain et al. Cell Rep 4, 255-61 (2013) | 1084 (1084) | 1,084 | 107 (110) | 39 (41) |
m5C | V Khoddami et al. Nat Biotechnol 31, 458-64 (2013) | 20553 (20533) | 20,553 | 1523 (1580) | 39 (38) |
m5C | JE Squires et al. Nucleic Acids Res 40, 5023-33 (2012) | 10490 (10490) | 10,490 | 281 (1544) | 112 (711) |
m6A | D Dominissini et al. Nature 485, 201-6 (2012) | 25918 (25776) | 2,894 | 115 (7397) | 84 (6165) |
m6A | KD Meyer et al. Cell 149, 1635-46 (2012) | 4341 (4341) | 4,341 | 48 (57) | 16 (20) |
m6A and m6Am | B Linder et al. Nat Methods 12, 767-72 (2015) | 15167 (NA) | 15,167 | 385 (NA) | 168 (NA) |
m1A | D Dominissini et al. Nature 530, 441-6 (2016) | 32136 (NA) | 19552 (HeLa:8873, HepG2:8550, HEK293:2129) | 606 (NA) | 338 (NA) |
pseudouridylation | TM Carlile et al. Nature 515, 143-6 (2014) | 8 (8) | 8 | 4 (4) | 3 (3) |
pseudouridylation | X Li et al. Nat Chem Biol 11, 592-7 (2015) | 1489 (1489) | 1,489 | 48 (58) | 44 (54) |
pseudouridylation | S Schwartz et al. Cell 159, 148-62 (2014) | 402 (396) | 402 | 14 (15) | 10 (11) |
Inosine | DARNED database | 333216 (259654) | 290,002 | 24152 (23574) | 1300 (1833) |
Inosine | RADAR database | 2576460 (2576289) | 1,342,374 | 97118 (218793) | 3343 (6376) |
In [62]:
file_summary("./PTM_Original_Datasets/DARNED_human_hg19_all_sites.txt")
DARNED is a little messy and hard to convert, since positions reported by several PMIDs and/or sample types were fused into a single site, which makes the file difficult to parse. I think it would be better to duplicate such sites, one line per PMID and sample type. I just need to verify that the number of fields in the PMID and cell type columns is the same and that they correspond to each other, i.e. the first cell type matches the first PMID.
In [55]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
header = "# DARNED database Human all sites hg19 coordinates\n"
header+= "# Data cleaned, filtered for Inosine editing, standardized and converted to BED6 format\n"
header+= "# Adrien Leger (aleg@ebi.ac.uk) - {}\n".format(str (datetime.datetime.today()))
header+= "# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
reformat_table(
input_file = "./PTM_Original_Datasets/DARNED_human_hg19_all_sites.txt",
output_file = "./PTM_Original_Datasets/DARNED_human_hg19_inosine.bed",
init_template = [0,"\t",1,"\t",2,"\t",3,"\t",4,"\t",5,"\t",6,"\t",7,"\t",8,"\t",9],
final_template = ["chr",0,"\t",1,"\t",1,"\t",3,">",4,"|",8,"|-|",9,"|",5,"\t0\t",2],
keep_original_header = False,
header = header,
replace_internal_space = '_',
replace_null_val = "-",
filter_dict = {0:["-"],1:["-"],2:["?"],3:["T","G","C"],4:["C","T","A","U"],8:["-"],9:["-"]},
subst_dict = {4:{"G":"I"}}
)
file_summary("./PTM_Original_Datasets/DARNED_human_hg19_inosine.bed")
I reformatted and filtered the database with reformat_table to make it BED6 compatible. I removed the sites lacking chromosome, position, tissue or PMID, as well as those with an unknown strand. In addition, I retained only the A>G transitions, which correspond to A>I editing. This filtering step eliminated 47,965 sites.
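As a quick sanity check, the number of eliminated sites can be recomputed from the line counts. A minimal sketch using pycl's simplecount, assuming the raw DARNED dump has a single non-commented column-header line (subtracted manually):
# Sanity check (sketch): recount the sites dropped by the filtering step
# Assumes the raw DARNED file has one column-header line
n_before = simplecount("./PTM_Original_Datasets/DARNED_human_hg19_all_sites.txt") - 1
n_after = simplecount("./PTM_Original_Datasets/DARNED_human_hg19_inosine.bed", ignore_hashtag_line=True)
print("Before: {}\tAfter: {}\tDropped: {}".format(n_before, n_after, n_before - n_after))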
In [56]:
d = {}
with open("./PTM_Original_Datasets/DARNED_human_hg19_inosine.bed", "r") as f:
    for line in f:
        if line[0] != "#":
            ls = supersplit(line, ["\t","|"])
            n_tissue = len(ls[4].split(","))
            n_PMID = len(ls[6].split(","))
            key = "{}:{}".format(n_tissue, n_PMID)
            if key not in d:
                d[key] = 0
            d[key] += 1
print(d)
I checked whether the PMID and tissue fields always have the same number of entries, so that I could demultiplex the fused positions. The answer is no: the numbers of PMIDs and tissues can differ. However, 272,551 positions have only 1 tissue and 1 PMID. These are maybe not the most reliable positions, but they are the easiest to interpret. The sites with only 1 PMID but several tissues can also be used, and the same goes for sites with several PMIDs and 1 tissue: I will demultiplex them so as to have only 1 PMID and 1 tissue per site. As for the sites with both several PMIDs and several tissues, I will extract them into a separate backup file.
In [57]:
infile = "./PTM_Original_Datasets/DARNED_human_hg19_inosine.bed"
outclean = "./PTM_Original_Datasets/DARNED_human_hg19_inosine_cleaned.bed"
outunclean = "./PTM_Original_Datasets/DARNED_human_hg19_inosine_unclean.bed"
with open(infile, "r") as inf, open(outclean, "w") as outf_clean, open(outunclean, "w") as outf_unclean:
    init_sites = uniq = several_tissue = several_pmid = several_all = final_sites = 0
    for line in inf:
        if line[0] == "#":
            outf_clean.write(line)
            outf_unclean.write(line)
        else:
            init_sites += 1
            ls = supersplit(line, ["\t","|"])
            tissue_list = ls[4].split(",")
            PMID_list = ls[6].split(",")
            n_tissue = len(tissue_list)
            n_PMID = len(PMID_list)
            if n_tissue == 1:
                # 1 PMID, 1 tissue = no problem
                if n_PMID == 1:
                    uniq += 1
                    final_sites += 1
                    outf_clean.write(line)
                # Several PMID, 1 tissue = demultiplex PMID lines
                else:
                    several_pmid += 1
                    for PMID in PMID_list:
                        final_sites += 1
                        outf_clean.write("{0}\t{1}\t{2}\t{3}|{4}|{5}|{6}|{7}\t{8}\t{9}".format(
                            ls[0],ls[1],ls[2],ls[3],ls[4],ls[5],PMID.strip(),ls[7],ls[8],ls[9]))
            else:
                # 1 PMID, several tissues = demultiplex tissue lines
                if n_PMID == 1:
                    several_tissue += 1
                    for tissue in tissue_list:
                        final_sites += 1
                        outf_clean.write("{0}\t{1}\t{2}\t{3}|{4}|{5}|{6}|{7}\t{8}\t{9}".format(
                            ls[0],ls[1],ls[2],ls[3],tissue.strip().strip("_").strip("."),ls[5],ls[6],ls[7],ls[8],ls[9]))
                # Several PMID, several tissues = extract the line into the uncleanable dataset
                else:
                    several_all += 1
                    outf_unclean.write(line)
print("Initial sites: ", init_sites)
print("Final clean sites: ", final_sites)
print("1 PMID 1 tissue: ", uniq)
print("1 PMID ++ tissue: ", several_tissue)
print("++ PMID 1 tissue: ", several_pmid)
print("++ PMID ++ tissue: ", several_all)
file_summary(outclean)
file_summary(outunclean)
I only lost the 4,270 sites that had both several PMIDs and several tissues. Some sites were demultiplexed, and I now have 290,018 sites with 1 PMID and 1 tissue each.
In [58]:
# Conversion to hg38 with Crossmap/liftover
lifover_chainfile = "../LiftOver_chain_files/hg19ToHg38.over.chain.gz"
input_bed = "./PTM_Original_Datasets/DARNED_human_hg19_inosine_cleaned.bed"
temp_bed = "./PTM_Original_Datasets/DARNED_human_hg38_inosine_temp.bed"
cmd = "CrossMap.py bed {} {} {}".format(lifover_chainfile, input_bed, temp_bed)
bash(cmd)
# Rewriting and updating of the header removed by Crossmap
final_bed = "./PTM_Original_Datasets/DARNED_human_hg38_inosine_cleaned.bed"
header = "# DARNED database Human all sites hg38 coordinates\n"
header+= "# Data cleaned, filtered for Inosine editing, standardized, converted to BED6 format and updated to hg38 coordinates\n"
header+= "# Adrien Leger (aleg@ebi.ac.uk) - {}\n".format(str (datetime.datetime.today()))
header+= "# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
with open(temp_bed, "r") as infile, open(final_bed, "w") as outfile:
    outfile.write(header)
    for line in infile:
        outfile.write(line)
file_summary(final_bed)
The conversion resulted in the loss of only 16 sites, which is negligible compared with the 290,002 sites in the database.
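The lost sites can be double-checked from CrossMap's companion output: in bed mode, intervals that fail to lift are written to a '.unmap' file next to the output (assumed naming). A minimal sketch:
# Count the sites CrossMap failed to lift over (sketch)
# Assumes CrossMap's bed mode writes failed intervals to '<output>.unmap'
print("Unmapped sites: {}".format(simplecount("./PTM_Original_Datasets/DARNED_human_hg38_inosine_temp.bed.unmap")))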
The file downloaded from RADAR is not a BED file, so I need to modify it to comply with the standard format. The format of the main file Human_AG_all_hg19_v2.txt is the following:
In [45]:
file_summary("./PTM_Original_Datasets/RADAR_human_hg19_v2_primary.txt", separator=["\t"])
I am not interested in the conservation fields, the annotation or the repetitive nature, but I will keep chromosome, position, gene and strand. Additional information is also available in a secondary database file that I found hidden on the RADAR website. It includes the original publication and the source of the biological sample, but it is only available for around half of the sites, coming from 4 publications; I will only retain these richly annotated sites. The same site can have multiple entries in the secondary file, since it could have been reported by several papers in several tissues. I will keep one line for each independently discovered site. The coverage and editing level could be interesting to save too.
In [46]:
file_summary("./PTM_Original_Datasets/RADAR_human_hg19_v2_secondary.txt")
The operation is quite complex since I will have to fuse the 2 files and extract only specific values, so I need to code a specific parser. The main RADAR file will be parsed and organised as a simple nested dict. The secondary file is the more important one, since I will start parsing from it and look up the complementary information in the main database file. Each site will be added to a list of Site objects, which will subsequently be iterated over and combined with the main database before writing a new BED-formatted file.
In [108]:
# Create a structured dict of dict to parse the main database file
from collections import OrderedDict

def parse_RADAR_main (file):
    # Define the top level access dict
    radar_dict = OrderedDict()
    for line in open(file, "r"):
        if line[0] != "#":
            sl = line.split("\t")
            assert len(sl) == 11
            chromosome, position, gene, strand = sl[0].strip(), int(sl[1].strip()), sl[2].strip(), sl[3].strip()
            if chromosome not in radar_dict:
                radar_dict[chromosome] = OrderedDict()
            # There should be only one line per position
            assert position not in radar_dict[chromosome]
            radar_dict[chromosome][position] = {"gene":gene, "strand":strand}
    return radar_dict
In [109]:
# Create a class to store a line of the additional file.
from collections import OrderedDict

class Site (object):
    #~~~~~~~CLASS FIELDS~~~~~~~#
    # Table of correspondence reference => PMID
    TITLE_TO_PMID = {
        "Peng et al 2012":"22327324",
        "Bahn et al 2012":"21960545",
        "Ramaswami et al 2012":"22484847",
        "Ramaswami et al 2013":"23291724",
    }
    # Table of correspondence tissue => sample name
    TISSUE_TO_SAMPLE = {
        "Brain":"Brain",
        "Illumina Bodymap":"Illumina_Bodymap",
        "Lymphoblastoid cell line":"YH",
        "U87 cell line":"U87MG"
    }
    #~~~~~~~FUNDAMENTAL METHODS~~~~~~~#
    # Parse a line of the additional information file
    def __init__(self, line):
        sl = line.strip().split("\t")
        self.chromosome = sl[0].split(":")[0].strip()
        self.position = int(sl[0].split(":")[1].strip())
        self.PMID = self.TITLE_TO_PMID[sl[1].strip()]
        self.tissue = self.TISSUE_TO_SAMPLE[sl[2].strip()]
        self.coverage = sl[3].strip()
        self.editing_level = sl[4].strip()
    # Fundamental class methods str and repr
    def __repr__(self):
        msg = "SITE CLASS\n"
        # List all values in the object dict in alphabetical order
        for key in sorted(self.__dict__.keys()):
            msg += "\t{}\t{}\n".format(key, self.__dict__[key])
        return msg
    def __str__(self):
        return self.__repr__()
In [110]:
a = Site("chr1:1037916\tPeng et al 2012\tLymphoblastoid cell line\t9\t66.67")
print(a)
In [111]:
# Create a sorted list of Site objects to parse the secondary database file
def parse_RADAR_secondary (file):
    # Define a list to store Site objects (not a dict because of redundancy)
    radar_list = []
    for line in open(file, "r"):
        if line[0] != "#":
            radar_list.append(Site(line))
    # Return the list sorted by chromosome and position
    return sorted(radar_list, key=lambda site: (site.chromosome, site.position))
Read the original file, reformat the fields and write a new BED6-compliant file. Example:
chrom chromstart chromend name score strand
chr4 774138 774138 A>I|LOC100129917|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:22327324 0 -
In [112]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
def reformat_RADAR (main_file, secondary_file, outfile, errfile, header):
    # Read and structure the 2 database files
    print("Parse the main database file")
    main = parse_RADAR_main(main_file)
    print("Parse the secondary database file")
    secondary = parse_RADAR_secondary(secondary_file)
    print("Combine the data together in a new bed formatted file")
    with open(outfile, "w+") as csvout, open(errfile, "w+") as errout:
        # Rewrite header
        csvout.write(header)
        total = fail = success = 0
        # Start counting at 1 so that 'total' ends up equal to the number of sites
        for total, site in enumerate(secondary, 1):
            try:
                line = "{0}\t{1}\t{1}\t{2}|{3}|{4}|{5}|{6}\t{7}\t{8}\n".format(
                    site.chromosome,
                    site.position,
                    "A>I",
                    site.tissue,
                    "-",
                    site.PMID,
                    main[site.chromosome][site.position]["gene"],
                    site.editing_level,
                    main[site.chromosome][site.position]["strand"],
                )
                csvout.write(line)
                success += 1
            # Sites absent from the main database file go to the error file
            except KeyError:
                line = "{0}\t{1}\t{2}\t{3}\t{4}\n".format(
                    site.chromosome,
                    site.position,
                    site.tissue,
                    site.PMID,
                    site.editing_level)
                errout.write(line)
                fail += 1
    print("{} Sites processed\t{} Sites pass\t{} Sites fail".format(total, success, fail))
In [113]:
header = "# RADAR database Human v2 all sites hg19 coordinates\n"
header+= "# Data cleaned, standardized and converted to BED6 format\n"
header+= "# Adrien Leger (aleg@ebi.ac.uk) - {}\n".format(str (datetime.datetime.today()))
header+= "# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
reformat_RADAR(
main_file = "./PTM_Original_Datasets/RADAR_human_hg19_v2_primary.txt",
secondary_file = "./PTM_Original_Datasets/RADAR_human_hg19_v2_secondary.txt",
outfile = "./PTM_Original_Datasets/RADAR_Human_hg19_inosine_cleaned.bed",
errfile = "./PTM_Original_Datasets/RADAR_Human_hg19_inosine_orphan.bed",
header = header)
file_summary("./PTM_Original_Datasets/RADAR_Human_hg19_inosine_cleaned.bed")
After combining the information, out of the 1,343,463 sites in the RADAR secondary file and the 2,576,460 sites in the RADAR primary file, 1,342,423 consistent sites were found in both files, i.e. about half of the database sites were filtered out because they were not present in both files.
In [114]:
# Conversion to hg38 with Crossmap/liftover
lifover_chainfile = "../LiftOver_chain_files/hg19ToHg38.over.chain.gz"
input_bed = "./PTM_Original_Datasets/RADAR_Human_hg19_inosine_cleaned.bed"
temp_bed = "./PTM_Original_Datasets/RADAR_Human_hg38_inosine_temp.bed"
cmd = "CrossMap.py bed {} {} {}".format(lifover_chainfile, input_bed, temp_bed)
bash(cmd)
# Rewriting and updating of the header removed by Crossmap
final_bed = "./PTM_Original_Datasets/RADAR_Human_hg38_inosine_cleaned.bed"
header = "# RADAR database Human v2 all sites hg38 coordinates\n"
header+= "# Data cleaned, standardized, converted to BED6 format and updated to hg38 coordinates\n"
header+= "# Adrien Leger (aleg@ebi.ac.uk) - {}\n".format(str (datetime.datetime.today()))
header+= "# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
with open(temp_bed, "r") as infile, open(final_bed, "w") as outfile:
    outfile.write(header)
    for line in infile:
        outfile.write(line)
file_summary(final_bed)
Around 10,000 additional sites were lost during the conversion from hg19 to hg38.
The following datasets are pretty clean and already converted to hg38 coordinates. However, they are not standardized and their syntax can differ, since the bioinformatician in charge of the analysis tried to keep as much information as possible from the original datasets. I will standardize the file names and the BED name fields, and reformat the file headers. In addition, I will explore the datasets to check that they are consistent and decide whether or not to use them.
In [35]:
listdir("./PTM_Original_Datasets/")
Out[35]:
In [5]:
infile="./PTM_Original_Datasets/editing_Peng_hg38.bed"
PMID = "22327324"
cell = "YH"
modification = "A>I"
method = "A_to_I_editing"
author = "Peng"
outfile = "./PTM_Clean_Datasets/{}_{}_{}_hg38_cleaned.bed".format(author, modification, cell, method)
file_summary(infile)
In [5]:
print(colsum(infile, colrange=[9], header=False, ignore_hashtag_line=True, separator=["\t", "|"], max_items=20, ret_type="report"))
Column 9 contains more than just inosine transitions (A>G); it also lists all the other editing sites they found. Here I will focus on inosine only, so I need to filter out all the other values.
In [61]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",7,"|",8,"|",9,"|",10,"%|",11,"|",12,"|",13,"|",14,"|",15,"|",16,"|",17,"|",18,"\t",19,"\t",20]
final_template=[0,"\t",1,"\t",2,"\t",9,"|",cell,"|",method,"|",PMID,"|",18,"\t",10,"\t",20]
# filter out all but A>G transition which are Inosine transition
filter_dict={9:["T->C","G->A","C->T","T->G","C->G","G->C","A->C","T->A","C->A","G->T","A->T"]}
# Reformat the field value A->G to A>I for standardization
subst_dict={9:{"A->G":"A>I"}}
reformat_table(
input_file=infile,
output_file=outfile,
init_template=init_template,
final_template=final_template,
keep_original_header = False,
header = generate_header(PMID, cell, modification, method),
replace_internal_space='_',
replace_null_val="-",
subst_dict = subst_dict,
filter_dict = filter_dict )
file_summary(outfile)
In [17]:
distrib_peak_len(outfile)
The file contains a lot of fields, some of which I cannot even guess the content of. The dataset was not filtered and contains more than just the A>G (A>I) transitions: out of 22,686 sites in total, only 21,111 are A>G transitions. I filtered out all the other modifications and retained only the A>G transitions.
I am not sure that I should use this dataset, especially since it is already included in the RADAR database.
In [18]:
infile = "./PTM_Original_Datasets/editing_Sakurai_hg38.bed"
PMID = "24407955"
cell = "Brain"
modification = "A>I"
method = "ICE_seq"
author = "Sakurai"
outfile = "./PTM_Clean_Datasets/{}_{}_{}_hg38_cleaned.bed".format(author, modification, cell, method)
file_summary(infile)
In [7]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"\t",7,"\t",8]
final_template=[0,"\t",1,"\t",2,"\t",modification,"|",cell,"|",method,"|",PMID,"|",4,"\t",7,"\t",8]
reformat_table(
input_file=infile,
output_file=outfile,
init_template=init_template,
final_template=final_template,
keep_original_header = False,
header = generate_header(PMID, cell, modification, method),
replace_internal_space='_',
replace_null_val="-")
file_summary(outfile)
In [19]:
distrib_peak_len(outfile)
No problem with this dataset. I kept the gene locus name for future comparison after reannotation with GENCODE.
In [20]:
infile = "./PTM_Original_Datasets/m5C_Hussain_hg38.bed"
PMID = "23871666"
cell = "HEK293"
modification = "m5C"
method = "miCLIP"
author = "Hussain"
outfile = "./PTM_Clean_Datasets/{}_{}_{}_hg38_cleaned.bed".format(author, modification, cell, method)
file_summary(infile)
In [9]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"\t",5,"\t",6]
final_template=[0,"\t",1,"\t",2,"\t",modification,"|",cell,"|",method,"|",PMID,"|-\t",5,"\t",6]
reformat_table(
input_file=infile,
output_file=outfile,
init_template=init_template,
final_template=final_template,
keep_original_header = False,
header = generate_header(PMID, cell, modification, method),
replace_internal_space='_',
replace_null_val="-")
file_summary(outfile)
In [13]:
distrib_peak_len(outfile)
No problem with this dataset. Since there is no gene locus, I just filled the field with a dash to indicate that it is empty.
In [21]:
infile="./PTM_Original_Datasets/m5C_Khoddami_hg38.bed"
PMID = "23604283"
cell = "MEF"
modification = "m5C"
method = "AzaIP"
author = "Khoddami"
outfile = "./PTM_Clean_Datasets/{}_{}_{}_hg38_cleaned.bed".format(author, modification, cell, method)
file_summary(infile)
In [11]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"\t",5,"\t",6]
final_template=[0,"\t",1,"\t",2,"\t",modification,"|",cell,"|",method,"|",PMID,"|",4,"\t",5,"\t",6]
reformat_table(
input_file=infile,
output_file=outfile,
init_template=init_template,
final_template=final_template,
keep_original_header = False,
header = generate_header(PMID, cell, modification, method),
replace_internal_space='_',
replace_null_val="-")
file_summary(outfile)
In [22]:
distrib_peak_len(outfile)
No problem with this dataset. It seems to be focused on tRNA genes, which are clearly over-represented in the gene list.
In [23]:
infile="./PTM_Original_Datasets/m5C_Squires_hg38.bed"
PMID = "22344696"
cell = "HeLa"
modification = "m5C"
method = "bisulfite_seq"
author = "Squires"
outfile = "./PTM_Clean_Datasets/{}_{}_{}_hg38_cleaned.bed".format(author, modification, cell, method)
file_summary(infile)
In [13]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"\t",5,"\t",6]
final_template=[0,"\t",1,"\t",2,"\t",modification,"|",cell,"|",method,"|",PMID,"|-\t",5,"\t",6]
reformat_table(
input_file=infile,
output_file=outfile,
init_template=init_template,
final_template=final_template,
keep_original_header = False,
header = generate_header(PMID, cell, modification, method),
replace_internal_space='_',
replace_null_val="-")
file_summary(outfile)
In [24]:
distrib_peak_len(outfile)
1 nt wide peaks = no problem with this dataset. Since there is no gene locus, I just filled the field with a dash to indicate that it is empty.
In [51]:
infile="./PTM_Original_Datasets/m6A_Dominissini_hg38.bed"
PMID = "22575960"
cell = "HepG2"
modification = "m6A"
method = "M6A_seq"
author = "Dominissini"
outfile = "./PTM_Clean_Datasets/{}_{}_{}_hg38_cleaned.bed".format(author, modification, cell, method)
file_summary(infile)
In [4]:
distrib_peak_len(infile, normed=False, bins=500)
Apparently there is a problem in the data, since some of the peaks are up to 3,500,000 bp long, which is much longer than initially described in the paper. I downloaded the original peak-calling file from the supplementary data of the paper to compare it with this datafile.
In [30]:
infile="./PTM_Original_Datasets/m6A_Dominissini_hg19_original_table.csv"
file_summary(infile)
In [26]:
distrib_peak_len(infile, normed=False, bins=500)
In [27]:
distrib_peak_len(infile, normed=False, range=[1,2000], bins=500)
In [28]:
distrib_peak_len(infile, normed=False, range=[1,200], bins=200)
The same problem is also found in the original data, though the values only go up to 2,500,000... I think the long peaks were improperly called. Looking at the data in detail, most of the peaks fall in the 1 to 130 bp range, with a second, smaller mode around 1,000 bp. According to the original article, the mapping was done on the human transcriptome rather than the genome, and the coordinates were apparently converted to genome coordinates afterwards, which might explain this discrepancy. I have 2 options: starting from scratch with a recent genome build, but the dataset seems quite tricky and I am not sure I could do it as well as the original authors; or being restrictive and keeping only the small peaks, i.e. ≤ 1,000 bp. I will start with this second alternative and go back to the raw data again if needed. To be sure of the quality of the data, I will start from the original table and do the liftOver conversion myself.
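To quantify the impact of the 1,000 bp cutoff before filtering, a quick count over the original table; a minimal sketch, assuming tab-separated columns with start and end in columns 2 and 3, as used by distrib_peak_len above:
# Sketch: count peaks retained/discarded by the 1,000 bp cutoff
n_kept = n_dropped = 0
for line in open("./PTM_Original_Datasets/m6A_Dominissini_hg19_original_table.csv"):
    if line[0] != "#":
        ls = line.split("\t")
        if abs(int(ls[2]) - int(ls[1])) <= 1000:
            n_kept += 1
        else:
            n_dropped += 1
print("Peaks <= 1000 bp: {}\tPeaks > 1000 bp: {}".format(n_kept, n_dropped))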
In [52]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
infile="./PTM_Original_Datasets/m6A_Dominissini_hg19_original_table.csv"
outfile = "./PTM_Clean_Datasets/Dominissini_m6A_HepG2_hg19_cleaned.bed"
init_template=[0,"\t",1,"\t",2,"\t",3,"\t",4]
final_template=[0,"\t",1,"\t",2,"\t",modification,"|",cell,"|",method,"|",PMID,"|",4,"\t-\t",3]
# Predicate function to filter out large peaks
predicate = lambda val_list: abs(int(val_list[1])-int(val_list[2])) <= 1000
reformat_table(
input_file=infile,
output_file=outfile,
init_template=init_template,
final_template=final_template,
keep_original_header = False,
header = generate_header(PMID, cell, modification, method),
replace_internal_space='_',
replace_null_val="-",
predicate = predicate)
file_summary(outfile)
The filtering on peak length removes a lot of peaks: nearly 90% of the dataset.
In [53]:
# Conversion to hg38 with Crossmap/liftover
lifover_chainfile = "../LiftOver_chain_files/hg19ToHg38.over.chain.gz"
input_bed = "./PTM_Clean_Datasets/Dominissini_m6A_HepG2_hg19_cleaned.bed"
temp_bed = "./PTM_Clean_Datasets/Dominissini_m6A_HepG2_hg38_temp.bed"
cmd = "CrossMap.py bed {} {} {}".format(lifover_chainfile, input_bed, temp_bed)
bash(cmd)
# Rewriting and updating of the header removed by Crossmap
final_bed = "./PTM_Clean_Datasets/Dominissini_m6A_HepG2_hg38_cleaned.bed"
header = generate_header(PMID, cell, modification, method)
with open(temp_bed, "r") as infile, open(final_bed, "w") as outfile:
    outfile.write(header)
    for line in infile:
        outfile.write(line)
file_summary(final_bed)
In [55]:
distrib_peak_len(final_bed, normed=False, bins=200)
It is much better, but I only have 2,894 sites out of the ~25,000 initially in the dataset. For this first exploratory study it should be OK, but I will probably have to go back to the original data again later.
In [14]:
infile="./PTM_Original_Datasets/m6A_Meyer_hg38.bed"
PMID = "22608085"
cell = "HEK293"
modification = "m6A"
method = "MeRIP_Seq"
author = "Meyer"
outfile = "./PTM_Clean_Datasets/{}_{}_{}_hg38_cleaned.bed".format(author, modification, cell, method)
file_summary(infile)
In [16]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"\t",6,"\t",7]
final_template=[0,"\t",1,"\t",2,"\t",modification,"|",cell,"|",method,"|",PMID,"|",4,"\t",6,"\t",7]
reformat_table(
input_file=infile,
output_file=outfile,
init_template=init_template,
final_template=final_template,
keep_original_header = False,
header = generate_header(PMID, cell, modification, method),
replace_internal_space='_',
replace_null_val="-")
file_summary(outfile)
In [17]:
distrib_peak_len(outfile, bins = 100)
The original dataset was OK and clean. Contrary to the previous dataset, the peak widths are between 100 and 220 bp, which is clearly better. It is however interesting to note that the peak lengths are not randomly distributed.
In [27]:
infile="./PTM_Original_Datasets/miCLIP_m6A_Linder2015_hg38.bed"
PMID = "26121403"
cell = "HEK293"
modification = "m6A:m6Am"
method = "miCLIP"
author = "Linder"
outfile = "./PTM_Clean_Datasets/{}_{}_{}_hg38_cleaned.bed".format(author, modification, cell, method)
file_summary(infile)
In [22]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
init_template=[0,"\t",1,"\t",2,"\t",3,"\t",4,"\t",5]
final_template=[0,"\t",1,"\t",2,"\t",modification,"|",cell,"|",method,"|",PMID,"|-\t",4,"\t",5]
reformat_table(
input_file=infile,
output_file=outfile,
init_template=init_template,
final_template=final_template,
keep_original_header = False,
header = generate_header(PMID, cell, modification, method),
replace_internal_space='_',
replace_null_val="-")
file_summary(outfile)
In [28]:
distrib_peak_len(outfile)
The name field is unusable since it contains a variable number of subfields, so I cannot parse it easily. That is not a big problem since I will reannotate the data based on the latest GENCODE release, so I did not save any of the information contained in the original name field. As expected with miCLIP data, the peaks are 1 nt wide.
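To document the issue, one can tally how many subfields the original name column contains per line; a sketch, assuming a tab-separated BED with the name in column 4 and '|'-separated subfields (adjust the separator if the original uses another one):
# Sketch: tally the number of subfields in the original name column
subfield_counts = {}
for line in open("./PTM_Original_Datasets/miCLIP_m6A_Linder2015_hg38.bed"):
    if line[0] != "#":
        n = len(line.split("\t")[3].split("|"))
        if n not in subfield_counts:
            subfield_counts[n] = 0
        subfield_counts[n] += 1
print(subfield_counts)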
In [29]:
infile="./PTM_Original_Datasets/MeRIPseq_m1A_Dominissini2016_hg38.bed"
PMID = "26863196"
cell = "HeLa:HEK293:HepG2"
modification = "m1A"
method = "M1A_seq"
author = "Dominissini"
outfile = "./PTM_Clean_Datasets/{}_{}_{}_hg38_cleaned.bed".format(author, modification, cell, method)
file_summary(infile)
I don't understand some of the categories for the HepG2 cells.
In [37]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"\t",6,"\t",7]
final_template=[0,"\t",1,"\t",2,"\t",modification,"|",5,"|",method,"|",PMID,"|",3,"\t",6,"\t",7]
# Filter out the HepG2 stress-condition and total RNA categories
filter_dict={5:["HEPG2_heat_shock_4h","HEPG2_Glucose_starv_4h","HEPG2_common_total_RNA"]}
# Standardize the name of the remaining HepG2 category
subst_dict={5:{"HEPG2_common_mRNA":"HepG2"}}
reformat_table(
input_file=infile,
output_file=outfile,
init_template=init_template,
final_template=final_template,
keep_original_header = False,
header = generate_header(PMID, cell, modification, method),
replace_internal_space='_',
replace_null_val="-",
subst_dict = subst_dict,
filter_dict = filter_dict
)
file_summary(outfile)
In [30]:
distrib_peak_len(outfile)
In [31]:
infile="./PTM_Original_Datasets/pseudoU_Carlile_hg38.bed"
PMID = "25192136"
cell = "HeLa"
modification = "Y"
method = "Pseudo_seq"
author = "Carlile"
outfile = "./PTM_Clean_Datasets/{}_{}_{}_hg38_cleaned.bed".format(author, modification, cell, method)
file_summary(infile)
In [32]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"\t",5,"\t",6]
final_template=[0,"\t",1,"\t",2,"\t",modification,"|",cell,"|",method,"|",PMID,"|",4,"\t",5,"\t",6]
reformat_table(
input_file=infile,
output_file=outfile,
init_template=init_template,
final_template=final_template,
keep_original_header = False,
header = generate_header(PMID, cell, modification, method),
replace_internal_space='_',
replace_null_val="-")
file_summary(outfile)
In [32]:
distrib_peak_len(outfile)
The dataset is OK, but it contains only the 8 peaks identified in lncRNAs, and 1 of the peaks is really wide (40,000 bp): the coordinates seem to correspond to the gene coordinates rather than to the peaks themselves. Should I use the dataset? Is it really worth remapping everything for such a low number of eventual peaks?
In [33]:
infile="./PTM_Original_Datasets/pseudoU_Li_hg38.bed"
PMID = "26075521"
cell = "HEK293"
modification = "Y"
method = "CeU_Seq"
author = "Li"
outfile = "./PTM_Clean_Datasets/{}_{}_{}_hg38_cleaned.bed".format(author, modification, cell, method)
file_summary(infile)
In [35]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"\t",7,"\t",8]
final_template=[0,"\t",1,"\t",2,"\t",modification,"|",cell,"|",method,"|",PMID,"|",4,"\t",7,"\t",8]
reformat_table(
input_file=infile,
output_file=outfile,
init_template=init_template,
final_template=final_template,
keep_original_header = False,
header = generate_header(PMID, cell, modification, method),
replace_internal_space='_',
replace_null_val="-")
file_summary(outfile)
In [34]:
distrib_peak_len(outfile)
No problem with this dataset
In [35]:
infile="./PTM_Original_Datasets/pseudoU_Schwartz_hg38.bed"
PMID = "25219674"
cell = "HEK293:Fibroblast"
modification = "Y"
method = "Psi-seq"
author = "Schwartz"
outfile = "./PTM_Clean_Datasets/{}_{}_{}_hg38_cleaned.bed".format(author, modification, cell, method)
file_summary(infile)
In [39]:
# chrom chromstart chromend modif|cell_type|method|PMID|loci score strand\n"
init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"\t",5,"\t",6]
final_template=[0,"\t",1,"\t",2,"\t",modification,"|",cell,"|",method,"|",PMID,"|",4,"\t",5,"\t",6]
reformat_table(
input_file=infile,
output_file=outfile,
init_template=init_template,
final_template=final_template,
keep_original_header = False,
header = generate_header(PMID, cell, modification, method),
replace_internal_space='_',
replace_null_val="-")
file_summary(outfile)
In [36]:
distrib_peak_len(outfile)
No problem with this dataset
In [57]:
for f in sorted(glob("./PTM_Clean_Datasets/*.bed")):
    print(f)
    linerange(f, [[10,12]])
OK for all the datasets
In [58]:
for f in sorted(glob("./PTM_Clean_Datasets/*.bed")):
    print("\n", "-"*100)
    print("Dataset Name\t{}".format(basename(f)))
    print("Number sites\t{}".format(simplecount(f, ignore_hashtag_line=True)))
    a = colsum(
        f,
        colrange = [3,4,5,6],
        header=False,
        ignore_hashtag_line=True,
        separator=["\t", "|"],
        max_items=20,
        ret_type="dict"
    )
    # Get more info via pubmed
    print("PMID")
    for pmid, count in a[6].items():
        pubmed_info = pmid_to_info(pmid)
        print("\t*{}\t{}\n\t {}. et al, {}\{}\{}\n\t {}".format(
            pmid, count,
            pubmed_info["first_name"],
            pubmed_info["Year"],
            pubmed_info["Month"],
            pubmed_info["Day"],
            pubmed_info["title"]))
    # Simple listing for the other fields
    for title, col in [["RNA PTM",3],["Tissue/cell",4],["Method",5]]:
        print(title)
        print(dict_to_report(a[col], ntab=1, max_items=10, tab="\t", sep="\t"))
The original annotations might not be optimal, and they were probably not made from a single reference annotation file. I will reannotate all the datasets with the latest version of GENCODE (gencode.v24.long_noncoding_RNAs.gff3). I split the file in 3 to retain only genes, transcripts and exons, as sketched below. I also got the general GENCODE file containing all the annotated genes in the primary assembly.
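For the record, the split by feature type can be reproduced with a few lines; a minimal sketch, assuming an uncompressed GFF3 with the feature type in column 3 (the output file names are illustrative):
# Sketch: split a GFF3 annotation into one file per feature type of interest
in_gff = "../../Reference_Annotation/gencode_v24_lncRNAs.gff3"
for feature in ["gene", "transcript", "exon"]:
    out_gff = in_gff.replace(".gff3", "_{}.gff3".format(feature))
    with open(in_gff, "r") as inf, open(out_gff, "w") as outf:
        for line in inf:
            # Keep header lines and rows whose feature type (column 3) matches
            if line.startswith("#") or line.split("\t")[2] == feature:
                outf.write(line)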
In [120]:
# New dir to create annotated files
mkdir("PTM_Annotated_Datasets")
mkdir("Test")
Out[120]:
I found pybedtools, a Python wrapper package for BEDTools, to manipulate BED files. I will use it to intersect my BED files containing the positions of the PTMs (or peaks) with the GFF3 annotation files. This will allow me to get gene names for each position.
In [17]:
help(reformat_table)
In [23]:
head("../../Reference_Annotation/gencode_v24.gff3.gz")
In [29]:
import pybedtools
annotation_file = "../../Reference_Annotation/gencode_v24.gff3"
peak_file = "./PTM_Clean_Datasets/Dominissini_m6A_HepG2_hg38_cleaned.bed"
output_file = "./test.bed"
peak = pybedtools.BedTool(peak_file)
annotation = pybedtools.BedTool(annotation_file)
intersection = peak.intersect(annotation, wo=True, s=True)
# Reformat the file generated by pybedtools to a simple Bed format
init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",7,"\t",8,"\t",9,"\t",10,"\t",11,"\t",12,"\t",13,
"\t",14,"\t",15,"\t",16,"\t",17,"\tID=",18,";gene_id=",19,";gene_type=",20,";gene_status=",21,
";gene_name=",22,";level=",23,";havana_gene=",24]
final_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",18,"|",20,"|",22,"\t",8,"\t",9]
# BedTool.head() prints the first lines itself and returns None
intersection.head()
print("Post processing results")
reformat_table(
input_file=intersection.fn,
output_file=output_file,
init_template=init_template,
final_template=final_template,
replace_internal_space='_',
replace_null_val="-",
keep_original_header = False,
predicate = lambda v: v[12] == "gene"
)
head(output_file)
In [36]:
import pybedtools

def intersect_extract_genecodeID (annotation_file, peak_file, outdir):
    output_file = "{}/{}_{}.bed".format(outdir, file_basename(peak_file), file_basename(annotation_file))
    genecount_file = "{}/{}_{}_uniq-gene.csv".format(outdir, file_basename(peak_file), file_basename(annotation_file))
    site_file = "{}/{}_{}_uniq-sites.csv".format(outdir, file_basename(peak_file), file_basename(annotation_file))
    peak = pybedtools.BedTool(peak_file)
    annotation = pybedtools.BedTool(annotation_file)
    # Intersect the 2 files with pybedtools
    print("Intersecting {} with {}".format(file_basename(peak_file), file_basename(annotation_file)))
    intersection = peak.intersect(annotation, wo=True, s=True)
    # Reformat the file generated by pybedtools to a simple Bed format
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",7,"\t",8,"\t",9,"\t",10,"\t",11,"\t",12,"\t",13,
        "\t",14,"\t",15,"\t",16,"\t",17,"\tID=",18,";gene_id=",19,";gene_type=",20,";gene_status=",21,
        ";gene_name=",22,";level=",23,";havana_gene=",24]
    final_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",18,"|",20,"|",22,"\t",8,"\t",9]
    h = "# Data cleaned, converted to BED6, standardized and coordinates converted to hg38 using liftOver\n"
    h+= "# Overlapping gene annotation with gencode v24\n"
    h+= "# Adrien Leger (aleg@ebi.ac.uk) {}\n".format(str(datetime.datetime.today()))
    h+= "# chrom\tchromstart\tchromend\tmodif|cell_type|method|PMID|ensembl_id|gene_type|gene_name\tscore\tstrand\n"
    print("Post processing results")
    reformat_table(
        input_file=intersection.fn,
        output_file=output_file,
        init_template=init_template,
        final_template=final_template,
        replace_internal_space='_',
        replace_null_val="-",
        header = h,
        keep_original_header = False,
        predicate = lambda v: v[12] == "gene"
    )
    # Count the number of lines in the initial and final peak files
    i, j = simplecount(peak_file, ignore_hashtag_line=True), simplecount(output_file, ignore_hashtag_line=True)
    print("Total initial positions: {}\tTotal final positions: {}".format(i, j))
    # Count uniq gene ids and uniq positions found in the dataset
    geneid_dict = OrderedDict()
    coord_dict = OrderedDict()
    with open(output_file, "r") as fp:
        for line in fp:
            if line[0] != "#":
                sl = supersplit(line, separator=["\t", "|"])
                # Gene id, gene type and gene name
                gene_id = "{}\t{}\t{}".format(sl[7], sl[9], sl[8])
                if gene_id not in geneid_dict:
                    geneid_dict[gene_id] = 0
                geneid_dict[gene_id] += 1
                coord = "{}:{}-{}".format(sl[0], sl[1], sl[2])
                if coord not in coord_dict:
                    coord_dict[coord] = 0
                coord_dict[coord] += 1
    print("Uniq genes found\t{}\nUniq positions found\t{}\n".format(len(geneid_dict), len(coord_dict)))
    # Write each gene id found with its number of occurrences
    with open(genecount_file, "w") as fp:
        fp.write(dict_to_report(geneid_dict, max_items=0, sep="\t"))
    # Write each position found with its number of occurrences
    with open(site_file, "w") as fp:
        fp.write(dict_to_report(coord_dict, max_items=0, sep="\t"))
Add the gene name and Ensembl gene ID to the BED name field.
Test with one file:
In [44]:
annotation_file = "../../Reference_Annotation/gencode_v24.gff3"
peak_file = "./PTM_Clean_Datasets/DARNED_human_hg38_inosine_cleaned.bed"
outdir = "./"
output_file = "./DARNED_human_hg38_inosine_cleaned_gencode_v24.bed"
genecount_file = "./DARNED_human_hg38_inosine_cleaned_gencode_v24_uniq-gene.csv"
site_file = "./DARNED_human_hg38_inosine_cleaned_gencode_v24_uniq-sites.csv"
intersect_extract_genecodeID (annotation_file, peak_file, outdir)
file_summary(output_file, separator=["\t","|"])
head (genecount_file)
head (site_file)
remove(output_file)
remove(genecount_file)
remove(site_file)
It is working OK, so I can now loop over all the cleaned PTM files.
In [45]:
# Annotation vs gencode v24 lncRNA genes
annotation_file = '/home/aleg/Data/Reference_Annotation/gencode_v24_lncRNAs.gff3'
for peak_file in sorted(glob("./PTM_Clean_Datasets/*.bed")):
    outdir = "./PTM_Annotated_Datasets"
    intersect_extract_genecodeID(annotation_file, peak_file, outdir)
Between 1 and 16% of the peaks (excluding the weird Carlile dataset) fall in lncRNAs annotated in GENCODE v24. Now I iterate through the datasets to count the number of unique genes found.
In [46]:
# Annotation vs gencode v24 all genes
annotation_file = "/home/aleg/Data/Reference_Annotation/gencode_v24.gff3"
for peak_file in sorted(glob("./PTM_Clean_Datasets/*.bed")):
    outdir = "./PTM_Annotated_Datasets"
    intersect_extract_genecodeID(annotation_file, peak_file, outdir)
Article | Modification | Total initial sites (Shafik) | Total final sites | All sites in RNA | Unique sites in RNA | Unique RNAs with sites | All sites in lncRNA (Shafik) | Unique sites in lncRNA | Unique lncRNAs with sites (Shafik)
---|---|---|---|---|---|---|---|---|---
Peng et al | Inosine | 22686 (22686) | 21,111 | 18,827 | 17,081 | 4,034 | 3382 (4425) | 3259 | 505 (846) |
Sakurai et al | Inosine | 20482 (20482) | 20,482 | 19,906 | 17,230 | 2,648 | 2550 (3050) | 2330 | 319 (400) |
Hussain et al | m5C | 1084 (1084) | 1,084 | 332 | 296 | 158 | 107 (110) | 106 | 39 (41) |
Khoddami et al | m5C | 20553 (20533) | 20,553 | 14,234 | 12,011 | 321 | 1523 (1580) | 1523 | 36 (38) |
Squires et al. | m5C | 10490 (10490) | 10,490 | 11,038 | 10,169 | 4,659 | 281 (1544) | 272 | 112 (711) |
Dominissini et al. | m6A | 25918 (25776) | 2,894 | 1,080 | 1,019 | 466 | 115 (7397) | 114 | 84 (6165) |
Meyer et al | m6A | 4341 (4341) | 4,341 | 4,564 | 4,312 | 1,525 | 48 (57) | 48 | 16 (20) |
Linder et al | m6A and m6Am | 15167 (NA) | 15,167 | 15,945 | 14,851 | 6,191 | 385 (NA) | 375 | 168 (NA) |
Dominissini et al. | m1A | 32136 (NA) | 19552 (HeLa:8873, HepG2:8550, HEK293:2129) | 20,786 | 19,063 | 8,516 | 606 (NA) | 578 | 338 (NA) |
Carlile et al | pseudouridylation | 8 (8) | 8 | 25 | 7 | 22 | 4 (4) | 3 | 3 (3) |
Li et al | pseudouridylation | 1489 (1489) | 1,489 | 1,579 | 1,471 | 1,425 | 48 (58) | 47 | 44 (54) |
Schwartz et al | pseudouridylation | 402 (396) | 402 | 421 | 372 | 378 | 14 (15) | 14 | 10 (11) |
DARNED | Inosine | 333216 (259654) | 290,002 | 294,441 | 254,964 | 10,558 | 24152 (23574) | 19814 | 1300 (1833) |
RADAR | Inosine | 2576460 (2576289) | 1,342,374 | 1,372,609 | 918,049 | 17,532 | 97118 (218793) | 68396 | 3343 (6376) |
I obtained fewer peaks and fewer unique genes than Shafik et al., probably because I used more stringent filtering and cleaning criteria (especially for the Dominissini 2012 paper). Overall the results are consistent, though for later analyses I will probably have to go back to the original data for several papers.
In [ ]: