In [28]:
import ontobio.golr.golr_associations as ga
In [5]:
# Collect every Dmel gene annotated to the DNA-binding TF term.
DMEL = 'NCBITaxon:7227'
DNA_BINDING_TF = 'GO:0003700'
tf_genes = ga.get_subjects_for_object(
    subject_taxon=DMEL,
    object=DNA_BINDING_TF,
)
# Display how many genes came back.
len(tf_genes)
Out[5]:
In [33]:
# Routine to go to GO and Monarch to fetch all annotations for a gene
def get_pubs_for_gene(g):
    """Return the set of PMID identifiers cited by associations for gene ``g``.

    Two ``ga.search_associations`` queries are issued with ``subject=g`` and
    ``rows=-1``:

    * an unrestricted one, where publications are read from each association's
      ``publications`` list of ``{'id': ...}`` records;
    * one with ``object_category='function'``, where publications are read
      from each association's ``reference`` list of identifier strings.

    Only identifiers starting with ``'PMID'`` are kept. Associations whose
    ``publications`` / ``reference`` entry is missing, ``None`` or empty are
    skipped rather than raising ``KeyError``.

    Parameters
    ----------
    g
        Gene identifier forwarded as ``subject`` (presumably a CURIE string
        such as the entries of ``tf_genes`` -- confirm against caller).

    Returns
    -------
    set
        Distinct PMID identifiers found across both queries.
    """
    pubs = set()

    # Monarch
    r = ga.search_associations(subject=g, rows=-1)
    for a in r['associations']:
        # `or ()` covers both an absent key and an explicit None value
        for p in a.get('publications') or ():
            pid = p.get('id')
            if pid and pid.startswith('PMID'):
                pubs.add(pid)

    # GO
    r = ga.search_associations(subject=g, rows=-1, object_category='function')
    for a in r['associations']:
        for ref in a.get('reference') or ():
            if ref and ref.startswith('PMID'):
                pubs.add(ref)

    return pubs
# Smoke test: publication count for the first TF gene
first_tf_gene = tf_genes[0]
len(get_pubs_for_gene(first_tf_gene))
Out[33]:
In [15]:
# Build (gene, number_of_publications) pairs for every TF gene.
# get_pubs_for_gene is called once per gene (two association searches each),
# so this cell can take a while on a long gene list.
# A comprehension is used so that no per-gene temporaries (in particular a
# global named `np`, the conventional numpy alias) linger in the notebook namespace.
pairs = [(gene, len(get_pubs_for_gene(gene))) for gene in tf_genes]
In [16]:
# Sanity check: keep only the publication counts and peek at the first five
vals = [n_pubs for _gene, n_pubs in pairs]
vals[:5]
Out[16]:
In [20]:
# Sanity check: TF genes for which zero publications were collected
tf_genes_with_no_pubs = [gene for gene, n_pubs in pairs if n_pubs == 0]
tf_genes_with_no_pubs
Out[20]:
In [21]:
# TF genes backed by fewer than 5 publications
[gene for gene, n_pubs in pairs if n_pubs < 5]
Out[21]:
In [23]:
import matplotlib.pyplot as plt
%matplotlib inline
In [24]:
# Histogram of publication counts across TF genes (40 bins)
fig, ax = plt.subplots()
ax.hist(vals, bins=40)
ax.set_xlabel('No of pubs')
ax.set_ylabel('No of genes')
plt.show()
In [27]:
# Save results: one "gene,number_of_publications" row per entry of `pairs`, no header row
import csv
with open('gene-pubs.csv', 'w', newline='') as csvfile:
    w = csv.writer(csvfile, delimiter=',')
    # writerows emits every (gene, count) pair in order without rebinding
    # notebook-level names such as `g` or `np`
    w.writerows(pairs)
In [ ]: