Distribution of publication count for Dmel TF genes

For each *Drosophila melanogaster* (Dmel) transcription factor (TF) gene, count the number of distinct curated publications (PMIDs), combining annotation data from GO and Monarch.



In [28]:

    
import ontobio.golr.golr_associations as ga



In [5]:

    
# Look up every Dmel gene annotated to the DNA-binding TF term;
# the constants hold the ontology term and the taxon used as query filters.
DNA_BINDING_TF = 'GO:0003700'
DMEL = 'NCBITaxon:7227'
tf_genes = ga.get_subjects_for_object(
    subject_taxon=DMEL,
    object=DNA_BINDING_TF,
)
# Display how many genes came back
len(tf_genes)









    Out[5]:





478



In [33]:

    
# Routine to go to GO and Monarch to fetch all annotations for a gene
def get_pubs_for_gene(g):
    """Return the set of PMID identifiers attached to associations for a gene.

    Two association searches are issued for ``g`` and their publication ids
    are merged into a single set, so a paper cited by both sources is
    counted once.

    Parameters
    ----------
    g : str
        Gene identifier passed as ``subject`` to ``ga.search_associations``
        (e.g. an entry of ``tf_genes``).

    Returns
    -------
    set of str
        Every identifier starting with ``'PMID'`` found in the
        ``publications`` entries of the first query or the ``reference``
        entries of the second. Other reference types are dropped.
    """
    pubs = set()

    # Monarch: unrestricted search; publications are dicts carrying an 'id'.
    monarch = ga.search_associations(subject=g, rows=-1)
    for assoc in monarch['associations']:
        # .get(...) or (): a record without the key (or with None) simply
        # contributes nothing instead of raising KeyError for the whole gene.
        for pub in assoc.get('publications') or ():
            pub_id = pub['id']
            if pub_id.startswith('PMID'):
                pubs.add(pub_id)

    # GO: function annotations; references are plain id strings.
    go = ga.search_associations(subject=g, rows=-1, object_category='function')
    for assoc in go['associations']:
        pubs.update(
            ref for ref in assoc.get('reference') or () if ref.startswith('PMID')
        )

    return pubs
    
# Spot check on one gene: number of distinct PMIDs found for the first TF gene
len(get_pubs_for_gene(tf_genes[0]))









    Out[33]:





140



In [15]:

    
# Build one (gene, publication count) tuple per TF gene.
# Kept as an explicit loop so tuples collected so far remain in `pairs`
# if a lookup fails part-way through.
pairs = []
for gene in tf_genes:
    pub_count = len(get_pubs_for_gene(gene))
    pairs.append((gene, pub_count))



In [16]:

    
# Pull out just the counts (same order as `pairs`) and peek at the first five
vals = [pub_count for _gene, pub_count in pairs]
vals[:5]









    Out[16]:





[140, 97, 34, 107, 110]



In [20]:

    
# Genes for which neither query yielded any PMID
tf_genes_with_no_pubs = [
    gene
    for gene, pub_count in pairs
    if pub_count == 0
]
tf_genes_with_no_pubs









    Out[20]:





['FB:FBgn0085253']



In [21]:

    
# Sparsely curated genes: those with strictly fewer than 5 publications
[gene for gene, pub_count in pairs if pub_count < 5]









    Out[21]:





['FB:FBgn0038626',
 'UniProtKB:A0A0B4LH09',
 'FB:FBgn0024975',
 'FB:FBgn0025185',
 'FB:FBgn0028647',
 'FB:FBgn0029173',
 'FB:FBgn0029928',
 'FB:FBgn0030008',
 'FB:FBgn0030012',
 'FB:FBgn0032694',
 'FB:FBgn0033449',
 'FB:FBgn0033627',
 'FB:FBgn0037317',
 'FB:FBgn0039078',
 'FB:FBgn0039329',
 'FB:FBgn0039937',
 'FB:FBgn0052006',
 'FB:FBgn0053213',
 'FB:FBgn0053557',
 'FB:FBgn0085253',
 'FB:FBgn0263511',
 'UniProtKB:A0A0B4K653',
 'UniProtKB:A0A0B4KGA3',
 'UniProtKB:A0A0B4KGM5',
 'UniProtKB:A0A0B4KGW2',
 'UniProtKB:A0A0B4KGW6',
 'UniProtKB:A0A0B4KHC8',
 'UniProtKB:A0A0B4LGG8']



In [23]:

    
import matplotlib.pyplot as plt
%matplotlib inline



In [24]:

    
# Histogram of per-gene publication counts (40 bins), drawn on the current axes
axes = plt.gca()
axes.hist(vals, bins=40)
axes.set_xlabel('No of pubs')
axes.set_ylabel('No of genes')
plt.show()



In [27]:

    
# Write the (gene, publication count) pairs to a header-less two-column CSV
import csv
with open('gene-pubs.csv', 'w', newline='') as csvfile:
    # One row per pair, in the order they were collected
    csv.writer(csvfile, delimiter=',').writerows(pairs)



In [ ]: