OHNOLOG DB candidates

Using candidates from http://ohnologs.curie.fr/

Question: Are ohnologs more likely to be protein complexes?


In [6]:
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact
import json

In [8]:
def read_gid_to_chr(infile):
    gid_to_chr = {}
    with open(infile, "r") as handle:
        for line in handle:
            if line.startswith("Ensembl"):
                continue
            elif line.startswith("ENS"):
                line = line.rstrip().split()
                gid_to_chr[line[0]] = line[1]
    return gid_to_chr

gid_to_chr = read_gid_to_chr("Gallus_gallus.75.gid.chr.tsv")
pid_to_chr = read_gid_to_chr("Gallus_gallus.75.pid.chr.tsv")

# Get a list of Z linked gene IDs
Z_genes = [g for g, pos in gid_to_chr.items() if pos == "Z"]
print("Genes on the Z: ",len(Z_genes))


Genes on the Z:  859

In [12]:
def read_protein_complex_json(json_infile):
    with open(json_infile, "r") as infile:  
        data = json.load(infile)
        return data

protein_complexes = read_protein_complex_json("2015-05-14-ChickenComplexes.json")
print("Number of protein complexes:", len(protein_complexes))


Number of protein complexes: 1991

In [13]:
Z_protein_complexes = []
all_protein_complexes = []
for c in protein_complexes:
    chicken_orthologs = protein_complexes[c]["chicken_orthologs"]
    if chicken_orthologs:
        for ortholog in chicken_orthologs:
            gid = ortholog["id"]
            pid = ortholog["protein_id"]
            if pid_to_chr[pid] == "Z":
                Z_protein_complexes.append(gid)
            all_protein_complexes.append(gid)
            
Z_protein_complexes = list(set(Z_protein_complexes))
all_protein_complexes = list(set(all_protein_complexes))
print("Number of unique protein complex genes on the Z chromosome:", len(Z_protein_complexes))
print("Number of unique protein complex genes in the chicken genome:", len(all_protein_complexes))


Number of unique protein complex genes on the Z chromosome: 103
Number of unique protein complex genes in the chicken genome: 2341

In [14]:
def read_ohnolog_pairs(infile):
    ohnolog_pairs = pd.read_csv(infile,  sep='\t')
    return ohnolog_pairs

def get_categories(ohnologs, Z_genes, protein_complexes):
    results={"COMPLEX_OHNO": [],
             "NO_COMPLEX_OHNO": [],
             "COMPLEX_NO_OHNO": [],
             "NO_COMPLEX_NO_OHNO": []
            }
    
    for g in Z_genes:
        
        if g in protein_complexes:
            if g in ohnologs:
                results["COMPLEX_OHNO"].append(g)
            
            else:
                results["COMPLEX_NO_OHNO"].append(g)
            
        else:
            if g in ohnologs:
                results["NO_COMPLEX_OHNO"].append(g)
            else:
                results["NO_COMPLEX_NO_OHNO"].append(g)
                        
    print("OHNO  NO-OHNO")
    print("COMPLEX", len(results["COMPLEX_OHNO"]), len(results["COMPLEX_NO_OHNO"]))
    print("NO-COMPLEX",len(results["NO_COMPLEX_OHNO"]), len(results["NO_COMPLEX_NO_OHNO"]))
    print (fisher_exact([[len(results["COMPLEX_OHNO"]),len(results["COMPLEX_NO_OHNO"])],
                        [len(results["NO_COMPLEX_OHNO"]), len(results["NO_COMPLEX_NO_OHNO"])]]))
    print("------")
    return results

ohnolog_curie_pairs = read_ohnolog_pairs("CHICKEN.Pairs.Relaxed.2R.txt")
ohno1 = list(ohnolog_curie_pairs["Ohnolog-1 Id"])
ohno2 = list(ohnolog_curie_pairs["Ohnolog-2 Id"])
ohnolog_curie_pairs_relaxed_ids = list(set(ohno1+ohno2))
print(len(ohnolog_curie_pairs_relaxed_ids))

Z_ohnologs_curie = []
for gid in ohnolog_curie_pairs_relaxed_ids:
    # Some genes in the DB are not in this genome release Ensembl 75
    if gid in gid_to_chr:
        if gid_to_chr[gid] == "Z":
            Z_ohnologs_curie.append(gid)
print(len(Z_ohnologs_curie))

dc_complex = get_categories(Z_ohnologs_curie, Z_genes, Z_protein_complexes)
dc_complex = get_categories(ohnolog_curie_pairs_relaxed_ids, gid_to_chr.keys(), all_protein_complexes)


5228
223
OHNO  NO-OHNO
COMPLEX 28 75
NO-COMPLEX 195 561
(1.074051282051282, 0.81069774426237795)
------
OHNO  NO-OHNO
COMPLEX 736 1605
NO-COMPLEX 4135 10632
(1.1790771734339862, 0.00066956883057440195)
------

In [19]:
print 410./(1914+410)
print 1931./(1931+12853)


0.176419965577
0.130614177489

In [1]:
736./(4135+736)


Out[1]:
0.15109833709710532

In [2]:
1605./(10632+1605)


Out[2]:
0.13115959794067172

In [ ]: