Using candidates from http://ohnologs.curie.fr/
Question: Are ohnologs more likely to be protein complexes?
In [6]:
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact
import json
In [8]:
def read_gid_to_chr(infile):
gid_to_chr = {}
with open(infile, "r") as handle:
for line in handle:
if line.startswith("Ensembl"):
continue
elif line.startswith("ENS"):
line = line.rstrip().split()
gid_to_chr[line[0]] = line[1]
return gid_to_chr
gid_to_chr = read_gid_to_chr("Gallus_gallus.75.gid.chr.tsv")
pid_to_chr = read_gid_to_chr("Gallus_gallus.75.pid.chr.tsv")
# Get a list of Z linked gene IDs
Z_genes = [g for g, pos in gid_to_chr.items() if pos == "Z"]
print("Genes on the Z: ",len(Z_genes))
In [12]:
def read_protein_complex_json(json_infile):
with open(json_infile, "r") as infile:
data = json.load(infile)
return data
protein_complexes = read_protein_complex_json("2015-05-14-ChickenComplexes.json")
print("Number of protein complexes:", len(protein_complexes))
In [13]:
Z_protein_complexes = []
all_protein_complexes = []
for c in protein_complexes:
chicken_orthologs = protein_complexes[c]["chicken_orthologs"]
if chicken_orthologs:
for ortholog in chicken_orthologs:
gid = ortholog["id"]
pid = ortholog["protein_id"]
if pid_to_chr[pid] == "Z":
Z_protein_complexes.append(gid)
all_protein_complexes.append(gid)
Z_protein_complexes = list(set(Z_protein_complexes))
all_protein_complexes = list(set(all_protein_complexes))
print("Number of unique protein complex genes on the Z chromosome:", len(Z_protein_complexes))
print("Number of unique protein complex genes in the chicken genome:", len(all_protein_complexes))
In [14]:
def read_ohnolog_pairs(infile):
ohnolog_pairs = pd.read_csv(infile, sep='\t')
return ohnolog_pairs
def get_categories(ohnologs, Z_genes, protein_complexes):
results={"COMPLEX_OHNO": [],
"NO_COMPLEX_OHNO": [],
"COMPLEX_NO_OHNO": [],
"NO_COMPLEX_NO_OHNO": []
}
for g in Z_genes:
if g in protein_complexes:
if g in ohnologs:
results["COMPLEX_OHNO"].append(g)
else:
results["COMPLEX_NO_OHNO"].append(g)
else:
if g in ohnologs:
results["NO_COMPLEX_OHNO"].append(g)
else:
results["NO_COMPLEX_NO_OHNO"].append(g)
print("OHNO NO-OHNO")
print("COMPLEX", len(results["COMPLEX_OHNO"]), len(results["COMPLEX_NO_OHNO"]))
print("NO-COMPLEX",len(results["NO_COMPLEX_OHNO"]), len(results["NO_COMPLEX_NO_OHNO"]))
print (fisher_exact([[len(results["COMPLEX_OHNO"]),len(results["COMPLEX_NO_OHNO"])],
[len(results["NO_COMPLEX_OHNO"]), len(results["NO_COMPLEX_NO_OHNO"])]]))
print("------")
return results
ohnolog_curie_pairs = read_ohnolog_pairs("CHICKEN.Pairs.Relaxed.2R.txt")
ohno1 = list(ohnolog_curie_pairs["Ohnolog-1 Id"])
ohno2 = list(ohnolog_curie_pairs["Ohnolog-2 Id"])
ohnolog_curie_pairs_relaxed_ids = list(set(ohno1+ohno2))
print(len(ohnolog_curie_pairs_relaxed_ids))
Z_ohnologs_curie = []
for gid in ohnolog_curie_pairs_relaxed_ids:
# Some genes in the DB are not in this genome release Ensembl 75
if gid in gid_to_chr:
if gid_to_chr[gid] == "Z":
Z_ohnologs_curie.append(gid)
print(len(Z_ohnologs_curie))
dc_complex = get_categories(Z_ohnologs_curie, Z_genes, Z_protein_complexes)
dc_complex = get_categories(ohnolog_curie_pairs_relaxed_ids, gid_to_chr.keys(), all_protein_complexes)
In [19]:
print 410./(1914+410)
print 1931./(1931+12853)
In [1]:
736./(4135+736)
Out[1]:
In [2]:
1605./(10632+1605)
Out[2]:
In [ ]: