In [57]:
import sys
import os
from collections import defaultdict
from scipy import stats as stats
from scipy.stats import fisher_exact, mannwhitneyu, wilcoxon, ranksums, chi2_contingency, chisquare
# Nice dataframes
import pandas as pd
import numpy as np
import random
In [58]:
from collections import defaultdict
def filter_vcf(source, expressed_genes):
SNPs = defaultdict(list)
try:
with open(source, "r") as infile:
for line in infile:
if line.startswith("##"):
continue
line = line.rstrip().split()
GID = line[22][1:-2]
if GID in expressed_genes:
if is_biallelic(line):
if is_valid_base(line[3]) and is_valid_base(line[4]):
SNPs[GID].append(line)
return SNPs
except IOError:
print "File does not exit!"
def is_biallelic(line):
"Only include biallelic sites"
# If ALT and REF has exactly one allele pass
# An allele is a string with length 1 eg A or G
if (len(line[3]) == 1 and len(line[4]) == 1):
return True
# In any other case return false
else:
# print line[3], line[4]
return False
def is_valid_base(b):
# Make sure letters are capital
# Has no impact as varscan uses only capitals
if b.upper() in ["A", "C", "G", "T"]:
return True
else:
return False
def read_call(vcf_line, sample):
'''Field format is:
GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR
0 1 2 3 4 5 6 7 8 9 10 11 12 13
'''
call1 = vcf_line[sample]
POS = int(vcf_line[1])
if (call1.split(":")[4] == "." or call1.split(":")[5] == "."):
return 0.0, 0.0, POS, DP
RD = float(call1.split(":")[4])
AD = float(call1.split(":")[5])
DP = float(call1.split(":")[3])
return RD, AD, POS, DP
def read_quality(vcf_line, sample):
'''Field format is:
GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR
0 1 2 3 4 5 6 7 8 9 10 11 12 13
'''
call1 = vcf_line[sample]
RBQ = float(call1.split(":")[8])
ABQ = float(call1.split(":")[9])
return RBQ, ABQ
def read_freq(vcf_line, sample):
'''Field format is:
GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR
0 1 2 3 4 5 6 7 8 9 10 11 12 13
'''
call1 = vcf_line[sample]
freq = float(call1.split(":")[6].strip("%"))/100
# print call1
return freq
def binom_test_ase(rd, ad):
'''
RD: Depth of reference-supporting bases (reads1)
AD: Depth of variant-supporting bases (reads2)
This is the chance of observing either more successes, or fewer successes, in all trials.
'''
# Two tailed binomial test
return stats.binom_test(rd, (rd+ad), p=0.5)
def random_sample(data, num_samples):
"Randomly choose num_sample items from data"
return random.sample(data, num_samples)
def anti_log2(i):
return np.power(2, i)
def log2(i):
return np.log2(i)
In [59]:
# Strictly filtered SNPS, where a call is valid in all samples
input_vcf = ["../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/soma100/HEART_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf",
"../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/soma100/LIVER_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf",
"../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/soma100/SPLEEN_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf",
"../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/gonad51/GONAD_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf",]
de_gonad = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-GONAD-detags-edgeR.csv")
de_spleen = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-SPLEEN-detags-edgeR.csv")
de_heart = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-HEART-detags-edgeR.csv")
de_liver = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-LIVER-detags-edgeR.csv")
rpkm_gonad = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-GONAD-RPKM-NORM-log2-edgeR.csv")
rpkm_spleen = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-SPLEEN-RPKM-NORM-log2-edgeR.csv")
rpkm_heart = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-HEART-RPKM-NORM-log2-edgeR.csv")
rpkm_liver = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-LIVER-RPKM-NORM-log2-edgeR.csv")
In [60]:
de_gonad.head()
Out[60]:
In [61]:
rpkm_gonad.head()
Out[61]:
In [62]:
def read_gid_to_chr(infile):
'''Reads in a file obtained from Ensembl mapping each gene to its chromosomal location'''
gid_to_chr = {}
with open(infile, "rb") as handle:
for line in handle:
if line.startswith("Ensembl"):
continue
else:
line = line.rstrip().split()
gid_to_chr[line[0]] = line[1]
return gid_to_chr
gid_to_chr = read_gid_to_chr("/data/ucbtzim/ohnolog-dc/dc-comparison/Gallus_gallus.75.gid.chr.proteincoding.tsv")
# Get a list of Z linked gene IDs
Z_genes = [g for g, pos in gid_to_chr.items() if pos == "Z"]
print "Number of genes (protein coding) on the Z chromosome:", len(Z_genes)
In [63]:
comb_gonad = pd.concat([rpkm_gonad, de_gonad], axis=1)
comb_heart = pd.concat([rpkm_heart, de_heart], axis=1)
comb_liver = pd.concat([rpkm_liver, de_liver], axis=1)
comb_spleen = pd.concat([rpkm_spleen, de_spleen], axis=1)
print comb_gonad.shape
print comb_heart.shape
# Liver has one female sample less
print comb_liver.shape
print comb_spleen.shape
In [64]:
# No expression on chromosome 32, excluded
chicken_chromosomes = [str(i) for i in range(1, 29)] + ["Z"]
print "Valid chromosomes: ", chicken_chromosomes
comb_gonad = comb_gonad[comb_gonad.index.map(lambda x: gid_to_chr[x] in chicken_chromosomes)]
comb_heart = comb_heart[comb_heart.index.map(lambda x: gid_to_chr[x] in chicken_chromosomes)]
comb_liver = comb_liver[comb_liver.index.map(lambda x: gid_to_chr[x] in chicken_chromosomes)]
comb_spleen = comb_spleen[comb_spleen.index.map(lambda x: gid_to_chr[x] in chicken_chromosomes)]
print comb_gonad.shape
print comb_heart.shape
print comb_liver.shape
print comb_spleen.shape
In [65]:
def filter_cpm(cpm, mincpm):
'''Filter out lowly expressed genes based on a fixed minimum CPM threshold'''
filtered = cpm[cpm["logCPM"] > mincpm]
return filtered
# Beware average CPM is log2 transformed.
# For avgCPM of 2 use 1 | log2(2) = 1
comb_gonad_min2 = filter_cpm(comb_gonad, 1)
comb_heart_min2 = filter_cpm(comb_heart, 1)
comb_liver_min2 = filter_cpm(comb_liver, 1)
comb_spleen_min2 = filter_cpm(comb_spleen, 1)
print comb_gonad_min2.shape
print comb_heart_min2.shape
print comb_liver_min2.shape
print comb_spleen_min2.shape
In [66]:
def split_dataframe(data, genes):
'''Returns two dataframes. The first return value contains all rows that did match the genes.
The second dataframe contains only genes that did NOT match.
'''
match = data[data.index.isin(genes)]
nomatch = data[~data.index.isin(genes)]
return match, nomatch
# Get a list of Z linked gene IDs
Z_genes = [g for g, pos in gid_to_chr.items() if pos == "Z"]
print "Number of genes (protein coding + non coding) on the Z chromosome:", len(Z_genes)
print ""
# Filter the expression so it only contains Z expression
Z_comb_gonad_min2, AUTO_comb_gonad_min2 = split_dataframe(comb_gonad_min2, Z_genes)
Z_comb_heart_min2, AUTO_comb_heart_min2 = split_dataframe(comb_heart_min2, Z_genes)
Z_comb_liver_min2, AUTO_comb_liver_min2 = split_dataframe(comb_liver_min2, Z_genes)
Z_comb_spleen_min2, AUTO_comb_spleen_min2 = split_dataframe(comb_spleen_min2, Z_genes)
In [67]:
expressed_Z_genes = { "HEART":Z_comb_heart_min2.index,
"LIVER":Z_comb_liver_min2.index,
"SPLEEN":Z_comb_spleen_min2.index,
"GONAD":Z_comb_gonad_min2.index}
for i in expressed_Z_genes:
print i, len(expressed_Z_genes[i])
# print expressed_Z_genes["HEART"]
In [68]:
Z_tissue_snps = {}
for i, f in enumerate(input_vcf):
tissue = f.split("/")[-1].split("_")[0]
print tissue
SNPs = filter_vcf(f, expressed_Z_genes[tissue])
Z_tissue_snps[tissue] = SNPs
In [69]:
for t in Z_tissue_snps:
print t, len(Z_tissue_snps[t])
In [70]:
HEART 171
GONAD 122
LIVER 135
SPLEEN 269
In [71]:
for t in Z_tissue_snps:
c = 0
for gene in Z_tissue_snps[t]:
c += len(Z_tissue_snps[t][gene])
print t, c
In [72]:
HEART 503
GONAD 405
LIVER 439
SPLEEN 826
In [73]:
# Genes taken from http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1003635
# Not all IDs could be mapped
livernois_genes = ["ENSGALG00000010164", # SMARCA2
"ENSGALG00000010158", # Kank1
"ENSGALG00000010160", # DMRT1
"ENSGALG00000010161", # DMRT3
"ENSGALG00000015058", # PTPRD
"ENSGALG00000015105", # PSIP1
"ENSGALG00000015101", # BNC2
"ENSGALG00000015078", # MLLT3
"ENSGALG00000015071", # FOCAD
"ENSGALG00000014690", # SLC12A2
"ENSGALG00000017706", # RASA1
"ENSGALG00000001869", # LRRN6C, LINGO2
"ENSGALG00000002162", # ACO1
"ENSGALG00000002187", # HSD17B4
"ENSGALG00000002294"] # SEMA6A
livernois_dict = defaultdict(list)
for t in Z_tissue_snps:
for gene in Z_tissue_snps[t]:
if gene in livernois_genes:
livernois_dict[gene].append(t)
print len(livernois_dict)
for g in livernois_dict:
print g, livernois_dict[g]
Note:
Instead of calculating the reference allele fraction ourselves, we could also use the varscan2 supplied "Variant allele frequency" in this field:
##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency">
We can use the read_freq function varscan_freq = read_freq(snp,i) and filter like this:
for i in [9, 10, 11, 12]:
RD, AD, POS = read_call(snp, i)
varscan_freq = read_freq(snp,i)
# Check if heterozygote
if RD != 0 and AD != 0:
if binom_test_ase(RD, AD) < 0.05:
if varscan_freq >= 0.7:
valid.append(gene)
if varscan_freq <= 0.3:
valid.append(gene)
The results are identical, even though varscan calculates the allele frequency a bit differently
In [74]:
Z_ASE_genes = defaultdict(list)
Z_NOASE_genes = defaultdict(list)
for t in Z_tissue_snps:
for gene in Z_tissue_snps[t]:
# There may be multiple SNPs per gene
for snp in Z_tissue_snps[t][gene]:
valid = []
rd_ad = []
for i in [9, 10, 11, 12]:
RD, AD, POS, DP = read_call(snp, i)
RBQ, ABQ = read_quality(snp, i)
# Check if heterozygote
if (RD != 0 and AD != 0):
#Check for quality
if (RBQ >= 20 and ABQ >=20):
ref_allele_fraction = RD/(RD+AD)
# print type(RD), type(AD), type(ref_allele_fraction), ref_allele_fraction
if binom_test_ase(RD, AD) < 0.05:
if ref_allele_fraction >= 0.7:
valid.append(gene)
rd_ad.append((RD,AD))
if ref_allele_fraction <= 0.3:
valid.append(gene)
rd_ad.append((RD,AD))
# If all samples pass the check the gene is valid with ASE
if len(valid) == 4:
Z_ASE_genes[t].append(gene)
# Other SNPs may not pass the threshold, remove those from NOASE list
# Via http://stackoverflow.com/questions/4915920/how-to-delete-an-item-in-a-list-if-it-exists-python
while gene in Z_NOASE_genes[t]:
Z_NOASE_genes[t].remove(gene)
else:
if gene not in Z_NOASE_genes[t] and gene not in Z_ASE_genes[t]:
Z_NOASE_genes[t].append(gene)
for t in Z_ASE_genes:
print t, len(set(Z_ASE_genes[t])), set(Z_ASE_genes[t])
print "====="
for t in Z_NOASE_genes:
print t, len(set(Z_NOASE_genes[t]))
In [48]:
HEART 7 set(['ENSGALG00000014789', 'ENSGALG00000013809', 'ENSGALG00000015617', 'ENSGALG00000021340', 'ENSGALG00000000428', 'ENSGALG00000026553', 'ENSGALG00000025842'])
GONAD 13 set(['ENSGALG00000027891', 'ENSGALG00000015082', 'ENSGALG00000027128', 'ENSGALG00000010158', 'ENSGALG00000003726', 'ENSGALG00000015617', 'ENSGALG00000014857', 'ENSGALG00000027487', 'ENSGALG00000028328', 'ENSGALG00000000428', 'ENSGALG00000025842', 'ENSGALG00000014942', 'ENSGALG00000028137'])
LIVER 7 set(['ENSGALG00000015544', 'ENSGALG00000008229', 'ENSGALG00000005423', 'ENSGALG00000015617', 'ENSGALG00000021340', 'ENSGALG00000012613', 'ENSGALG00000000428'])
SPLEEN 10 set(['ENSGALG00000015216', 'ENSGALG00000015082', 'ENSGALG00000005814', 'ENSGALG00000014789', 'ENSGALG00000027154', 'ENSGALG00000015617', 'ENSGALG00000014857', 'ENSGALG00000005433', 'ENSGALG00000000428', 'ENSGALG00000025842'])
=====
HEART 164
GONAD 109
LIVER 128
SPLEEN 259
In [75]:
# There should be no overlap
for t in Z_ASE_genes:
print set(Z_ASE_genes[t]).intersection(Z_NOASE_genes[t])
In [76]:
expressed_AUTO_genes = { "HEART":AUTO_comb_heart_min2.index,
"LIVER":AUTO_comb_liver_min2.index,
"SPLEEN":AUTO_comb_spleen_min2.index,
"GONAD":AUTO_comb_gonad_min2.index}
for i in expressed_AUTO_genes:
print i, len(expressed_AUTO_genes[i])
# print expressed_AUTO_genes["HEART"]
In [49]:
HEART 9478
GONAD 11340
LIVER 8938
SPLEEN 10318
In [80]:
AUTO_tissue_snps = {}
for i, f in enumerate(input_vcf):
tissue = f.split("/")[-1].split("_")[0]
print tissue, f
SNPs = filter_vcf(f, expressed_AUTO_genes[tissue])
AUTO_tissue_snps[tissue] = SNPs
In [81]:
for t in AUTO_tissue_snps:
print t, len(AUTO_tissue_snps[t])
In [81]:
In [82]:
for t in AUTO_tissue_snps:
c = 0
for gene in AUTO_tissue_snps[t]:
c += len(AUTO_tissue_snps[t][gene])
print t, c
# Z SNPs
for t in Z_tissue_snps:
c = 0
for gene in Z_tissue_snps[t]:
c += len(Z_tissue_snps[t][gene])
print t, c
In [53]:
HEART 23735
GONAD 11378
LIVER 17812
SPLEEN 34679
HEART 503
GONAD 405
LIVER 439
SPLEEN 826
Combination is used as number in supplemental!
In [83]:
AUTO_ASE_snps= defaultdict(list)
AUTO_ASE_genes= defaultdict(list)
AUTO_NOASE_genes = defaultdict(list)
for t in AUTO_tissue_snps:
for gene in AUTO_tissue_snps[t]:
# There may be multiple SNPs per gene
for snp in AUTO_tissue_snps[t][gene]:
valid = []
rd_ad = []
for i in [9, 10, 11, 12]:
RD, AD, POS, DP = read_call(snp, i)
RBQ, ABQ = read_quality(snp, i)
varscan_freq = read_freq(snp,i)
# Check if heterozygote
if (RD != 0 and AD != 0):
#Check for quality
if (RBQ >= 20 and ABQ >=20):
ref_allele_fraction = RD/(RD+AD)
if binom_test_ase(RD, AD) < 0.05:
if ref_allele_fraction >= 0.7:
valid.append(gene)
rd_ad.append((RD,AD))
if ref_allele_fraction <= 0.3:
valid.append(gene)
rd_ad.append((RD,AD))
# If all samples pass the check the gene is valid with ASE
if len(valid) == 4:
snp = {"gene": gene, "ref_allele_fraction": ref_allele_fraction, "p-value": binom_test_ase(RD, AD), "pos": POS}
AUTO_ASE_snps[t].append(snp)
AUTO_ASE_genes[t].append(gene)
# Other SNPs may not pass the threshold, remove those from NOASE list
while gene in AUTO_NOASE_genes[t]:
AUTO_NOASE_genes[t].remove(gene)
else:
if gene not in AUTO_NOASE_genes[t] and gene not in AUTO_ASE_genes[t]:
AUTO_NOASE_genes[t].append(gene)
In [85]:
for t in AUTO_ASE_genes:
print t, len(set(AUTO_ASE_genes[t]))
print "====="
for t in AUTO_NOASE_genes:
print t, len(AUTO_NOASE_genes[t])
In [86]:
HEART 5138
GONAD 2843
LIVER 4114
SPLEEN 6675
In [87]:
for t in AUTO_tissue_snps:
print t, len(AUTO_tissue_snps[t])
In [88]:
# Use R via rpy2
import rpy2.robjects as R
def fdr_snps(snps):
'''Corrects the binomal tests using false discovery rate'''
lib = R.r["library"]
lib("qvalue")
corrected = []
p_values = [snp["p-value"] for snp in snps]
padjust = R.r["p.adjust"]
args = {"p": R.FloatVector(p_values), "method": "bonferroni"}
adjusted = list(padjust(**args))
for i, p in enumerate(adjusted):
snps[i]["padjust"] = p
return snps
for t in AUTO_ASE_snps:
AUTO_ASE_snps[t] = fdr_snps(AUTO_ASE_snps[t])
In [89]:
AUTO_ASE_genes_fdr = defaultdict(list)
for t in AUTO_ASE_snps:
for snp in AUTO_ASE_snps[t]:
if snp["padjust"] < 0.05:
AUTO_ASE_genes_fdr[t].append(snp["gene"])
AUTO_ASE_genes_fdr[t] = list(set(AUTO_ASE_genes_fdr[t]))
for t in AUTO_ASE_genes_fdr:
drop = set(AUTO_ASE_genes[t]).difference(AUTO_ASE_genes_fdr[t])
for g in drop:
AUTO_NOASE_genes[t].append(g)
AUTO_NOASE_genes[t] = list(set(AUTO_NOASE_genes[t]))
# There should be no overlap
for t in AUTO_ASE_genes_fdr:
print set(AUTO_ASE_genes_fdr[t]).intersection(AUTO_NOASE_genes[t])
In [90]:
for t in AUTO_ASE_genes_fdr:
print t, len(AUTO_ASE_genes_fdr[t])
print "====="
for t in AUTO_NOASE_genes:
print t, len(AUTO_NOASE_genes[t])
In [91]:
for t in Z_ASE_genes:
print t, len(set(Z_ASE_genes[t]))
print "====="
for t in Z_NOASE_genes:
print t, len(set(Z_NOASE_genes[t]))
Numbers excluding tri-allic SNPs
In [92]:
# ASE no ASE
print "Fisher's exact test (spleen auto vs spleen Z): ", fisher_exact([[263,6412],
[10, 259]])
print "Fisher's exact test (heart auto vs heart Z): ", fisher_exact([[212,4926],
[7, 164]])
print "Fisher's exact test (liver auto vs liver Z): ", fisher_exact([[167,3947],
[7, 128]])
print "Fisher's exact test (gonad auto vs gonad Z): ", fisher_exact([[157,2686],
[13, 109]])
Comparison between autosomes and Z chromosome in the degree of ASE:
In [ ]: