In [57]:
    
import sys
import os
from collections import defaultdict
from scipy import stats as stats
from scipy.stats import fisher_exact, mannwhitneyu, wilcoxon, ranksums, chi2_contingency, chisquare
# Nice dataframes
import pandas as pd
import numpy as np
import random
    
In [58]:
    
from collections import defaultdict
def filter_vcf(source, expressed_genes):
    SNPs = defaultdict(list)
    try:
        with open(source, "r") as infile:
            for line in infile:
                if line.startswith("##"):
                    continue
                line = line.rstrip().split()
                GID = line[22][1:-2]
                if GID in expressed_genes:
                    if is_biallelic(line):
                        if is_valid_base(line[3]) and is_valid_base(line[4]):
                            SNPs[GID].append(line)
            return SNPs
    except IOError:
        print "File does not exit!"
def is_biallelic(line):
    "Only include biallelic sites"
    # If ALT and REF has exactly one allele pass
    # An allele is a string with length 1 eg A or G
    if (len(line[3]) == 1 and len(line[4]) == 1):
        return True
    # In any other case return false
    else:
#         print line[3], line[4]
        return False
def is_valid_base(b):
    # Make sure letters are capital
    # Has no impact as varscan uses only capitals
    if b.upper() in ["A", "C", "G", "T"]:
        return True
    else:
        return False
        
def read_call(vcf_line, sample):
    '''Field format is:
        GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR
        0  1  2   3  4  5  6    7    8   9   10  11  12  13
    '''
    call1 = vcf_line[sample]
    POS = int(vcf_line[1])
    if (call1.split(":")[4] == "." or call1.split(":")[5] == "."):
        return 0.0, 0.0, POS, DP
    RD = float(call1.split(":")[4])
    AD = float(call1.split(":")[5])
    DP = float(call1.split(":")[3])
    return RD, AD, POS, DP
def read_quality(vcf_line, sample):
    '''Field format is:
        GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR
        0  1  2   3  4  5  6    7    8   9   10  11  12  13
    '''
    call1 = vcf_line[sample]
    RBQ = float(call1.split(":")[8])
    ABQ = float(call1.split(":")[9])
    return RBQ, ABQ
    
def read_freq(vcf_line, sample):
    '''Field format is:
        GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR
        0  1  2   3  4  5  6    7    8   9   10  11  12  13
    '''
    call1 = vcf_line[sample]
    freq = float(call1.split(":")[6].strip("%"))/100
#     print call1
    return freq
def binom_test_ase(rd, ad):
    '''
    RD: Depth of reference-supporting bases (reads1)
    AD: Depth of variant-supporting bases (reads2)
    This is the chance of observing either more successes, or fewer successes, in all trials.
    '''
    # Two tailed binomial test
    return stats.binom_test(rd, (rd+ad), p=0.5)
def random_sample(data, num_samples):
    "Randomly choose num_sample items from data"
    return random.sample(data, num_samples)
def anti_log2(i):
    return np.power(2, i)
def log2(i):
    return np.log2(i)
    
In [59]:
    
# Strictly filtered SNPS, where a call is valid in all samples
input_vcf = ["../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/soma100/HEART_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf",
               "../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/soma100/LIVER_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf",
               "../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/soma100/SPLEEN_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf",
               "../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/gonad51/GONAD_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf",]
de_gonad = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-GONAD-detags-edgeR.csv")
de_spleen = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-SPLEEN-detags-edgeR.csv")
de_heart = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-HEART-detags-edgeR.csv")
de_liver = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-LIVER-detags-edgeR.csv")
rpkm_gonad = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-GONAD-RPKM-NORM-log2-edgeR.csv")
rpkm_spleen = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-SPLEEN-RPKM-NORM-log2-edgeR.csv")
rpkm_heart = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-HEART-RPKM-NORM-log2-edgeR.csv")
rpkm_liver = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-LIVER-RPKM-NORM-log2-edgeR.csv")
    
In [60]:
    
de_gonad.head()
    
    Out[60]:
In [61]:
    
rpkm_gonad.head()
    
    Out[61]:
In [62]:
    
def read_gid_to_chr(infile):
    '''Reads in a file obtained from Ensembl mapping each gene to its chromosomal location'''
    gid_to_chr = {}
    with open(infile, "rb") as handle:
        for line in handle:
            if line.startswith("Ensembl"):
                continue
            else:
                line = line.rstrip().split()
                gid_to_chr[line[0]] = line[1]
    return gid_to_chr
gid_to_chr = read_gid_to_chr("/data/ucbtzim/ohnolog-dc/dc-comparison/Gallus_gallus.75.gid.chr.proteincoding.tsv")
# Get a list of Z linked gene IDs
Z_genes = [g for g, pos in gid_to_chr.items() if pos == "Z"]
print "Number of genes (protein coding) on the Z chromosome:", len(Z_genes)
    
    
In [63]:
    
comb_gonad = pd.concat([rpkm_gonad, de_gonad], axis=1)
comb_heart = pd.concat([rpkm_heart, de_heart], axis=1)
comb_liver = pd.concat([rpkm_liver, de_liver], axis=1)
comb_spleen = pd.concat([rpkm_spleen, de_spleen], axis=1)
print comb_gonad.shape
print comb_heart.shape
# Liver has one female sample less
print comb_liver.shape
print comb_spleen.shape
    
    
In [64]:
    
# No expression on chromosome 32, excluded
chicken_chromosomes = [str(i) for i in range(1, 29)] + ["Z"]
print "Valid chromosomes: ", chicken_chromosomes
comb_gonad = comb_gonad[comb_gonad.index.map(lambda x: gid_to_chr[x] in chicken_chromosomes)]
comb_heart = comb_heart[comb_heart.index.map(lambda x: gid_to_chr[x] in chicken_chromosomes)]
comb_liver = comb_liver[comb_liver.index.map(lambda x: gid_to_chr[x] in chicken_chromosomes)]
comb_spleen = comb_spleen[comb_spleen.index.map(lambda x: gid_to_chr[x] in chicken_chromosomes)]
print comb_gonad.shape
print comb_heart.shape
print comb_liver.shape
print comb_spleen.shape
    
    
In [65]:
    
def filter_cpm(cpm, mincpm):
    '''Filter out lowly expressed genes based on a fixed minimum CPM threshold'''
    filtered = cpm[cpm["logCPM"] > mincpm]
    return filtered
# Beware average CPM is log2 transformed.
# For avgCPM of 2 use 1 | log2(2) = 1
comb_gonad_min2 = filter_cpm(comb_gonad, 1)
comb_heart_min2 = filter_cpm(comb_heart, 1)
comb_liver_min2 = filter_cpm(comb_liver, 1)
comb_spleen_min2 = filter_cpm(comb_spleen, 1)
print comb_gonad_min2.shape
print comb_heart_min2.shape
print comb_liver_min2.shape
print comb_spleen_min2.shape
    
    
In [66]:
    
def split_dataframe(data, genes):
    '''Returns two dataframes. The first return value contains all rows that did match the genes.
       The second dataframe contains only genes that did NOT match.
    '''
    match = data[data.index.isin(genes)]
    nomatch = data[~data.index.isin(genes)]
    
    return match, nomatch
# Get a list of Z linked gene IDs
Z_genes = [g for g, pos in gid_to_chr.items() if pos == "Z"]
print "Number of genes (protein coding + non coding) on the Z chromosome:", len(Z_genes)
print ""
# Filter the expression so it only contains Z expression
Z_comb_gonad_min2, AUTO_comb_gonad_min2 = split_dataframe(comb_gonad_min2, Z_genes)
Z_comb_heart_min2, AUTO_comb_heart_min2 = split_dataframe(comb_heart_min2, Z_genes)
Z_comb_liver_min2, AUTO_comb_liver_min2 = split_dataframe(comb_liver_min2, Z_genes)
Z_comb_spleen_min2, AUTO_comb_spleen_min2 = split_dataframe(comb_spleen_min2, Z_genes)
    
    
In [67]:
    
expressed_Z_genes = { "HEART":Z_comb_heart_min2.index,
                     "LIVER":Z_comb_liver_min2.index,
                     "SPLEEN":Z_comb_spleen_min2.index,
                     "GONAD":Z_comb_gonad_min2.index}
for i in expressed_Z_genes:
    print i, len(expressed_Z_genes[i])
# print expressed_Z_genes["HEART"]
    
    
In [68]:
    
Z_tissue_snps = {}
for i, f in enumerate(input_vcf):
    tissue = f.split("/")[-1].split("_")[0]
    print tissue
    SNPs = filter_vcf(f, expressed_Z_genes[tissue])
    Z_tissue_snps[tissue] = SNPs
    
    
In [69]:
    
for t in Z_tissue_snps:
    print t, len(Z_tissue_snps[t])
    
    
In [70]:
    
HEART 171
GONAD 122
LIVER 135
SPLEEN 269
    
    
In [71]:
    
for t in Z_tissue_snps:
    c = 0
    for gene in Z_tissue_snps[t]:
        c += len(Z_tissue_snps[t][gene])
    print t, c
    
    
In [72]:
    
HEART 503
GONAD 405
LIVER 439
SPLEEN 826
    
    
In [73]:
    
# Genes taken from http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1003635
# Not all IDs could be mapped
livernois_genes = ["ENSGALG00000010164", # SMARCA2
                   "ENSGALG00000010158", # Kank1
                   "ENSGALG00000010160", # DMRT1
                   "ENSGALG00000010161", # DMRT3
                   "ENSGALG00000015058", # PTPRD
                   "ENSGALG00000015105", # PSIP1
                   "ENSGALG00000015101", # BNC2
                   "ENSGALG00000015078", # MLLT3
                   "ENSGALG00000015071", # FOCAD
                   "ENSGALG00000014690", # SLC12A2
                   "ENSGALG00000017706", # RASA1
                   "ENSGALG00000001869", # LRRN6C, LINGO2
                   "ENSGALG00000002162", # ACO1
                   "ENSGALG00000002187", # HSD17B4
                   "ENSGALG00000002294"] # SEMA6A
livernois_dict = defaultdict(list)
for t in Z_tissue_snps:
    for gene in Z_tissue_snps[t]:
        if gene in livernois_genes:
            livernois_dict[gene].append(t)
print len(livernois_dict)
for g in livernois_dict:
    print g, livernois_dict[g]
    
    
Note:
Instead of calculating the reference allele fraction ourselves, we could also use the varscan2 supplied "Variant allele frequency" in this field:
##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency">
We can use the read_freq function varscan_freq = read_freq(snp,i) and filter like this:
for i in [9, 10, 11, 12]: 
    RD, AD, POS = read_call(snp, i)
    varscan_freq = read_freq(snp,i)
    # Check if heterozygote
    if RD != 0 and AD != 0:
          if binom_test_ase(RD, AD) < 0.05:
              if varscan_freq >= 0.7:
                  valid.append(gene)
              if varscan_freq <= 0.3:
                  valid.append(gene)
The results are identical, even though varscan calculates the allele frequency a bit differently
In [74]:
    
Z_ASE_genes = defaultdict(list)
Z_NOASE_genes = defaultdict(list)
for t in Z_tissue_snps:
    for gene in Z_tissue_snps[t]:
        # There may be multiple SNPs per gene
        for snp in Z_tissue_snps[t][gene]:
            valid = []
            rd_ad = []
            for i in [9, 10, 11, 12]: 
                RD, AD, POS, DP = read_call(snp, i)
                RBQ, ABQ = read_quality(snp, i)
                # Check if heterozygote
                if (RD != 0 and AD != 0):
                    #Check for quality
                    if (RBQ >= 20 and ABQ >=20):
                        ref_allele_fraction = RD/(RD+AD)
#                         print type(RD), type(AD), type(ref_allele_fraction), ref_allele_fraction
                        if binom_test_ase(RD, AD) < 0.05:
                            if ref_allele_fraction >= 0.7:
                                valid.append(gene)
                                rd_ad.append((RD,AD))
                            if ref_allele_fraction <= 0.3:
                                valid.append(gene)
                                rd_ad.append((RD,AD))
            # If all samples pass the check the gene is valid with ASE
            if len(valid) == 4:
                Z_ASE_genes[t].append(gene)
                # Other SNPs may not pass the threshold, remove those from NOASE list
                # Via http://stackoverflow.com/questions/4915920/how-to-delete-an-item-in-a-list-if-it-exists-python
                while gene in Z_NOASE_genes[t]:
                   Z_NOASE_genes[t].remove(gene)
            else:
                if gene not in Z_NOASE_genes[t] and gene not in Z_ASE_genes[t]:
                    Z_NOASE_genes[t].append(gene)
for t in Z_ASE_genes:
    print t, len(set(Z_ASE_genes[t])), set(Z_ASE_genes[t])
print "====="
for t in Z_NOASE_genes:
    print t, len(set(Z_NOASE_genes[t]))
    
    
In [48]:
    
HEART 7 set(['ENSGALG00000014789', 'ENSGALG00000013809', 'ENSGALG00000015617', 'ENSGALG00000021340', 'ENSGALG00000000428', 'ENSGALG00000026553', 'ENSGALG00000025842'])
GONAD 13 set(['ENSGALG00000027891', 'ENSGALG00000015082', 'ENSGALG00000027128', 'ENSGALG00000010158', 'ENSGALG00000003726', 'ENSGALG00000015617', 'ENSGALG00000014857', 'ENSGALG00000027487', 'ENSGALG00000028328', 'ENSGALG00000000428', 'ENSGALG00000025842', 'ENSGALG00000014942', 'ENSGALG00000028137'])
LIVER 7 set(['ENSGALG00000015544', 'ENSGALG00000008229', 'ENSGALG00000005423', 'ENSGALG00000015617', 'ENSGALG00000021340', 'ENSGALG00000012613', 'ENSGALG00000000428'])
SPLEEN 10 set(['ENSGALG00000015216', 'ENSGALG00000015082', 'ENSGALG00000005814', 'ENSGALG00000014789', 'ENSGALG00000027154', 'ENSGALG00000015617', 'ENSGALG00000014857', 'ENSGALG00000005433', 'ENSGALG00000000428', 'ENSGALG00000025842'])
=====
HEART 164
GONAD 109
LIVER 128
SPLEEN 259
    
    
In [75]:
    
# There should be no overlap
for t in Z_ASE_genes:
    print set(Z_ASE_genes[t]).intersection(Z_NOASE_genes[t])
    
    
In [76]:
    
expressed_AUTO_genes = { "HEART":AUTO_comb_heart_min2.index,
                     "LIVER":AUTO_comb_liver_min2.index,
                     "SPLEEN":AUTO_comb_spleen_min2.index,
                     "GONAD":AUTO_comb_gonad_min2.index}
for i in expressed_AUTO_genes:
    print i, len(expressed_AUTO_genes[i])
# print expressed_AUTO_genes["HEART"]
    
    
In [49]:
    
HEART 9478
GONAD 11340
LIVER 8938
SPLEEN 10318
    
    
In [80]:
    
AUTO_tissue_snps = {}
for i, f in enumerate(input_vcf):
    tissue = f.split("/")[-1].split("_")[0]
    print tissue, f
    SNPs = filter_vcf(f, expressed_AUTO_genes[tissue])
    AUTO_tissue_snps[tissue] = SNPs
    
    
In [81]:
    
for t in AUTO_tissue_snps:
    print t, len(AUTO_tissue_snps[t])
    
    
In [81]:
    
    
In [82]:
    
for t in AUTO_tissue_snps:
    c = 0
    for gene in AUTO_tissue_snps[t]:
        c += len(AUTO_tissue_snps[t][gene])
    print t, c
# Z SNPs
for t in Z_tissue_snps:
    c = 0
    for gene in Z_tissue_snps[t]:
        c += len(Z_tissue_snps[t][gene])
    print t, c
    
    
In [53]:
    
HEART 23735
GONAD 11378
LIVER 17812
SPLEEN 34679
HEART 503
GONAD 405
LIVER 439
SPLEEN 826
    
    
Combination is used as number in supplemental!
In [83]:
    
AUTO_ASE_snps= defaultdict(list)
AUTO_ASE_genes= defaultdict(list)
AUTO_NOASE_genes = defaultdict(list)
for t in AUTO_tissue_snps:
    for gene in AUTO_tissue_snps[t]:
        # There may be multiple SNPs per gene
        for snp in AUTO_tissue_snps[t][gene]:
            valid = []
            rd_ad = []
            
            for i in [9, 10, 11, 12]: 
                RD, AD, POS, DP = read_call(snp, i)
                RBQ, ABQ = read_quality(snp, i)
                varscan_freq = read_freq(snp,i)
                # Check if heterozygote
                if (RD != 0 and AD != 0):
                    #Check for quality
                    if (RBQ >= 20 and ABQ >=20):
                        ref_allele_fraction = RD/(RD+AD)
                        if binom_test_ase(RD, AD) < 0.05:
                            if ref_allele_fraction >= 0.7:
                                valid.append(gene)
                                rd_ad.append((RD,AD))
                            if ref_allele_fraction <= 0.3:
                                valid.append(gene)
                                rd_ad.append((RD,AD))
            # If all samples pass the check the gene is valid with ASE
            if len(valid) == 4:
                snp = {"gene": gene, "ref_allele_fraction": ref_allele_fraction, "p-value": binom_test_ase(RD, AD), "pos": POS}
                AUTO_ASE_snps[t].append(snp)
                AUTO_ASE_genes[t].append(gene)
                # Other SNPs may not pass the threshold, remove those from NOASE list
                while gene in AUTO_NOASE_genes[t]:
                   AUTO_NOASE_genes[t].remove(gene)
            else:
                if gene not in AUTO_NOASE_genes[t] and gene not in AUTO_ASE_genes[t]:
                    AUTO_NOASE_genes[t].append(gene)
    
In [85]:
    
for t in AUTO_ASE_genes:
    print t, len(set(AUTO_ASE_genes[t]))
print "====="
for t in AUTO_NOASE_genes:
    print t, len(AUTO_NOASE_genes[t])
    
    
In [86]:
    
HEART 5138
GONAD 2843
LIVER 4114
SPLEEN 6675
    
    
In [87]:
    
for t in AUTO_tissue_snps:
    print t, len(AUTO_tissue_snps[t])
    
    
In [88]:
    
# Use R via rpy2
import rpy2.robjects as R
def fdr_snps(snps):
    '''Corrects the binomal tests using false discovery rate'''
    lib = R.r["library"]
    lib("qvalue")
    corrected = []
    p_values = [snp["p-value"] for snp in snps]
    padjust = R.r["p.adjust"]
    args = {"p": R.FloatVector(p_values), "method": "bonferroni"}
    adjusted = list(padjust(**args))
    for i, p in enumerate(adjusted):
        snps[i]["padjust"] =  p
    
    return snps
for t in AUTO_ASE_snps:
    AUTO_ASE_snps[t] = fdr_snps(AUTO_ASE_snps[t])
    
In [89]:
    
AUTO_ASE_genes_fdr = defaultdict(list)
for t in AUTO_ASE_snps:
    for snp in AUTO_ASE_snps[t]:
        if snp["padjust"] < 0.05:
            AUTO_ASE_genes_fdr[t].append(snp["gene"])
    AUTO_ASE_genes_fdr[t] = list(set(AUTO_ASE_genes_fdr[t]))
for t in AUTO_ASE_genes_fdr:
    drop = set(AUTO_ASE_genes[t]).difference(AUTO_ASE_genes_fdr[t])
    for g in drop:
        AUTO_NOASE_genes[t].append(g)
    AUTO_NOASE_genes[t] = list(set(AUTO_NOASE_genes[t]))
# There should be no overlap
for t in AUTO_ASE_genes_fdr:
    print set(AUTO_ASE_genes_fdr[t]).intersection(AUTO_NOASE_genes[t])
    
    
In [90]:
    
for t in AUTO_ASE_genes_fdr:
    print t, len(AUTO_ASE_genes_fdr[t])
print "====="
for t in AUTO_NOASE_genes:
    print t, len(AUTO_NOASE_genes[t])
    
    
In [91]:
    
for t in Z_ASE_genes:
    print t, len(set(Z_ASE_genes[t]))
print "====="
for t in Z_NOASE_genes:
    print t, len(set(Z_NOASE_genes[t]))
    
    
Numbers excluding tri-allic SNPs
In [92]:
    
#  ASE no ASE
print "Fisher's exact test (spleen auto vs spleen Z): ", fisher_exact([[263,6412],
                                                                      [10, 259]])
print "Fisher's exact test (heart auto vs heart Z): ", fisher_exact([[212,4926],
                                                                      [7, 164]])
print "Fisher's exact test (liver auto vs liver Z): ", fisher_exact([[167,3947],
                                                                      [7, 128]])
print "Fisher's exact test (gonad auto vs gonad Z): ", fisher_exact([[157,2686],
                                                                      [13, 109]])
    
    
Comparison between autosomes and Z chromosome in the degree of ASE:
In [ ]: