Imports


In [57]:
import sys
import os
from collections import defaultdict

from scipy import stats as stats
from scipy.stats import fisher_exact, mannwhitneyu, wilcoxon, ranksums, chi2_contingency, chisquare
# Nice dataframes
import pandas as pd
import numpy as np
import random

Helper functions


In [58]:
from collections import defaultdict

def filter_vcf(source, expressed_genes):
    SNPs = defaultdict(list)
    try:
        with open(source, "r") as infile:
            for line in infile:
                if line.startswith("##"):
                    continue
                line = line.rstrip().split()
                GID = line[22][1:-2]
                if GID in expressed_genes:
                    if is_biallelic(line):
                        if is_valid_base(line[3]) and is_valid_base(line[4]):
                            SNPs[GID].append(line)
            return SNPs
    except IOError:
        print "File does not exit!"

def is_biallelic(line):
    "Only include biallelic sites"
    # If ALT and REF has exactly one allele pass
    # An allele is a string with length 1 eg A or G
    if (len(line[3]) == 1 and len(line[4]) == 1):
        return True
    # In any other case return false
    else:
#         print line[3], line[4]
        return False

def is_valid_base(b):
    # Make sure letters are capital
    # Has no impact as varscan uses only capitals
    if b.upper() in ["A", "C", "G", "T"]:
        return True
    else:
        return False
        
def read_call(vcf_line, sample):
    '''Field format is:
        GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR
        0  1  2   3  4  5  6    7    8   9   10  11  12  13
    '''
    call1 = vcf_line[sample]
    POS = int(vcf_line[1])
    if (call1.split(":")[4] == "." or call1.split(":")[5] == "."):
        return 0.0, 0.0, POS, DP
    RD = float(call1.split(":")[4])
    AD = float(call1.split(":")[5])
    DP = float(call1.split(":")[3])
    return RD, AD, POS, DP


def read_quality(vcf_line, sample):
    '''Field format is:
        GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR
        0  1  2   3  4  5  6    7    8   9   10  11  12  13
    '''
    call1 = vcf_line[sample]
    RBQ = float(call1.split(":")[8])
    ABQ = float(call1.split(":")[9])
    return RBQ, ABQ


    
def read_freq(vcf_line, sample):
    '''Field format is:
        GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR
        0  1  2   3  4  5  6    7    8   9   10  11  12  13
    '''
    call1 = vcf_line[sample]
    freq = float(call1.split(":")[6].strip("%"))/100
#     print call1
    return freq

def binom_test_ase(rd, ad):
    '''
    RD: Depth of reference-supporting bases (reads1)
    AD: Depth of variant-supporting bases (reads2)
    This is the chance of observing either more successes, or fewer successes, in all trials.
    '''
    # Two tailed binomial test
    return stats.binom_test(rd, (rd+ad), p=0.5)

def random_sample(data, num_samples):
    "Randomly choose num_sample items from data"
    return random.sample(data, num_samples)

def anti_log2(i):
    return np.power(2, i)

def log2(i):
    return np.log2(i)

Input files


In [59]:
# Strictly filtered SNPS, where a call is valid in all samples
input_vcf = ["../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/soma100/HEART_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf",
               "../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/soma100/LIVER_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf",
               "../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/soma100/SPLEEN_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf",
               "../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/gonad51/GONAD_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf",]

de_gonad = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-GONAD-detags-edgeR.csv")
de_spleen = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-SPLEEN-detags-edgeR.csv")
de_heart = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-HEART-detags-edgeR.csv")
de_liver = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-LIVER-detags-edgeR.csv")

rpkm_gonad = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-GONAD-RPKM-NORM-log2-edgeR.csv")
rpkm_spleen = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-SPLEEN-RPKM-NORM-log2-edgeR.csv")
rpkm_heart = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-HEART-RPKM-NORM-log2-edgeR.csv")
rpkm_liver = pd.read_csv("/data/ucbtzim/ohnolog-dc/differential-expression/2015-10-27-MF-LIVER-RPKM-NORM-log2-edgeR.csv")

In [60]:
de_gonad.head()


Out[60]:
logFC logCPM PValue FDR
ENSGALG00000000386 8.108556 6.594766 0.000000e+00 0.000000e+00
ENSGALG00000001756 8.684406 7.036651 9.761462e-316 7.589536e-312
ENSGALG00000013312 9.834955 5.384945 9.475218e-253 4.911321e-249
ENSGALG00000022674 12.677968 5.112837 3.101470e-219 1.205696e-215
ENSGALG00000022679 12.374174 4.810069 2.111286e-215 6.566100e-212

In [61]:
rpkm_gonad.head()


Out[61]:
WTCHG_23425_05_GONAD_M_1_sequence WTCHG_23425_04_GONAD_M_1_sequence WTCHG_23425_12_GONAD_M_1_sequence WTCHG_23425_06_GONAD_M_1_sequence WTCHG_23702_04_GONAD_F_1_sequence WTCHG_23702_05_GONAD_F_1_sequence WTCHG_23702_12_GONAD_F_1_sequence WTCHG_23702_06_GONAD_F_1_sequence
ENSGALG00000000003 2.220145 2.480927 2.182589 2.065128 1.511206 1.005258 1.209041 1.443923
ENSGALG00000000004 3.032695 3.244164 2.891725 2.751503 2.446796 2.558455 2.407342 2.614145
ENSGALG00000000011 3.493343 3.531948 3.828556 3.722546 3.984308 3.480030 3.767212 3.670157
ENSGALG00000000013 3.407879 3.364665 3.446097 3.305962 3.119093 3.022892 3.193507 3.207058
ENSGALG00000000014 -0.004977 0.173711 -0.512172 -0.301640 0.007394 0.174725 0.358401 0.407864

In [62]:
def read_gid_to_chr(infile):
    '''Reads in a file obtained from Ensembl mapping each gene to its chromosomal location'''
    gid_to_chr = {}
    with open(infile, "rb") as handle:
        for line in handle:
            if line.startswith("Ensembl"):
                continue
            else:
                line = line.rstrip().split()
                gid_to_chr[line[0]] = line[1]
    return gid_to_chr


gid_to_chr = read_gid_to_chr("/data/ucbtzim/ohnolog-dc/dc-comparison/Gallus_gallus.75.gid.chr.proteincoding.tsv")
# Get a list of Z linked gene IDs
Z_genes = [g for g, pos in gid_to_chr.items() if pos == "Z"]
print "Number of genes (protein coding) on the Z chromosome:", len(Z_genes)


Number of genes (protein coding) on the Z chromosome: 764

In [63]:
comb_gonad = pd.concat([rpkm_gonad, de_gonad], axis=1)
comb_heart = pd.concat([rpkm_heart, de_heart], axis=1)
comb_liver = pd.concat([rpkm_liver, de_liver], axis=1)
comb_spleen = pd.concat([rpkm_spleen, de_spleen], axis=1)

print comb_gonad.shape
print comb_heart.shape
# Liver has one female sample less
print comb_liver.shape
print comb_spleen.shape


(15550, 12)
(15550, 12)
(15550, 11)
(15550, 12)

In [64]:
# No expression on chromosome 32, excluded
chicken_chromosomes = [str(i) for i in range(1, 29)] + ["Z"]

print "Valid chromosomes: ", chicken_chromosomes

comb_gonad = comb_gonad[comb_gonad.index.map(lambda x: gid_to_chr[x] in chicken_chromosomes)]
comb_heart = comb_heart[comb_heart.index.map(lambda x: gid_to_chr[x] in chicken_chromosomes)]
comb_liver = comb_liver[comb_liver.index.map(lambda x: gid_to_chr[x] in chicken_chromosomes)]
comb_spleen = comb_spleen[comb_spleen.index.map(lambda x: gid_to_chr[x] in chicken_chromosomes)]

print comb_gonad.shape
print comb_heart.shape
print comb_liver.shape
print comb_spleen.shape


Valid chromosomes:  ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', 'Z']
(14496, 12)
(14496, 12)
(14496, 11)
(14496, 12)

In [65]:
def filter_cpm(cpm, mincpm):
    '''Filter out lowly expressed genes based on a fixed minimum CPM threshold'''
    filtered = cpm[cpm["logCPM"] > mincpm]
    return filtered

# Beware average CPM is log2 transformed.
# For avgCPM of 2 use 1 | log2(2) = 1
comb_gonad_min2 = filter_cpm(comb_gonad, 1)
comb_heart_min2 = filter_cpm(comb_heart, 1)
comb_liver_min2 = filter_cpm(comb_liver, 1)
comb_spleen_min2 = filter_cpm(comb_spleen, 1)

print comb_gonad_min2.shape
print comb_heart_min2.shape
print comb_liver_min2.shape
print comb_spleen_min2.shape


(11944, 12)
(9971, 12)
(9383, 11)
(10847, 12)

In [66]:
def split_dataframe(data, genes):
    '''Returns two dataframes. The first return value contains all rows that did match the genes.
       The second dataframe contains only genes that did NOT match.
    '''
    match = data[data.index.isin(genes)]
    nomatch = data[~data.index.isin(genes)]
    
    return match, nomatch

# Get a list of Z linked gene IDs
Z_genes = [g for g, pos in gid_to_chr.items() if pos == "Z"]
print "Number of genes (protein coding + non coding) on the Z chromosome:", len(Z_genes)
print ""

# Filter the expression so it only contains Z expression
Z_comb_gonad_min2, AUTO_comb_gonad_min2 = split_dataframe(comb_gonad_min2, Z_genes)
Z_comb_heart_min2, AUTO_comb_heart_min2 = split_dataframe(comb_heart_min2, Z_genes)
Z_comb_liver_min2, AUTO_comb_liver_min2 = split_dataframe(comb_liver_min2, Z_genes)
Z_comb_spleen_min2, AUTO_comb_spleen_min2 = split_dataframe(comb_spleen_min2, Z_genes)


Number of genes (protein coding + non coding) on the Z chromosome: 764


In [67]:
expressed_Z_genes = { "HEART":Z_comb_heart_min2.index,
                     "LIVER":Z_comb_liver_min2.index,
                     "SPLEEN":Z_comb_spleen_min2.index,
                     "GONAD":Z_comb_gonad_min2.index}
for i in expressed_Z_genes:
    print i, len(expressed_Z_genes[i])
# print expressed_Z_genes["HEART"]


HEART 493
GONAD 604
LIVER 445
SPLEEN 529

Z SNPs in all tissues


In [68]:
Z_tissue_snps = {}

for i, f in enumerate(input_vcf):
    tissue = f.split("/")[-1].split("_")[0]
    print tissue
    SNPs = filter_vcf(f, expressed_Z_genes[tissue])
    Z_tissue_snps[tissue] = SNPs


HEART
LIVER
SPLEEN
GONAD

Number of genes on the Z with valid SNP calls


In [69]:
for t in Z_tissue_snps:
    print t, len(Z_tissue_snps[t])


HEART 171
GONAD 122
LIVER 135
SPLEEN 269

In [70]:
HEART 171
GONAD 122
LIVER 135
SPLEEN 269


  File "<ipython-input-70-b8c991f53b4c>", line 1
    HEART 171
            ^
SyntaxError: invalid syntax

Number of SNPs on the Z per tissue


In [71]:
for t in Z_tissue_snps:
    c = 0
    for gene in Z_tissue_snps[t]:

        c += len(Z_tissue_snps[t][gene])
    print t, c


HEART 503
GONAD 405
LIVER 439
SPLEEN 826

In [72]:
HEART 503
GONAD 405
LIVER 439
SPLEEN 826


  File "<ipython-input-72-6a2b4f16a7ef>", line 1
    HEART 503
            ^
SyntaxError: invalid syntax

In [73]:
# Genes taken from http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1003635
# Not all IDs could be mapped
livernois_genes = ["ENSGALG00000010164", # SMARCA2
                   "ENSGALG00000010158", # Kank1
                   "ENSGALG00000010160", # DMRT1
                   "ENSGALG00000010161", # DMRT3
                   "ENSGALG00000015058", # PTPRD
                   "ENSGALG00000015105", # PSIP1
                   "ENSGALG00000015101", # BNC2
                   "ENSGALG00000015078", # MLLT3
                   "ENSGALG00000015071", # FOCAD
                   "ENSGALG00000014690", # SLC12A2
                   "ENSGALG00000017706", # RASA1
                   "ENSGALG00000001869", # LRRN6C, LINGO2
                   "ENSGALG00000002162", # ACO1
                   "ENSGALG00000002187", # HSD17B4
                   "ENSGALG00000002294"] # SEMA6A

livernois_dict = defaultdict(list)
for t in Z_tissue_snps:
    for gene in Z_tissue_snps[t]:
        if gene in livernois_genes:
            livernois_dict[gene].append(t)

print len(livernois_dict)
for g in livernois_dict:
    print g, livernois_dict[g]


10
ENSGALG00000010160 ['GONAD']
ENSGALG00000002187 ['HEART', 'GONAD', 'LIVER', 'SPLEEN']
ENSGALG00000002162 ['GONAD', 'LIVER']
ENSGALG00000010164 ['HEART', 'GONAD', 'LIVER', 'SPLEEN']
ENSGALG00000002294 ['SPLEEN']
ENSGALG00000010158 ['HEART', 'GONAD', 'LIVER', 'SPLEEN']
ENSGALG00000014690 ['SPLEEN']
ENSGALG00000015105 ['HEART', 'GONAD', 'SPLEEN']
ENSGALG00000015071 ['SPLEEN']
ENSGALG00000017706 ['SPLEEN']

Calculate which genes on the Z contain SNPs with ASE

Note:

Instead of calculating the reference allele fraction ourselves, we could also use the varscan2 supplied "Variant allele frequency" in this field:

##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency">

We can use the read_freq function varscan_freq = read_freq(snp,i) and filter like this:

for i in [9, 10, 11, 12]: 
    RD, AD, POS = read_call(snp, i)
    varscan_freq = read_freq(snp,i)
    # Check if heterozygote
    if RD != 0 and AD != 0:
          if binom_test_ase(RD, AD) < 0.05:

              if varscan_freq >= 0.7:
                  valid.append(gene)

              if varscan_freq <= 0.3:
                  valid.append(gene)

The results are identical, even though varscan calculates the allele frequency a bit differently


In [74]:
Z_ASE_genes = defaultdict(list)
Z_NOASE_genes = defaultdict(list)

for t in Z_tissue_snps:

    for gene in Z_tissue_snps[t]:

        # There may be multiple SNPs per gene
        for snp in Z_tissue_snps[t][gene]:
            valid = []
            rd_ad = []
            for i in [9, 10, 11, 12]: 
                RD, AD, POS, DP = read_call(snp, i)
                RBQ, ABQ = read_quality(snp, i)
                # Check if heterozygote
                if (RD != 0 and AD != 0):
                    #Check for quality
                    if (RBQ >= 20 and ABQ >=20):
                        ref_allele_fraction = RD/(RD+AD)
#                         print type(RD), type(AD), type(ref_allele_fraction), ref_allele_fraction
                        if binom_test_ase(RD, AD) < 0.05:
                            if ref_allele_fraction >= 0.7:
                                valid.append(gene)
                                rd_ad.append((RD,AD))

                            if ref_allele_fraction <= 0.3:
                                valid.append(gene)
                                rd_ad.append((RD,AD))
            # If all samples pass the check the gene is valid with ASE
            if len(valid) == 4:
                Z_ASE_genes[t].append(gene)
                # Other SNPs may not pass the threshold, remove those from NOASE list
                # Via http://stackoverflow.com/questions/4915920/how-to-delete-an-item-in-a-list-if-it-exists-python
                while gene in Z_NOASE_genes[t]:
                   Z_NOASE_genes[t].remove(gene)
            else:
                if gene not in Z_NOASE_genes[t] and gene not in Z_ASE_genes[t]:
                    Z_NOASE_genes[t].append(gene)

for t in Z_ASE_genes:
    print t, len(set(Z_ASE_genes[t])), set(Z_ASE_genes[t])
print "====="
for t in Z_NOASE_genes:
    print t, len(set(Z_NOASE_genes[t]))


HEART 7 set(['ENSGALG00000014789', 'ENSGALG00000013809', 'ENSGALG00000015617', 'ENSGALG00000021340', 'ENSGALG00000000428', 'ENSGALG00000026553', 'ENSGALG00000025842'])
GONAD 13 set(['ENSGALG00000027891', 'ENSGALG00000015082', 'ENSGALG00000027128', 'ENSGALG00000010158', 'ENSGALG00000003726', 'ENSGALG00000015617', 'ENSGALG00000014857', 'ENSGALG00000027487', 'ENSGALG00000028328', 'ENSGALG00000000428', 'ENSGALG00000025842', 'ENSGALG00000014942', 'ENSGALG00000028137'])
LIVER 7 set(['ENSGALG00000015544', 'ENSGALG00000008229', 'ENSGALG00000005423', 'ENSGALG00000015617', 'ENSGALG00000021340', 'ENSGALG00000012613', 'ENSGALG00000000428'])
SPLEEN 10 set(['ENSGALG00000015216', 'ENSGALG00000015082', 'ENSGALG00000005814', 'ENSGALG00000014789', 'ENSGALG00000027154', 'ENSGALG00000015617', 'ENSGALG00000014857', 'ENSGALG00000005433', 'ENSGALG00000000428', 'ENSGALG00000025842'])
=====
HEART 164
GONAD 109
LIVER 128
SPLEEN 259

In [48]:
HEART 7 set(['ENSGALG00000014789', 'ENSGALG00000013809', 'ENSGALG00000015617', 'ENSGALG00000021340', 'ENSGALG00000000428', 'ENSGALG00000026553', 'ENSGALG00000025842'])
GONAD 13 set(['ENSGALG00000027891', 'ENSGALG00000015082', 'ENSGALG00000027128', 'ENSGALG00000010158', 'ENSGALG00000003726', 'ENSGALG00000015617', 'ENSGALG00000014857', 'ENSGALG00000027487', 'ENSGALG00000028328', 'ENSGALG00000000428', 'ENSGALG00000025842', 'ENSGALG00000014942', 'ENSGALG00000028137'])
LIVER 7 set(['ENSGALG00000015544', 'ENSGALG00000008229', 'ENSGALG00000005423', 'ENSGALG00000015617', 'ENSGALG00000021340', 'ENSGALG00000012613', 'ENSGALG00000000428'])
SPLEEN 10 set(['ENSGALG00000015216', 'ENSGALG00000015082', 'ENSGALG00000005814', 'ENSGALG00000014789', 'ENSGALG00000027154', 'ENSGALG00000015617', 'ENSGALG00000014857', 'ENSGALG00000005433', 'ENSGALG00000000428', 'ENSGALG00000025842'])
=====
HEART 164
GONAD 109
LIVER 128
SPLEEN 259


  File "<ipython-input-48-8303d57e5beb>", line 1
    HEART 7 set(['ENSGALG00000014789', 'ENSGALG00000013809', 'ENSGALG00000015617', 'ENSGALG00000021340', 'ENSGALG00000000428', 'ENSGALG00000026553', 'ENSGALG00000025842'])
          ^
SyntaxError: invalid syntax

In [75]:
# There should be no overlap
for t in Z_ASE_genes:
    print set(Z_ASE_genes[t]).intersection(Z_NOASE_genes[t])


set([])
set([])
set([])
set([])

The autosomes (repeated all Z analyses for autosomes)

How many SNPs are expressed on the Autosomes?


In [76]:
expressed_AUTO_genes = { "HEART":AUTO_comb_heart_min2.index,
                     "LIVER":AUTO_comb_liver_min2.index,
                     "SPLEEN":AUTO_comb_spleen_min2.index,
                     "GONAD":AUTO_comb_gonad_min2.index}

for i in expressed_AUTO_genes:
    print i, len(expressed_AUTO_genes[i])
# print expressed_AUTO_genes["HEART"]


HEART 9478
GONAD 11340
LIVER 8938
SPLEEN 10318

In [49]:
HEART 9478
GONAD 11340
LIVER 8938
SPLEEN 10318


  File "<ipython-input-49-c32751d13aba>", line 1
    HEART 9478
             ^
SyntaxError: invalid syntax

In [80]:
AUTO_tissue_snps = {}

for i, f in enumerate(input_vcf):
    tissue = f.split("/")[-1].split("_")[0]
    print tissue, f
    SNPs = filter_vcf(f, expressed_AUTO_genes[tissue])
    AUTO_tissue_snps[tissue] = SNPs


HEART ../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/soma100/HEART_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf
LIVER ../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/soma100/LIVER_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf
SPLEEN ../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/soma100/SPLEEN_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf
GONAD ../snp-calling/5mismatches/all-tissues/combined/2016-filteredallmin17bin/gonad51/GONAD_M_5mismatches_filteredallmin17_noclusters_coding_gene.vcf

Number of genes with SNPs on the Autosomes


In [81]:
for t in AUTO_tissue_snps:
    print t, len(AUTO_tissue_snps[t])


HEART 5138
GONAD 2843
LIVER 4114
SPLEEN 6675

In [81]:

Number of SNPs on the Autosomes


In [82]:
for t in AUTO_tissue_snps:
    c = 0
    for gene in AUTO_tissue_snps[t]:
        c += len(AUTO_tissue_snps[t][gene])
    print t, c
# Z SNPs
for t in Z_tissue_snps:
    c = 0
    for gene in Z_tissue_snps[t]:

        c += len(Z_tissue_snps[t][gene])
    print t, c


HEART 23735
GONAD 11378
LIVER 17812
SPLEEN 34679
HEART 503
GONAD 405
LIVER 439
SPLEEN 826

In [53]:
HEART 23735
GONAD 11378
LIVER 17812
SPLEEN 34679
HEART 503
GONAD 405
LIVER 439
SPLEEN 826


  File "<ipython-input-53-4d0afdb4e391>", line 1
    HEART 23735
              ^
SyntaxError: invalid syntax

Combination is used as number in supplemental!

Calculate which genes on the Autosomes contain SNPs with ASE


In [83]:
AUTO_ASE_snps= defaultdict(list)
AUTO_ASE_genes= defaultdict(list)
AUTO_NOASE_genes = defaultdict(list)

for t in AUTO_tissue_snps:

    for gene in AUTO_tissue_snps[t]:

        # There may be multiple SNPs per gene
        for snp in AUTO_tissue_snps[t][gene]:
            valid = []
            rd_ad = []
            
            for i in [9, 10, 11, 12]: 
                RD, AD, POS, DP = read_call(snp, i)
                RBQ, ABQ = read_quality(snp, i)
                varscan_freq = read_freq(snp,i)
                # Check if heterozygote
                if (RD != 0 and AD != 0):
                    #Check for quality
                    if (RBQ >= 20 and ABQ >=20):
                        ref_allele_fraction = RD/(RD+AD)
                        if binom_test_ase(RD, AD) < 0.05:

                            if ref_allele_fraction >= 0.7:
                                valid.append(gene)
                                rd_ad.append((RD,AD))

                            if ref_allele_fraction <= 0.3:
                                valid.append(gene)
                                rd_ad.append((RD,AD))
            # If all samples pass the check the gene is valid with ASE
            if len(valid) == 4:
                snp = {"gene": gene, "ref_allele_fraction": ref_allele_fraction, "p-value": binom_test_ase(RD, AD), "pos": POS}
                AUTO_ASE_snps[t].append(snp)
                AUTO_ASE_genes[t].append(gene)
                # Other SNPs may not pass the threshold, remove those from NOASE list
                while gene in AUTO_NOASE_genes[t]:
                   AUTO_NOASE_genes[t].remove(gene)
            else:
                if gene not in AUTO_NOASE_genes[t] and gene not in AUTO_ASE_genes[t]:
                    AUTO_NOASE_genes[t].append(gene)

In [85]:
for t in AUTO_ASE_genes:
    print t, len(set(AUTO_ASE_genes[t]))
print "====="
for t in AUTO_NOASE_genes:
    print t, len(AUTO_NOASE_genes[t])


HEART 221
GONAD 191
LIVER 184
SPLEEN 295
=====
HEART 4917
GONAD 2652
LIVER 3930
SPLEEN 6380

In [86]:
HEART 5138
GONAD 2843
LIVER 4114
SPLEEN 6675


  File "<ipython-input-86-a28c2a3a4cf4>", line 1
    HEART 5138
             ^
SyntaxError: invalid syntax

In [87]:
for t in AUTO_tissue_snps:
    print t, len(AUTO_tissue_snps[t])


HEART 5138
GONAD 2843
LIVER 4114
SPLEEN 6675

Add multiple correction for all autosomes


In [88]:
# Use R via rpy2
import rpy2.robjects as R

def fdr_snps(snps):
    '''Corrects the binomal tests using false discovery rate'''
    lib = R.r["library"]
    lib("qvalue")
    corrected = []
    p_values = [snp["p-value"] for snp in snps]
    padjust = R.r["p.adjust"]
    args = {"p": R.FloatVector(p_values), "method": "bonferroni"}
    adjusted = list(padjust(**args))
    for i, p in enumerate(adjusted):
        snps[i]["padjust"] =  p
    
    return snps

for t in AUTO_ASE_snps:
    AUTO_ASE_snps[t] = fdr_snps(AUTO_ASE_snps[t])

In [89]:
AUTO_ASE_genes_fdr = defaultdict(list)
for t in AUTO_ASE_snps:
    for snp in AUTO_ASE_snps[t]:
        if snp["padjust"] < 0.05:
            AUTO_ASE_genes_fdr[t].append(snp["gene"])
    AUTO_ASE_genes_fdr[t] = list(set(AUTO_ASE_genes_fdr[t]))

for t in AUTO_ASE_genes_fdr:
    drop = set(AUTO_ASE_genes[t]).difference(AUTO_ASE_genes_fdr[t])
    for g in drop:
        AUTO_NOASE_genes[t].append(g)

    AUTO_NOASE_genes[t] = list(set(AUTO_NOASE_genes[t]))

# There should be no overlap
for t in AUTO_ASE_genes_fdr:
    print set(AUTO_ASE_genes_fdr[t]).intersection(AUTO_NOASE_genes[t])


set([])
set([])
set([])
set([])

In [90]:
for t in AUTO_ASE_genes_fdr:
    print t, len(AUTO_ASE_genes_fdr[t])
print "====="
for t in AUTO_NOASE_genes:
    print t, len(AUTO_NOASE_genes[t])


HEART 212
GONAD 157
LIVER 167
SPLEEN 263
=====
HEART 4926
GONAD 2686
LIVER 3947
SPLEEN 6412

In [91]:
for t in Z_ASE_genes:
    print t, len(set(Z_ASE_genes[t]))
print "====="
for t in Z_NOASE_genes:
    print t, len(set(Z_NOASE_genes[t]))


HEART 7
GONAD 13
LIVER 7
SPLEEN 10
=====
HEART 164
GONAD 109
LIVER 128
SPLEEN 259

Numbers excluding tri-allic SNPs


In [92]:
#  ASE no ASE
print "Fisher's exact test (spleen auto vs spleen Z): ", fisher_exact([[263,6412],
                                                                      [10, 259]])

print "Fisher's exact test (heart auto vs heart Z): ", fisher_exact([[212,4926],
                                                                      [7, 164]])

print "Fisher's exact test (liver auto vs liver Z): ", fisher_exact([[167,3947],
                                                                      [7, 128]])

print "Fisher's exact test (gonad auto vs gonad Z): ", fisher_exact([[157,2686],
                                                                      [13, 109]])


Fisher's exact test (spleen auto vs spleen Z):  (1.0623362445414848, 1.0)
Fisher's exact test (heart auto vs heart Z):  (1.0082941824720144, 1.0)
Fisher's exact test (liver auto vs liver Z):  (0.77367982916500777, 0.50359844146798238)
Fisher's exact test (gonad auto vs gonad Z):  (0.4900910705080474, 0.026234117705592898)

Comparison between autosomes and Z chromosome in the degree of ASE:


In [ ]: