Excess mutation determination

Use pre-defined genesets implied in different aspects of antibiotic resistance to determine whether we observe any analogous signatures of positive selection.


In [1]:
#Python core packages
from collections import Counter
import string
import pickle
import datetime
import warnings
warnings.filterwarnings("ignore")

#Additional Python packages
import tqdm

#Scientific packages
from scipy import stats as ss
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import utils as sku
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

mpl.rcParams['font.family']='Arial'
mpl.rcParams['legend.numpoints'] = 1

%matplotlib inline

In [2]:
#LOAD FUNCTIONS
%cd '../src/data/'
import serial_functions as serf
%cd '../../notebooks'

#LOAD DATA
EXTERNAL_PATH = '../data/external/'
PROCESSED_PATH = '../data/processed/'

ALL = pd.read_csv('{}ALLELE_data.csv'.format(PROCESSED_PATH), index_col=0)
EGS = pd.read_csv('{}PREDEFINED_gene_sets.csv'.format(EXTERNAL_PATH), index_col=0)


/Users/Zuco/Documents/Work/BC2_home/160803_serial_deepseq/src/data
/Users/Zuco/Documents/Work/BC2_home/160803_serial_deepseq/notebooks

In [3]:
#Prepare the necessary resources from the MTBC genome
REF_ANNOTATIONS = {}
for line in open('{}H37Rv_annotation2sytems.ptt'.format(EXTERNAL_PATH)):
    split = line.strip().split('\t')
    if len(split)<=6 and len(split)>3 and split[0][0].isdigit():
        REF_ANNOTATIONS[int(split[1])-1] = [int(split[2]), split[3], split[4]]
    if len(split)>6 and split[0][0].isdigit():
        REF_ANNOTATIONS[int(split[1])-1] = [int(split[2]), split[3], split[7]]

LOCUS_MATRIX = np.array([[k,v[0]] for (k,v) in REF_ANNOTATIONS.items()], dtype=float)
LOCUS_MATRIX = LOCUS_MATRIX[np.argsort(LOCUS_MATRIX[:,0])]

LOCI = np.array([REF_ANNOTATIONS[int(x)][-1] for x in LOCUS_MATRIX[:,0]])
SIZE_DICT = {x:float(LOCUS_MATRIX[ind][1])-LOCUS_MATRIX[ind][0] for (ind,x) in enumerate(LOCI)}       
MTB_GENOME = ''.join([line.strip() for line in open('{}MTB_anc.fa'.format(EXTERNAL_PATH)) if line[0]!='>'])

In [4]:
#Define the excluded geneset (Coscolla et al, 2015)
excluded = list(EGS.GENE[EGS.GENE_SET=='Excluded'])

#Define the drug resistance associated gene set (Farhat et al, 2013 "approach overlap genes"
# and Zhang et al 2013 - "Drug resistance group 1")
DR_set = list(EGS.GENE[(EGS.GENE_SET=='DR_associated')&(EGS.EXCLUDED==0)])

#Define the T-cell antigen gene set (Coscolla et al, 2015)
AG_set = list(EGS.GENE[(EGS.GENE_SET=='Antigens')&(EGS.EXCLUDED==0)])

#Define the drug resistance gene set (Walker et al, 2015)
Walker_DR = list(EGS.GENE[(EGS.GENE_SET=='DR_genes')&(EGS.EXCLUDED==0)])

#Define Mycolate superpathway genes (O'Neill et al, 2015)
ONeillMycolate = list(EGS.GENE[(EGS.GENE_SET=='Mycolate_superpathway')&(EGS.EXCLUDED==0)])

Drug resistance genes

As defined by Walker et al.


In [5]:
DR_output_E = serf.excess_mutation(Walker_DR, ALL[(ALL.NON_EFFICACIOUS==0)], 
                               len(MTB_GENOME), SIZE_DICT, exclude_check=excluded)
DR_output_NE = serf.excess_mutation(Walker_DR, ALL[(ALL.NON_EFFICACIOUS==1)], 
                               len(MTB_GENOME), SIZE_DICT, exclude_check=excluded)

print('WALKER DR GENESET\n')
print('OOOO\tEfficaciously treated\tOOOO')
print('--- COUNTS ---')
print('SNPS AT DIAGNOSIS:\n%s\n' % DR_output_E['Data'][0])
print('SNPS DURING TREATMENT:\n%s\n' % DR_output_E['Data'][1])
print('--- STATISTICS ---')
print('Excess binomial: %.3f' % DR_output_E['Excess_binomial'])
print('Excess NSY: %.3f\n' % DR_output_E['NS_binomial'])

print('OOOO\tNon-efficaciously treated\tOOOO')
print('--- COUNTS ---')
print('SNPS AT DIAGNOSIS:\n%s\n' % DR_output_NE['Data'][0])
print('SNPS DURING TREATMENT:\n%s\n' % DR_output_NE['Data'][1])
print('--- STATISTICS ---')
print('Excess binomial: %.3f' % DR_output_NE['Excess_binomial'])
print('Excess NSY: %.3f\n' % DR_output_NE['NS_binomial'])


WALKER DR GENESET

OOOO	Efficaciously treated	OOOO
--- COUNTS ---
SNPS AT DIAGNOSIS:
[[ 1  0]
 [25 17]]

SNPS DURING TREATMENT:
[[ 0  0]
 [66 34]]

--- STATISTICS ---
Excess binomial: 0.501
Excess NSY: 1.000

OOOO	Non-efficaciously treated	OOOO
--- COUNTS ---
SNPS AT DIAGNOSIS:
[[ 0  0]
 [55 24]]

SNPS DURING TREATMENT:
[[ 5  0]
 [58 24]]

--- STATISTICS ---
Excess binomial: 0.001
Excess NSY: 0.177

Drug resistance associated genes

As defined by an apparent signature of positive selection. See Farhat et al, Nature Genetics 2012 and Zhang et al, Nature Genetics 2012.


In [8]:
DR_output_E = serf.excess_mutation(DR_set, ALL[(ALL.NON_EFFICACIOUS==0)], 
                               len(MTB_GENOME), SIZE_DICT, exclude_check=excluded)
DR_output_NE = serf.excess_mutation(DR_set, ALL[(ALL.NON_EFFICACIOUS==1)], 
                               len(MTB_GENOME), SIZE_DICT, exclude_check=excluded)

print('DR-associated GENESET\n')
print('OOOO\tEfficaciously treated\tOOOO')
print('--- COUNTS ---')
print('SNPS AT DIAGNOSIS:\n%s\n' % DR_output_E['Data'][0])
print('SNPS DURING TREATMENT:\n%s\n' % DR_output_E['Data'][1])
print('--- STATISTICS ---')
print('Excess binomial: %.3f' % DR_output_E['Excess_binomial'])
print('Excess NSY: %.3f\n' % DR_output_E['NS_binomial'])

print('OOOO\tNon-efficaciously treated\tOOOO')
print('--- COUNTS ---')
print('SNPS AT DIAGNOSIS:\n%s\n' % DR_output_NE['Data'][0])
print('SNPS DURING TREATMENT:\n%s\n' % DR_output_NE['Data'][1])
print('--- STATISTICS ---')
print('Excess binomial: %.3f' % DR_output_NE['Excess_binomial'])
print('Excess NSY: %.3f\n' % DR_output_NE['NS_binomial'])


DR-associated GENESET

OOOO	Efficaciously treated	OOOO
--- COUNTS ---
SNPS AT DIAGNOSIS:
[[ 0  1]
 [26 16]]

SNPS DURING TREATMENT:
[[ 4  6]
 [62 28]]

--- STATISTICS ---
Excess binomial: 0.545
Excess NSY: 0.987

OOOO	Non-efficaciously treated	OOOO
--- COUNTS ---
SNPS AT DIAGNOSIS:
[[ 2  0]
 [53 24]]

SNPS DURING TREATMENT:
[[ 6  0]
 [57 24]]

--- STATISTICS ---
Excess binomial: 0.946
Excess NSY: 0.121

Mycolate superpathway

The whole geneset as identified by O'Neill et al.


In [9]:
output_E = serf.excess_mutation(ONeillMycolate, ALL[(ALL.NON_EFFICACIOUS==0)], 
                               len(MTB_GENOME), SIZE_DICT, exclude_check=excluded)
output_NE = serf.excess_mutation(ONeillMycolate, ALL[(ALL.NON_EFFICACIOUS==1)], 
                               len(MTB_GENOME), SIZE_DICT, exclude_check=excluded)

print('Mycolate superpathway\n')
print('OOOO\tEfficaciously treated\tOOOO')
print('--- COUNTS ---')
print('SNPS AT DIAGNOSIS:\n%s\n' % output_E['Data'][0])
print('SNPS DURING TREATMENT:\n%s\n' % output_E['Data'][1])
print('--- STATISTICS ---')
print('Excess binomial: %.3f' % output_E['Excess_binomial'])
print('Excess NSY: %.3f\n' % output_E['NS_binomial'])

print('OOOO\tNon-efficaciously treated\tOOOO')
print('--- COUNTS ---')
print('SNPS AT DIAGNOSIS:\n%s\n' % output_NE['Data'][0])
print('SNPS DURING TREATMENT:\n%s\n' % output_NE['Data'][1])
print('--- STATISTICS ---')
print('Excess binomial: %.3f' % output_NE['Excess_binomial'])
print('Excess NSY: %.3f\n' % output_NE['NS_binomial'])


Mycolate superpathway

OOOO	Efficaciously treated	OOOO
--- COUNTS ---
SNPS AT DIAGNOSIS:
[[ 0  0]
 [26 17]]

SNPS DURING TREATMENT:
[[ 1  2]
 [65 32]]

--- STATISTICS ---
Excess binomial: 0.881
Excess NSY: 0.964

OOOO	Non-efficaciously treated	OOOO
--- COUNTS ---
SNPS AT DIAGNOSIS:
[[ 3  0]
 [52 24]]

SNPS DURING TREATMENT:
[[ 3  2]
 [60 22]]

--- STATISTICS ---
Excess binomial: 0.229
Excess NSY: 0.876

T-cell antigens

The immune system may shape the populations. The set of antigens is based on Coscolla et al.


In [10]:
output_E = serf.excess_mutation(AG_set, ALL[(ALL.NON_EFFICACIOUS==0)], 
                               len(MTB_GENOME), SIZE_DICT, exclude_check=excluded)
output_NE = serf.excess_mutation(AG_set, ALL[(ALL.NON_EFFICACIOUS==1)], 
                               len(MTB_GENOME), SIZE_DICT, exclude_check=excluded)

print('T-cell antigens\n')
print('OOOO\tEfficaciously treated\tOOOO')
print('--- COUNTS ---')
print('SNPS AT DIAGNOSIS:\n%s\n' % output_E['Data'][0])
print('SNPS DURING TREATMENT:\n%s\n' % output_E['Data'][1])
print('--- STATISTICS ---')
print('Excess binomial: %.3f' % output_E['Excess_binomial'])
print('Excess NSY: %.3f\n' % output_E['NS_binomial'])

print('OOOO\tNon-efficaciously treated\tOOOO')
print('--- COUNTS ---')
print('SNPS AT DIAGNOSIS:\n%s\n' % output_NE['Data'][0])
print('SNPS DURING TREATMENT:\n%s\n' % output_NE['Data'][1])
print('--- STATISTICS ---')
print('Excess binomial: %.3f' % output_NE['Excess_binomial'])
print('Excess NSY: %.3f\n' % output_NE['NS_binomial'])


T-cell antigens

OOOO	Efficaciously treated	OOOO
--- COUNTS ---
SNPS AT DIAGNOSIS:
[[ 4  0]
 [22 17]]

SNPS DURING TREATMENT:
[[10  4]
 [56 30]]

--- STATISTICS ---
Excess binomial: 0.153
Excess NSY: 0.426

OOOO	Non-efficaciously treated	OOOO
--- COUNTS ---
SNPS AT DIAGNOSIS:
[[ 7  3]
 [48 21]]

SNPS DURING TREATMENT:
[[ 6  0]
 [57 24]]

--- STATISTICS ---
Excess binomial: 0.550
Excess NSY: 0.121


In [ ]: