Functional Impact SNP Set Enrichment (fiSSEA)

Authors: Erick R. Scott, Adam Mark, Ali Torkamani, Chunlei Wu & Andrew Su

November 2014

Libraries


In [1]:
import os, sys, time, gc, argparse
import json, httplib2
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np

PandasVCF, MyGene.info, MyVariant.info Libraries


In [2]:
sys.path.append('../pandasVCF/src/single_sample/')
from pdVCFsingle import *

sys.path.append('../myvariant/src/')
import myvariant
mv = myvariant.MyVariantInfo()

sys.path.append('../src/')
from fiSSEA import *

VCF CDS Variants


In [3]:
def intersect_cds_vcf(vcf_path, cds_bed, write_path):
    '''
    Convenience function to create coding snp vcf file
    '''
    os.system('bedtools intersect -a ' + vcf_path + ' -b ' + cds_bed + ' -wa -header |uniq > ' + write_path )
    return write_path

Running fiSSEA


In [8]:
example_vcf = '../data/snyderome_chr6_cds.vcf.gz'
sample_id = 'TB0001907'
chunk = 100000
fi_score = 'dbnsfp.cadd.phred'

In [9]:
fiSSEA_results = fiSSEA(example_vcf, sample_id=sample_id, chunksize=chunk, fiSSEA_score=fi_score)


input variant rows: 1377
dropping 0 variants with genotype call == "." 
current variant rows: 1377

Results

fiSSEA_results object has the following attributes


In [15]:
dir(fiSSEA_results)[-8:]


Out[15]:
['fiSSEA_score',
 'fiSSEA_status',
 'fi_scores',
 'gene_stat',
 'get_fi_scores',
 'get_multigene_var_records',
 'mv_annot',
 'vcf_df']

Original vcf parsed with pandasVCF


In [12]:
fiSSEA_results.vcf_df.head()


Out[12]:
CHROM POS REF ALT ID FILTER multiallele phase GT1 GT2 a1 a2 zygosity vartype1 vartype2 GT DP GL GQ
0 6 348906 G A rs11242812 PASS 0 / 0 1 G A het-ref ref snp 0/1 46 -102.90,-13.86,-83.18 99
1 6 350829 G A rs1129085 filter 0 / 0 1 G A het-ref ref snp 0/1 75 -68.57,-22.62,-216.17 99
2 6 656143 C A rs1211554 PASS 0 / 1 1 A A hom-alt snp snp 1/1 40 -164.63,-12.04,-0.00 99
3 6 930792 T G rs12202545 PASS 0 / 0 1 T G het-ref ref snp 0/1 43 -74.59,-12.96,-92.95 99
4 6 965576 A G rs2756293 PASS 0 / 1 1 G G hom-alt snp snp 1/1 38 -155.40,-11.44,-0.01 99

Mean functional impact score for each Gene - Input into GSEA


In [22]:
fiSSEA_results.fi_scores.head()


Out[22]:
dbnsfp.genename
AARS2               1.336
ABCC10             12.762
ACAT2              24.255
AIM1                9.095
AKAP12              2.952
Name: dbnsfp.cadd.phred, dtype: float64

In [32]:
print fiSSEA_results.fi_scores.describe()
fiSSEA_results.fi_scores.hist(bins=20)
plt.title('Distribution of Mean CADD Phred Score by Gene (Snyderome, Chr6)')
plt.xlabel('Mean CADD Phred Score')
plt.ylabel('Gene Count')
plt.tight_layout()


count    330.000000
mean       9.893912
std        7.623389
min        0.000000
25%        4.347875
50%        9.104750
75%       13.976250
max       38.000000
dtype: float64

In [34]:
fiSSEA_results.mv_annot[['dbnsfp.genename','dbnsfp.cadd.phred']].head(10)


Out[34]:
dbnsfp.genename dbnsfp.cadd.phred
_id
chr6:g.39041502A>C GLP1R 0.350
chr6:g.99956560T>C USP45 24.000
chr6:g.38800164G>A DNAH8 8.047
chr6:g.3281392C>T NaN NaN
chr6:g.151939181G>A C6orf97 2.969
chr6:g.29142054C>G NaN NaN
chr6:g.35745283C>G C6orf126 14.600
chr6:g.35088381C>G TCP11 3.413
chr6:g.159652931G>C FNDC1 0.698
chr6:g.3174947C>T NaN NaN

In [36]:
fiSSEA_results.mv_annot['dbnsfp.genename'].value_counts().head(10)


Out[36]:
HLA-B       26
HLA-A       22
HLA-DRB1    20
SYNE1       17
HLA-DQA1    16
HLA-DRB5    16
HLA-DQB1    16
FNDC1       16
EYS         15
BCLAF1      11
dtype: int64

All MyVariant.info Annotations for SNPs flattened into a dataframe


In [21]:
fiSSEA_results.mv_annot.head(1).T


Out[21]:
_id chr6:g.39041502A>C
_id chr6:g.39041502A>C
cosmic.allele1 NaN
cosmic.allele2 NaN
cosmic.chrom NaN
cosmic.chromEnd NaN
cosmic.chromStart NaN
cosmic.mut_freq NaN
cosmic.mut_nt NaN
cosmic.tumor_site NaN
dbnsfp.1000gp1.ac 1246
dbnsfp.1000gp1.af 0.5705128
dbnsfp.1000gp1.afr_ac 310
dbnsfp.1000gp1.afr_af 0.6300813
dbnsfp.1000gp1.amr_ac 218
dbnsfp.1000gp1.amr_af 0.6022099
dbnsfp.1000gp1.asn_ac 299
dbnsfp.1000gp1.asn_af 0.5227273
dbnsfp.1000gp1.eur_ac 419
dbnsfp.1000gp1.eur_af 0.5527704
dbnsfp.aa NaN
dbnsfp.aa.aapos_fathmm ENSP00000362353:L260F
dbnsfp.aa.aapos_sift ENSP00000362353:L260F
dbnsfp.aa.alt F
dbnsfp.aa.codonpos 3
dbnsfp.aa.pos 260
dbnsfp.aa.ref L
dbnsfp.aa.refcodon TTA
dbnsfp.allele1 A
dbnsfp.allele2 C
dbnsfp.ancestral_allele C
... ...
evs.hgvs.coding c.780A>C
evs.hgvs.protein p.(L260F)
evs.ma_fin_percent.african_american 41.9655
evs.ma_fin_percent.all 43.3877
evs.ma_fin_percent.european_american 44.1163
evs.on_illumina_human_exome_chip yes
evs.polyphen2 NaN
evs.polyphen2.class b
evs.polyphen2.score e
evs.ref_base_ncbi A
evs.rs_id rs1042044
gwassnps.allele1 NaN
gwassnps.allele2 NaN
gwassnps.chrom NaN
gwassnps.chromEnd NaN
gwassnps.chromStart NaN
gwassnps.pubmedID NaN
gwassnps.rsid NaN
gwassnps.trait NaN
mutdb.allele1 A
mutdb.allele2 C
mutdb.chrom 6
mutdb.chromEnd 3.90415e+07
mutdb.chromStart 3.90415e+07
mutdb.cosmic_id None
mutdb.dbsnp_id rs1042044
mutdb.mutpred_score 0.552
mutdb.strand p
mutdb.uniprot_id VAR_015098
snpedia.text NaN

242 rows × 1 columns

Query and Parsing - Annotations from MyVariant.info

fiSSEA Functions

MyVariant.info Query and Parsing


In [4]:
def myvariant_post(hgvs_list):
    '''
    Query and Parser for myvariant.info
    Parses Raw Elastic Search results into
    pandas dataframe
    
    Parameters
    -------------
    hgvs_list: list, required
        
        
    Output
    -------------
    pandas df:
        normalized json of myvariant results
    '''
    
    def normalize(con):
        '''
        This function uses the json.loads and json_normalize
        modules to flatten the myvariant.info results
        '''
        temp_df = json_normalize(json.loads(con), 'docs')
        source_parsed = []
        for source_json in temp_df['_source'].values:
            try:
                source_parsed.append(json_normalize(source_json))
            except TypeError:  #occurs with empty results
                pass
        return pd.concat(source_parsed)
    
    
    
    if type(hgvs_list) == list:
        hgvs_list = ','.join(hgvs_list)
        
    assert type(hgvs_list) == str
    
    h = httplib2.Http()
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    params = 'ids=' + hgvs_list +",size=1000"  #can pass a dictionary
    #print params
    res, con = h.request('http://myvariant.info:8001/api/variant/', 'POST', params, headers=headers)
    
    
    mv_df = normalize(con)
    mv_df.index = mv_df['_id']

    return mv_df

In [ ]:
def get_myvariant_snp_annot(vcf_df_mv):
    '''
    Returns myvariant.info annotations for all snps for
    input vcf pandas dataframe. Submits myvariant.info
    POST request in snp batches of 1000.
    
    Parameters
    -------------
    vcf_df_mv: pandas df; required
    
        vcf_df_mv must contain the following columns:
            vartype1, vartype2, CHROM, REF, POS, a1, a2
            *please see pandasVCF docs for column definitions
            

    Output
    -------------
    myvariant.info results: pandas df
    
    '''
    
    def chunks(l, n):
        """ 
        Yield successive n-sized chunks from l.
        """
        l = list(l)
        chunk_list = []
        for i in xrange(0, len(l), n):
            chunk_list.append( l[i:i+n] )
        return chunk_list
    
    
 
    if 'CHROM' not in vcf_df_mv.columns:
        vcf_df_mv.reset_index(inplace=True)
        
    
    vcf_df_mv = vcf_df_mv[(vcf_df_mv['vartype1'] == 'snp') | (vcf_df_mv['vartype2'] == 'snp')] #restrict to snps

    non_ref_a1 = vcf_df_mv[vcf_df_mv['REF'] != vcf_df_mv['a1']][['CHROM', 'POS', 'REF', 'a1']]  #restrict a1 to non-reference alleles
    hgvs_a1 = 'chr' + non_ref_a1['CHROM'].astype(str) + ":g." + non_ref_a1['POS'].astype(str)  \
                                 + non_ref_a1['REF'] + '>' +  non_ref_a1['a1']

    non_ref_a2 = vcf_df_mv[vcf_df_mv['REF'] != vcf_df_mv['a2']][['CHROM', 'POS', 'REF', 'a2']]  #restrict a2 to non-reference alleles
    hgvs_a2 = 'chr' + non_ref_a2['CHROM'].astype(str) + ":g." + non_ref_a2['POS'].astype(str) \
                                 + non_ref_a2['REF'] + '>' +  non_ref_a2['a2']


    hgvs_ids = set(hgvs_a1.values) | set(hgvs_a2.values)  #set of non-reference a1 and a2 snps

    hgvs_ids_chunks = chunks(hgvs_ids, 1000)  #batch queries of hgvs a1 and a2 variants, n=1000

    myvariant_results = map(myvariant_post, hgvs_ids_chunks)
        
    return myvariant_results

fiSSEA Function


In [11]:
class fiSSEA(object):

    
    def __init__(self, vcf_file_path, sample_id='', chunksize=100000, fiSSEA_score='dbnsfp.cadd.phred', gene_stat=np.mean):
        
        
        #setting fiSSEA_score
        self.fiSSEA_score = fiSSEA_score
        
        #setting gene statistic
        self.gene_stat = gene_stat
        
        #open Vcf object
        vcf_df_obj = Vcf(vcf_file_path, sample_id, ['#CHROM', 'POS', 'REF', 'ALT', 'ID', 'FILTER', 'FORMAT'], chunksize=chunksize)

        #read in vcf chunk
        vcf_df_obj.get_vcf_df_chunk()

        #drop duplicate vcf lines, if necessary
        vcf_df_obj.df = vcf_df_obj.df.drop_duplicates(subset=['CHROM', 'POS'])

        #remove empty variant lines
        vcf_df_obj.df = vcf_df_obj.df[vcf_df_obj.df.ALT!='.']

        #replace chr if necessary, depends on format of annotations
        vcf_df_obj.df['CHROM'] = vcf_df_obj.df['CHROM'].str.replace('chr', '')

        #parse vcf lines
        vcf_df_obj.add_variant_annotations( inplace=True, verbose=True)
        
        #settting vcf_df as class object
        self.vcf_df = vcf_df_obj.df

        #get myvariant.info annotations, create fiSSEA dataframe
        self.fiSSEA_status = self.get_fi_scores()
        

    
    def get_multigene_var_records(self, df_line):
        '''
        Splits multigene variants into separate
        rows
        '''
        genes = df_line['dbnsfp.genename']
        gene_series = [] #list of gene variants
        for gene in genes: #iterate through all genes intersecting this variant
            df_line_copy = df_line.copy()  #avoids aliasing
            del df_line_copy['dbnsfp.genename']  #removing genename to allow gene insertion
            df_line_copy['dbnsfp.genename'] = gene
            gene_series.append(df_line_copy)

        return gene_series
    
    
    
    def get_fi_scores(self):
        '''
        Creates fi_scores series, index on gene symbol, value fi_score.
        '''
        
        #Retrieving myvariant.info annotations for all snps
        myvariant_results = get_myvariant_snp_annot(self.vcf_df) #query and parse myvariant.info using all snps in self.vcf_df
        self.mv_annot = pd.concat(myvariant_results)  #setting dataframe of parsed myvariant.info results

        #reducing size of dataframe for fiSSEA
        fiSSEA_cols = ['dbnsfp.genename', self.fiSSEA_score]
        mv_annot_fiSSEA = self.mv_annot[fiSSEA_cols] 
        
        #dropping variants without a genename & filling fi_score NaN with 0
        mv_annot_fiSSEA.dropna(subset=['dbnsfp.genename'], inplace=True)  
        mv_annot_fiSSEA[self.fiSSEA_score].fillna(value=0)  

        
        #Splitting variants intersecting multiple genes into separate rows
        mv_annot_fiSSEA['dbnsfp.genename_type'] = mv_annot_fiSSEA['dbnsfp.genename'].map(type)
        mv_annot_fiSSEA_multigene = mv_annot_fiSSEA[mv_annot_fiSSEA['dbnsfp.genename_type']==list]
        mv_annot_fiSSEA = mv_annot_fiSSEA[~mv_annot_fiSSEA.index.isin(mv_annot_fiSSEA_multigene.index)]
        
        if len(mv_annot_fiSSEA_multigene) > 0:
            mv_annot_fiSSEA_multigene['index'] = mv_annot_fiSSEA_multigene.index
            mv_annot_fiSSEA_multigene.drop_duplicates(subset='index', inplace=True)
            mv_annot_fiSSEA_multigene = mv_annot_fiSSEA_multigene.apply(self.get_multigene_var_records, axis=1)
            mv_annot_fiSSEA_multigene = [n for i in mv_annot_fiSSEA_multigene for n in i]
            mv_annot_fiSSEA = mv_annot_fiSSEA.append(pd.DataFrame(mv_annot_fiSSEA_multigene))
            del mv_annot_fiSSEA_multigene
        
        gc.collect()

        
        #Setting gene specific functional impact score
        self.fi_scores = mv_annot_fiSSEA.groupby('dbnsfp.genename')[self.fiSSEA_score].aggregate(self.gene_stat)
        
        return 0

In [ ]: