Functional Impact SNP Set Enrichment (fiSSEA)

Authors: Erick R. Scott, Adam Mark, Ali Torkamani, Chunlei Wu & Andrew Su

November 2014

Libraries



In [1]:

    
import os, sys, time, gc, argparse
import json, httplib2
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import subprocess

PandasVCF, MyVariant.info, fiSSEA Libraries



In [2]:

    
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)
pd.options.mode.chained_assignment = None #supressing the warnings



In [3]:

    
sys.path.append('/Users/ers_vader/git/pandasVCF/src/multi_sample/')
from pandasVCFmulti import *

sys.path.append('../myvariant/src/')
import myvariant
mv = myvariant.MyVariantInfo()

sys.path.append('../src/')
from fiSSEA_multi import *

VCF CDS Variants



In [4]:

    
def intersect_cds_vcf(vcf_path, cds_bed, write_path):
    '''
    Convenience function to create coding snp vcf file
    '''
    os.system('bedtools intersect -a ' + vcf_path + ' -b ' + cds_bed + ' -wa -header |uniq > ' + write_path )
    return write_path

Running fiSSEA

GSEA Command Line Run

Please download the gsea.jar source from http://www.broadinstitute.org/gsea/downloads.jsp

We have included the tissue-specific gene set .gmt data derived from the Xavier Lab's Enrichment Profiler dataset (http://xavierlab2.mgh.harvard.edu/EnrichmentProfiler/download.html) in the /fiSSEA/gsea/genesets/ directory

Multi-Sample



In [5]:

    
#MULTI-SAMPLE VCF
example_vcf = '../data/SWGR_titin.vcf.gz'
sample_id = 'all'

chunk = 1500
fi_score = 'dbnsfp.cadd.phred'
#Set the path to the gsea .jar file
gsea_jar = '../gsea/gsea2-2.1.0.jar'

#Set the path to the tissue specific gene set
tissue_genesets_path = '../gsea/genesets/EnrichmentValues_Primary_U133A_ave_percent_over70percent_no_brain.gmt'

#Set the path to write GSEA results
output_dir_path = '../gsea/output/'

fiSSEA_multi Scripps Wellderly Genome Resource Titin Variants



In [6]:

    
%time fiSSEA_results = fiSSEA(example_vcf, sample_id=sample_id, chunksize=1000, \
                              fiSSEA_score=fi_score, gene_stat=np.max, n_cores=12)









    



End of File Reached
CPU times: user 7.05 s, sys: 604 ms, total: 7.66 s
Wall time: 12.2 s

DataFrame used to make the gene functional impact scores



In [9]:

    
fiSSEA_results.vcf_df.head()









    Out[9]:






  
    
      
      
      
      
      sample_ids
      a1
      a2
      vartype1
      vartype2
      dbnsfp.cadd.phred_a1
      dbnsfp.genename
      dbnsfp.cadd.phred_a2
      dbnsfp.genename_type
      cadd_score_sum
    
    
      CHROM
      POS
      REF
      ALT
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      2
      179392262
      A
      G
       SHUFFLE_WELLDERLY_EA_424
       G
       A
       snp
       ref
       13.44
       TTN
       0
       <type 'unicode'>
       13.44
    
    
      179392340
      C
      T
       SHUFFLE_WELLDERLY_EA_402
       T
       C
       snp
       ref
       12.93
       TTN
       0
       <type 'unicode'>
       12.93
    
    
      179392342
      G
      T,A
        SHUFFLE_WELLDERLY_EA_24
       T
       G
       snp
       ref
       11.85
       TTN
       0
       <type 'unicode'>
       11.85
    
    
      179392350
      A
      T
       SHUFFLE_WELLDERLY_EA_344
       T
       A
       snp
       ref
       14.49
       TTN
       0
       <type 'unicode'>
       14.49
    
    
      179392352
      C
      T
       SHUFFLE_WELLDERLY_EA_407
       T
       C
       snp
       ref
       16.40
       TTN
       0
       <type 'unicode'>
       16.40

Function Impact Scores



In [10]:

    
fiSSEA_results.fi_scores.head(10)









    Out[10]:






  
    
      sample_ids
      SHUFFLE_WELLDERLY_EA_0
      SHUFFLE_WELLDERLY_EA_1
      SHUFFLE_WELLDERLY_EA_10
      SHUFFLE_WELLDERLY_EA_100
      SHUFFLE_WELLDERLY_EA_101
      SHUFFLE_WELLDERLY_EA_102
      SHUFFLE_WELLDERLY_EA_103
      SHUFFLE_WELLDERLY_EA_104
      SHUFFLE_WELLDERLY_EA_105
      SHUFFLE_WELLDERLY_EA_106
      ...
      SHUFFLE_WELLDERLY_EA_90
      SHUFFLE_WELLDERLY_EA_91
      SHUFFLE_WELLDERLY_EA_92
      SHUFFLE_WELLDERLY_EA_93
      SHUFFLE_WELLDERLY_EA_94
      SHUFFLE_WELLDERLY_EA_95
      SHUFFLE_WELLDERLY_EA_96
      SHUFFLE_WELLDERLY_EA_97
      SHUFFLE_WELLDERLY_EA_98
      SHUFFLE_WELLDERLY_EA_99
    
    
      dbnsfp.genename
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      TTN
       35.16
       27.74
       35.16
       35.16
       35.16
       35.16
       35.16
       27.74
       35.16
       35.16
      ...
       35.16
       35.16
       35.16
       62
       35.16
       35.16
       35.16
       35.16
       35.16
       68
    
  

1 rows × 454 columns



In [11]:

    
fiSSEA_results.fi_scores.T.head(10)









    Out[11]:






  
    
      dbnsfp.genename
      TTN
    
    
      sample_ids
      
    
  
  
    
      SHUFFLE_WELLDERLY_EA_0
       35.16
    
    
      SHUFFLE_WELLDERLY_EA_1
       27.74
    
    
      SHUFFLE_WELLDERLY_EA_10
       35.16
    
    
      SHUFFLE_WELLDERLY_EA_100
       35.16
    
    
      SHUFFLE_WELLDERLY_EA_101
       35.16
    
    
      SHUFFLE_WELLDERLY_EA_102
       35.16
    
    
      SHUFFLE_WELLDERLY_EA_103
       35.16
    
    
      SHUFFLE_WELLDERLY_EA_104
       27.74
    
    
      SHUFFLE_WELLDERLY_EA_105
       35.16
    
    
      SHUFFLE_WELLDERLY_EA_106
       35.16



In [8]:

    
fiSSEA_results.fi_scores.T.hist(bins=20)









    Out[8]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x10d1d8610>]], dtype=object)

Single Sample



In [5]:

    
#SINGLE-SAMPLE VCF
example_vcf = '../data/snyderome_chr6_cds.vcf.gz'
sample_id = 'TB0001907'

chunk = 1500
fi_score = 'dbnsfp.cadd.phred'
#Set the path to the gsea .jar file
gsea_jar = '../gsea/gsea2-2.1.0.jar'

#Set the path to the tissue specific gene set
tissue_genesets_path = '../gsea/genesets/EnrichmentValues_Primary_U133A_ave_percent_over70percent_no_brain.gmt'

#Set the path to write GSEA results
output_dir_path = '../gsea/output/'



In [6]:

    
%time fiSSEA_results = fiSSEA(example_vcf, sample_id=sample_id, chunksize=1000, \
                              fiSSEA_score=fi_score, gene_stat=np.max, n_cores=2)









    



End of File Reached
CPU times: user 1.7 s, sys: 104 ms, total: 1.81 s
Wall time: 2.84 s



In [7]:

    
fiSSEA_results.vcf_df.head()









    Out[7]:






  
    
      
      
      
      
      a1
      a2
      dbnsfp.cadd.phred_a1
      dbnsfp.cadd.phred_a2
      dbnsfp.genename
      dbnsfp.genename_type
      sample_ids
      vartype1
      vartype2
      cadd_score_sum
    
    
      CHROM
      POS
      REF
      ALT
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      6
      348906 
      G
      A
       G
       A
        0.000
        0.166
       DUSP22
       <type 'unicode'>
       TB0001907
       ref
       snp
        0.166
    
    
      656143 
      C
      A
       A
       A
       10.480
       10.480
        HUS1B
       <type 'unicode'>
       TB0001907
       snp
       snp
       20.960
    
    
      1313078
      G
      A
       G
       A
        0.000
       12.030
        FOXQ1
       <type 'unicode'>
       TB0001907
       ref
       snp
       12.030
    
    
      1313117
      A
      C
       C
       C
        2.302
        2.302
        FOXQ1
       <type 'unicode'>
       TB0001907
       snp
       snp
        4.604
    
    
      1313121
      A
      C
       C
       C
        1.298
        1.298
        FOXQ1
       <type 'unicode'>
       TB0001907
       snp
       snp
        2.596



In [8]:

    
fiSSEA_results.fi_scores.head(10)









    Out[8]:





dbnsfp.genename
AARS2               2.672
ABCC10             18.640
ACAT2              58.400
AIM1               15.640
AKAP12             14.192
AKD1               24.200
ALDH5A1            11.300
ANKRD6             11.076
ANKS1A             14.730
ARHGAP18            8.878
Name: cadd_score_sum, dtype: float64



In [9]:

    
fiSSEA_results.fi_scores.hist(bins=100)









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x10d45dbd0>

Running the Gene Set Enrichment Analysis



In [18]:

    
print gsea_jar
print tissue_genesets_path
print output_dir_path
print sample_id


%time fiSSEA_results.run_GSEA_preranked(gsea_jar, tissue_genesets_path, output_dir_path, sample_id)









    



../gsea/gsea2-2.1.0.jar
../gsea/genesets/EnrichmentValues_Primary_U133A_ave_percent_over70percent_no_brain.gmt
../gsea/output/
TB0001907
CPU times: user 5.86 ms, sys: 8.93 ms, total: 14.8 ms
Wall time: 11 s






    Out[18]:





0



In [17]:

    
#open_GSEA_html will open a browser window with the results
fiSSEA_results.open_GSEA_html()









    Out[17]:





0

GSEA using user-supplied fi_scores DataFrame



In [5]:

    
import os, sys, time, gc
import pandas as pd
import numpy as np
import subprocess



class fi_scores_GSEA(object):
    
    def __init__(self, fi_scores, gsea_jar_path, tissue_geneset_path, output_dir_path, single_sample_id):
   
        self.fi_scores = fi_scores
        self.single_sample_id = single_sample_id
        self.sample_id = single_sample_id

        #Setting GSEA parameter paths
        self.gsea_jar_path = gsea_jar_path
        self.rnk_write_dir = output_dir_path
        self.tissue_geneset_path = tissue_geneset_path
        self.output_dir_path = output_dir_path
        
        
        
    
    def write_rnk_file(self):
        '''
        Writes functional impact scores to .rnk file in
        the fiSSEA/gsea/rnk/ directory.

        Returns path to rnk file
        '''

        fiSSEA_results_write = self.fi_scores[[self.single_sample_id]]
        fiSSEA_results_write.index.name = '#' + self.single_sample_id


        rnk_path = self.rnk_write_dir + self.single_sample_id + '.rnk'
        self.rnk_path = rnk_path

        if not os.path.exists(self.rnk_write_dir):
                os.makedirs(self.rnk_write_dir)
            
        print len(fiSSEA_results_write)
        fiSSEA_results_write = fiSSEA_results_write[fiSSEA_results_write.astype(float) != 0.]
        print len(fiSSEA_results_write)
        fiSSEA_results_write.dropna(inplace=True)
        fiSSEA_results_write.to_csv(self.rnk_path, sep='\t', header=True)
        return True



    def call_preranked_gsea(self):
        '''
        Calls GSEA PreRanked Analysis using several default parameters
        '''

        gsea_cmd = ['java', '-cp', self.gsea_jar_path, '-Xmx512m', 'xtools.gsea.GseaPreranked', \
                '-gmx', self.tissue_geneset_path, '-collapse false', '-mode Max_probe', \
                 '-nperm 1000', '-rnk', self.rnk_path, '-scoring_scheme classic', \
                 '-include_only_symbols true', \
                '-rpt_label', self.sample_id,  '-chip gseaftp.broadinstitute.org://pub/gsea/annotations/GENE_SYMBOL.chip', \
                '-include_only_symbols true', '-make_sets true', \
                '-rnd_seed 149', '-set_max 15000', '-set_min 10', '-zip_report true', \
                '-out', self.output_dir_path, '-gui false']
        
                #'-plot_top_x 20',


        p = subprocess.Popen(gsea_cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
        self.gsea_cmd = gsea_cmd
        run_stdout = [">>> " + line.rstrip() for line in iter(p.stdout.readline, b'')]
        return run_stdout



    def run_GSEA_preranked(self):
        '''
        Calls GSEA jar for a PreRanked analysis.

        Parameters
        -------------
        gsea_jar_path: str, required
            Specifies the path to GSEA jar, e.g. /Users/bin/gsea/gsea2-2.1.0.jar
                Can be downloaded from http://www.broadinstitute.org/gsea/downloads.jsp

        rnk_write_dir: str, required
            Specifies the directory path where the rnk file should be written
                e.g. /Users/data/gsea/rnk/

        tissue_geneset_path: str, required
            Specifies the path to the tissue genesets (.gmt format)
                e.g.  /Users/data/gsea/genesets/EnrichmentValues_Primary_U133A_ave_percent_over70percent_no_brain.gmt

        output_dir_path: str, required
            Specifies the directory path where the GSEA output should be written
                e.g. /Users/data/gsea/results/

        Returns
        -------------
        Writes GSEA PreRanked Results to output_dir

        '''

        

        #Writing the rnk file to disk
        self.write_rnk_file()

        #Run PreRanked Gene Set Enrichment Analysis, please see run_preranked_gsea function to change
        #parameters
        #Returns 0 if successful run
        run_stdout = self.call_preranked_gsea()
        self.run_stdout = run_stdout


        last_written_dir = os.popen('ls -lht ' + self.output_dir_path).readlines()
        last_written_dir = last_written_dir[1].split()[-1]
        self.last_written_dir = last_written_dir  #likely the path to the output files

        return 0


    def open_GSEA_html(self):
        '''
        Polls the user-specified output directory for the last written
        file and opens the index.html file in that directory
        '''
        #Opening GSEA HTML Results Report
        os.system('open ' + self.output_dir_path + self.last_written_dir + '/index.html')
        return 0

				sample_ids	a1	a2	vartype1	vartype2	dbnsfp.cadd.phred_a1	dbnsfp.genename	dbnsfp.cadd.phred_a2	dbnsfp.genename_type	cadd_score_sum
CHROM	POS	REF	ALT
2	179392262	A	G	SHUFFLE_WELLDERLY_EA_424	G	A	snp	ref	13.44	TTN	0	<type 'unicode'>	13.44
	179392340	C	T	SHUFFLE_WELLDERLY_EA_402	T	C	snp	ref	12.93	TTN	0	<type 'unicode'>	12.93
	179392342	G	T,A	SHUFFLE_WELLDERLY_EA_24	T	G	snp	ref	11.85	TTN	0	<type 'unicode'>	11.85
	179392350	A	T	SHUFFLE_WELLDERLY_EA_344	T	A	snp	ref	14.49	TTN	0	<type 'unicode'>	14.49
	179392352	C	T	SHUFFLE_WELLDERLY_EA_407	T	C	snp	ref	16.40	TTN	0	<type 'unicode'>	16.40

sample_ids	SHUFFLE_WELLDERLY_EA_0	SHUFFLE_WELLDERLY_EA_1	SHUFFLE_WELLDERLY_EA_10	SHUFFLE_WELLDERLY_EA_100	SHUFFLE_WELLDERLY_EA_101	SHUFFLE_WELLDERLY_EA_102	SHUFFLE_WELLDERLY_EA_103	SHUFFLE_WELLDERLY_EA_104	SHUFFLE_WELLDERLY_EA_105	SHUFFLE_WELLDERLY_EA_106	...	SHUFFLE_WELLDERLY_EA_90	SHUFFLE_WELLDERLY_EA_91	SHUFFLE_WELLDERLY_EA_92	SHUFFLE_WELLDERLY_EA_93	SHUFFLE_WELLDERLY_EA_94	SHUFFLE_WELLDERLY_EA_95	SHUFFLE_WELLDERLY_EA_96	SHUFFLE_WELLDERLY_EA_97	SHUFFLE_WELLDERLY_EA_98	SHUFFLE_WELLDERLY_EA_99
dbnsfp.genename
TTN	35.16	27.74	35.16	35.16	35.16	35.16	35.16	27.74	35.16	35.16	...	35.16	35.16	35.16	62	35.16	35.16	35.16	35.16	35.16	68

dbnsfp.genename	TTN
sample_ids
SHUFFLE_WELLDERLY_EA_0	35.16
SHUFFLE_WELLDERLY_EA_1	27.74
SHUFFLE_WELLDERLY_EA_10	35.16
SHUFFLE_WELLDERLY_EA_100	35.16
SHUFFLE_WELLDERLY_EA_101	35.16
SHUFFLE_WELLDERLY_EA_102	35.16
SHUFFLE_WELLDERLY_EA_103	35.16
SHUFFLE_WELLDERLY_EA_104	27.74
SHUFFLE_WELLDERLY_EA_105	35.16
SHUFFLE_WELLDERLY_EA_106	35.16

				a1	a2	dbnsfp.cadd.phred_a1	dbnsfp.cadd.phred_a2	dbnsfp.genename	dbnsfp.genename_type	sample_ids	vartype1	vartype2	cadd_score_sum
CHROM	POS	REF	ALT
6	348906	G	A	G	A	0.000	0.166	DUSP22	<type 'unicode'>	TB0001907	ref	snp	0.166
	656143	C	A	A	A	10.480	10.480	HUS1B	<type 'unicode'>	TB0001907	snp	snp	20.960
	1313078	G	A	G	A	0.000	12.030	FOXQ1	<type 'unicode'>	TB0001907	ref	snp	12.030
	1313117	A	C	C	C	2.302	2.302	FOXQ1	<type 'unicode'>	TB0001907	snp	snp	4.604
	1313121	A	C	C	C	1.298	1.298	FOXQ1	<type 'unicode'>	TB0001907	snp	snp	2.596