Functional Impact SNP Set Enrichment (fiSSEA)

Authors: Erick R. Scott, Adam Mark, Ali Torkamani, Chunlei Wu & Andrew Su

November 2014

Libraries


In [1]:
import os, sys, time, gc, argparse
import json, httplib2
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import subprocess

PandasVCF, MyVariant.info, fiSSEA Libraries


In [2]:
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)
pd.options.mode.chained_assignment = None #supressing the warnings

In [3]:
sys.path.append('/Users/ers_vader/git/pandasVCF/src/multi_sample/')
from pandasVCFmulti import *

sys.path.append('../myvariant/src/')
import myvariant
mv = myvariant.MyVariantInfo()

sys.path.append('../src/')
from fiSSEA_multi import *

VCF CDS Variants


In [4]:
def intersect_cds_vcf(vcf_path, cds_bed, write_path):
    '''
    Convenience function to create coding snp vcf file
    '''
    os.system('bedtools intersect -a ' + vcf_path + ' -b ' + cds_bed + ' -wa -header |uniq > ' + write_path )
    return write_path

Running fiSSEA

GSEA Command Line Run

    Please download the gsea.jar source from http://www.broadinstitute.org/gsea/downloads.jsp
    We have included the tissue-specific gene set .gmt data derived from the Xavier Lab's Enrichment Profiler dataset (http://xavierlab2.mgh.harvard.edu/EnrichmentProfiler/download.html) in the /fiSSEA/gsea/genesets/ directory

Multi-Sample


In [5]:
#MULTI-SAMPLE VCF
example_vcf = '../data/SWGR_titin.vcf.gz'
sample_id = 'all'

chunk = 1500
fi_score = 'dbnsfp.cadd.phred'
#Set the path to the gsea .jar file
gsea_jar = '../gsea/gsea2-2.1.0.jar'

#Set the path to the tissue specific gene set
tissue_genesets_path = '../gsea/genesets/EnrichmentValues_Primary_U133A_ave_percent_over70percent_no_brain.gmt'

#Set the path to write GSEA results
output_dir_path = '../gsea/output/'

fiSSEA_multi Scripps Wellderly Genome Resource Titin Variants


In [6]:
%time fiSSEA_results = fiSSEA(example_vcf, sample_id=sample_id, chunksize=1000, \
                              fiSSEA_score=fi_score, gene_stat=np.max, n_cores=12)


End of File Reached
CPU times: user 7.05 s, sys: 604 ms, total: 7.66 s
Wall time: 12.2 s

DataFrame used to make the gene functional impact scores


In [9]:
fiSSEA_results.vcf_df.head()


Out[9]:
sample_ids a1 a2 vartype1 vartype2 dbnsfp.cadd.phred_a1 dbnsfp.genename dbnsfp.cadd.phred_a2 dbnsfp.genename_type cadd_score_sum
CHROM POS REF ALT
2 179392262 A G SHUFFLE_WELLDERLY_EA_424 G A snp ref 13.44 TTN 0 <type 'unicode'> 13.44
179392340 C T SHUFFLE_WELLDERLY_EA_402 T C snp ref 12.93 TTN 0 <type 'unicode'> 12.93
179392342 G T,A SHUFFLE_WELLDERLY_EA_24 T G snp ref 11.85 TTN 0 <type 'unicode'> 11.85
179392350 A T SHUFFLE_WELLDERLY_EA_344 T A snp ref 14.49 TTN 0 <type 'unicode'> 14.49
179392352 C T SHUFFLE_WELLDERLY_EA_407 T C snp ref 16.40 TTN 0 <type 'unicode'> 16.40

Function Impact Scores


In [10]:
fiSSEA_results.fi_scores.head(10)


Out[10]:
sample_ids SHUFFLE_WELLDERLY_EA_0 SHUFFLE_WELLDERLY_EA_1 SHUFFLE_WELLDERLY_EA_10 SHUFFLE_WELLDERLY_EA_100 SHUFFLE_WELLDERLY_EA_101 SHUFFLE_WELLDERLY_EA_102 SHUFFLE_WELLDERLY_EA_103 SHUFFLE_WELLDERLY_EA_104 SHUFFLE_WELLDERLY_EA_105 SHUFFLE_WELLDERLY_EA_106 ... SHUFFLE_WELLDERLY_EA_90 SHUFFLE_WELLDERLY_EA_91 SHUFFLE_WELLDERLY_EA_92 SHUFFLE_WELLDERLY_EA_93 SHUFFLE_WELLDERLY_EA_94 SHUFFLE_WELLDERLY_EA_95 SHUFFLE_WELLDERLY_EA_96 SHUFFLE_WELLDERLY_EA_97 SHUFFLE_WELLDERLY_EA_98 SHUFFLE_WELLDERLY_EA_99
dbnsfp.genename
TTN 35.16 27.74 35.16 35.16 35.16 35.16 35.16 27.74 35.16 35.16 ... 35.16 35.16 35.16 62 35.16 35.16 35.16 35.16 35.16 68

1 rows × 454 columns


In [11]:
fiSSEA_results.fi_scores.T.head(10)


Out[11]:
dbnsfp.genename TTN
sample_ids
SHUFFLE_WELLDERLY_EA_0 35.16
SHUFFLE_WELLDERLY_EA_1 27.74
SHUFFLE_WELLDERLY_EA_10 35.16
SHUFFLE_WELLDERLY_EA_100 35.16
SHUFFLE_WELLDERLY_EA_101 35.16
SHUFFLE_WELLDERLY_EA_102 35.16
SHUFFLE_WELLDERLY_EA_103 35.16
SHUFFLE_WELLDERLY_EA_104 27.74
SHUFFLE_WELLDERLY_EA_105 35.16
SHUFFLE_WELLDERLY_EA_106 35.16

In [8]:
fiSSEA_results.fi_scores.T.hist(bins=20)


Out[8]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x10d1d8610>]], dtype=object)

Single Sample


In [5]:
#SINGLE-SAMPLE VCF
example_vcf = '../data/snyderome_chr6_cds.vcf.gz'
sample_id = 'TB0001907'

chunk = 1500
fi_score = 'dbnsfp.cadd.phred'
#Set the path to the gsea .jar file
gsea_jar = '../gsea/gsea2-2.1.0.jar'

#Set the path to the tissue specific gene set
tissue_genesets_path = '../gsea/genesets/EnrichmentValues_Primary_U133A_ave_percent_over70percent_no_brain.gmt'

#Set the path to write GSEA results
output_dir_path = '../gsea/output/'

In [6]:
%time fiSSEA_results = fiSSEA(example_vcf, sample_id=sample_id, chunksize=1000, \
                              fiSSEA_score=fi_score, gene_stat=np.max, n_cores=2)


End of File Reached
CPU times: user 1.7 s, sys: 104 ms, total: 1.81 s
Wall time: 2.84 s

In [7]:
fiSSEA_results.vcf_df.head()


Out[7]:
a1 a2 dbnsfp.cadd.phred_a1 dbnsfp.cadd.phred_a2 dbnsfp.genename dbnsfp.genename_type sample_ids vartype1 vartype2 cadd_score_sum
CHROM POS REF ALT
6 348906 G A G A 0.000 0.166 DUSP22 <type 'unicode'> TB0001907 ref snp 0.166
656143 C A A A 10.480 10.480 HUS1B <type 'unicode'> TB0001907 snp snp 20.960
1313078 G A G A 0.000 12.030 FOXQ1 <type 'unicode'> TB0001907 ref snp 12.030
1313117 A C C C 2.302 2.302 FOXQ1 <type 'unicode'> TB0001907 snp snp 4.604
1313121 A C C C 1.298 1.298 FOXQ1 <type 'unicode'> TB0001907 snp snp 2.596

In [8]:
fiSSEA_results.fi_scores.head(10)


Out[8]:
dbnsfp.genename
AARS2               2.672
ABCC10             18.640
ACAT2              58.400
AIM1               15.640
AKAP12             14.192
AKD1               24.200
ALDH5A1            11.300
ANKRD6             11.076
ANKS1A             14.730
ARHGAP18            8.878
Name: cadd_score_sum, dtype: float64

In [9]:
fiSSEA_results.fi_scores.hist(bins=100)


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d45dbd0>

Running the Gene Set Enrichment Analysis


In [18]:
print gsea_jar
print tissue_genesets_path
print output_dir_path
print sample_id


%time fiSSEA_results.run_GSEA_preranked(gsea_jar, tissue_genesets_path, output_dir_path, sample_id)


../gsea/gsea2-2.1.0.jar
../gsea/genesets/EnrichmentValues_Primary_U133A_ave_percent_over70percent_no_brain.gmt
../gsea/output/
TB0001907
CPU times: user 5.86 ms, sys: 8.93 ms, total: 14.8 ms
Wall time: 11 s
Out[18]:
0

In [17]:
#open_GSEA_html will open a browser window with the results
fiSSEA_results.open_GSEA_html()


Out[17]:
0

GSEA using user-supplied fi_scores DataFrame


In [5]:
import os, sys, time, gc
import pandas as pd
import numpy as np
import subprocess



class fi_scores_GSEA(object):
    
    def __init__(self, fi_scores, gsea_jar_path, tissue_geneset_path, output_dir_path, single_sample_id):
   
        self.fi_scores = fi_scores
        self.single_sample_id = single_sample_id
        self.sample_id = single_sample_id

        #Setting GSEA parameter paths
        self.gsea_jar_path = gsea_jar_path
        self.rnk_write_dir = output_dir_path
        self.tissue_geneset_path = tissue_geneset_path
        self.output_dir_path = output_dir_path
        
        
        
    
    def write_rnk_file(self):
        '''
        Writes functional impact scores to .rnk file in
        the fiSSEA/gsea/rnk/ directory.

        Returns path to rnk file
        '''

        fiSSEA_results_write = self.fi_scores[[self.single_sample_id]]
        fiSSEA_results_write.index.name = '#' + self.single_sample_id


        rnk_path = self.rnk_write_dir + self.single_sample_id + '.rnk'
        self.rnk_path = rnk_path

        if not os.path.exists(self.rnk_write_dir):
                os.makedirs(self.rnk_write_dir)
            
        print len(fiSSEA_results_write)
        fiSSEA_results_write = fiSSEA_results_write[fiSSEA_results_write.astype(float) != 0.]
        print len(fiSSEA_results_write)
        fiSSEA_results_write.dropna(inplace=True)
        fiSSEA_results_write.to_csv(self.rnk_path, sep='\t', header=True)
        return True



    def call_preranked_gsea(self):
        '''
        Calls GSEA PreRanked Analysis using several default parameters
        '''

        gsea_cmd = ['java', '-cp', self.gsea_jar_path, '-Xmx512m', 'xtools.gsea.GseaPreranked', \
                '-gmx', self.tissue_geneset_path, '-collapse false', '-mode Max_probe', \
                 '-nperm 1000', '-rnk', self.rnk_path, '-scoring_scheme classic', \
                 '-include_only_symbols true', \
                '-rpt_label', self.sample_id,  '-chip gseaftp.broadinstitute.org://pub/gsea/annotations/GENE_SYMBOL.chip', \
                '-include_only_symbols true', '-make_sets true', \
                '-rnd_seed 149', '-set_max 15000', '-set_min 10', '-zip_report true', \
                '-out', self.output_dir_path, '-gui false']
        
                #'-plot_top_x 20',


        p = subprocess.Popen(gsea_cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
        self.gsea_cmd = gsea_cmd
        run_stdout = [">>> " + line.rstrip() for line in iter(p.stdout.readline, b'')]
        return run_stdout



    def run_GSEA_preranked(self):
        '''
        Calls GSEA jar for a PreRanked analysis.

        Parameters
        -------------
        gsea_jar_path: str, required
            Specifies the path to GSEA jar, e.g. /Users/bin/gsea/gsea2-2.1.0.jar
                Can be downloaded from http://www.broadinstitute.org/gsea/downloads.jsp

        rnk_write_dir: str, required
            Specifies the directory path where the rnk file should be written
                e.g. /Users/data/gsea/rnk/

        tissue_geneset_path: str, required
            Specifies the path to the tissue genesets (.gmt format)
                e.g.  /Users/data/gsea/genesets/EnrichmentValues_Primary_U133A_ave_percent_over70percent_no_brain.gmt

        output_dir_path: str, required
            Specifies the directory path where the GSEA output should be written
                e.g. /Users/data/gsea/results/

        Returns
        -------------
        Writes GSEA PreRanked Results to output_dir

        '''

        

        #Writing the rnk file to disk
        self.write_rnk_file()

        #Run PreRanked Gene Set Enrichment Analysis, please see run_preranked_gsea function to change
        #parameters
        #Returns 0 if successful run
        run_stdout = self.call_preranked_gsea()
        self.run_stdout = run_stdout


        last_written_dir = os.popen('ls -lht ' + self.output_dir_path).readlines()
        last_written_dir = last_written_dir[1].split()[-1]
        self.last_written_dir = last_written_dir  #likely the path to the output files

        return 0


    def open_GSEA_html(self):
        '''
        Polls the user-specified output directory for the last written
        file and opens the index.html file in that directory
        '''
        #Opening GSEA HTML Results Report
        os.system('open ' + self.output_dir_path + self.last_written_dir + '/index.html')
        return 0