In [1]:
import os, sys, time, gc, argparse
import json, httplib2
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import subprocess
In [2]:
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)
pd.options.mode.chained_assignment = None #supressing the warnings
In [3]:
sys.path.append('/Users/ers_vader/git/pandasVCF/src/multi_sample/')
from pandasVCFmulti import *
sys.path.append('../myvariant/src/')
import myvariant
mv = myvariant.MyVariantInfo()
sys.path.append('../src/')
from fiSSEA_multi import *
In [4]:
def intersect_cds_vcf(vcf_path, cds_bed, write_path):
'''
Convenience function to create coding snp vcf file
'''
os.system('bedtools intersect -a ' + vcf_path + ' -b ' + cds_bed + ' -wa -header |uniq > ' + write_path )
return write_path
In [5]:
#MULTI-SAMPLE VCF
example_vcf = '../data/SWGR_titin.vcf.gz'
sample_id = 'all'
chunk = 1500
fi_score = 'dbnsfp.cadd.phred'
#Set the path to the gsea .jar file
gsea_jar = '../gsea/gsea2-2.1.0.jar'
#Set the path to the tissue specific gene set
tissue_genesets_path = '../gsea/genesets/EnrichmentValues_Primary_U133A_ave_percent_over70percent_no_brain.gmt'
#Set the path to write GSEA results
output_dir_path = '../gsea/output/'
In [6]:
%time fiSSEA_results = fiSSEA(example_vcf, sample_id=sample_id, chunksize=1000, \
fiSSEA_score=fi_score, gene_stat=np.max, n_cores=12)
In [9]:
fiSSEA_results.vcf_df.head()
Out[9]:
In [10]:
fiSSEA_results.fi_scores.head(10)
Out[10]:
In [11]:
fiSSEA_results.fi_scores.T.head(10)
Out[11]:
In [8]:
fiSSEA_results.fi_scores.T.hist(bins=20)
Out[8]:
In [5]:
#SINGLE-SAMPLE VCF
example_vcf = '../data/snyderome_chr6_cds.vcf.gz'
sample_id = 'TB0001907'
chunk = 1500
fi_score = 'dbnsfp.cadd.phred'
#Set the path to the gsea .jar file
gsea_jar = '../gsea/gsea2-2.1.0.jar'
#Set the path to the tissue specific gene set
tissue_genesets_path = '../gsea/genesets/EnrichmentValues_Primary_U133A_ave_percent_over70percent_no_brain.gmt'
#Set the path to write GSEA results
output_dir_path = '../gsea/output/'
In [6]:
%time fiSSEA_results = fiSSEA(example_vcf, sample_id=sample_id, chunksize=1000, \
fiSSEA_score=fi_score, gene_stat=np.max, n_cores=2)
In [7]:
fiSSEA_results.vcf_df.head()
Out[7]:
In [8]:
fiSSEA_results.fi_scores.head(10)
Out[8]:
In [9]:
fiSSEA_results.fi_scores.hist(bins=100)
Out[9]:
In [18]:
print gsea_jar
print tissue_genesets_path
print output_dir_path
print sample_id
%time fiSSEA_results.run_GSEA_preranked(gsea_jar, tissue_genesets_path, output_dir_path, sample_id)
Out[18]:
In [17]:
#open_GSEA_html will open a browser window with the results
fiSSEA_results.open_GSEA_html()
Out[17]:
In [5]:
import os, sys, time, gc
import pandas as pd
import numpy as np
import subprocess
class fi_scores_GSEA(object):
def __init__(self, fi_scores, gsea_jar_path, tissue_geneset_path, output_dir_path, single_sample_id):
self.fi_scores = fi_scores
self.single_sample_id = single_sample_id
self.sample_id = single_sample_id
#Setting GSEA parameter paths
self.gsea_jar_path = gsea_jar_path
self.rnk_write_dir = output_dir_path
self.tissue_geneset_path = tissue_geneset_path
self.output_dir_path = output_dir_path
def write_rnk_file(self):
'''
Writes functional impact scores to .rnk file in
the fiSSEA/gsea/rnk/ directory.
Returns path to rnk file
'''
fiSSEA_results_write = self.fi_scores[[self.single_sample_id]]
fiSSEA_results_write.index.name = '#' + self.single_sample_id
rnk_path = self.rnk_write_dir + self.single_sample_id + '.rnk'
self.rnk_path = rnk_path
if not os.path.exists(self.rnk_write_dir):
os.makedirs(self.rnk_write_dir)
print len(fiSSEA_results_write)
fiSSEA_results_write = fiSSEA_results_write[fiSSEA_results_write.astype(float) != 0.]
print len(fiSSEA_results_write)
fiSSEA_results_write.dropna(inplace=True)
fiSSEA_results_write.to_csv(self.rnk_path, sep='\t', header=True)
return True
def call_preranked_gsea(self):
'''
Calls GSEA PreRanked Analysis using several default parameters
'''
gsea_cmd = ['java', '-cp', self.gsea_jar_path, '-Xmx512m', 'xtools.gsea.GseaPreranked', \
'-gmx', self.tissue_geneset_path, '-collapse false', '-mode Max_probe', \
'-nperm 1000', '-rnk', self.rnk_path, '-scoring_scheme classic', \
'-include_only_symbols true', \
'-rpt_label', self.sample_id, '-chip gseaftp.broadinstitute.org://pub/gsea/annotations/GENE_SYMBOL.chip', \
'-include_only_symbols true', '-make_sets true', \
'-rnd_seed 149', '-set_max 15000', '-set_min 10', '-zip_report true', \
'-out', self.output_dir_path, '-gui false']
#'-plot_top_x 20',
p = subprocess.Popen(gsea_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
self.gsea_cmd = gsea_cmd
run_stdout = [">>> " + line.rstrip() for line in iter(p.stdout.readline, b'')]
return run_stdout
def run_GSEA_preranked(self):
'''
Calls GSEA jar for a PreRanked analysis.
Parameters
-------------
gsea_jar_path: str, required
Specifies the path to GSEA jar, e.g. /Users/bin/gsea/gsea2-2.1.0.jar
Can be downloaded from http://www.broadinstitute.org/gsea/downloads.jsp
rnk_write_dir: str, required
Specifies the directory path where the rnk file should be written
e.g. /Users/data/gsea/rnk/
tissue_geneset_path: str, required
Specifies the path to the tissue genesets (.gmt format)
e.g. /Users/data/gsea/genesets/EnrichmentValues_Primary_U133A_ave_percent_over70percent_no_brain.gmt
output_dir_path: str, required
Specifies the directory path where the GSEA output should be written
e.g. /Users/data/gsea/results/
Returns
-------------
Writes GSEA PreRanked Results to output_dir
'''
#Writing the rnk file to disk
self.write_rnk_file()
#Run PreRanked Gene Set Enrichment Analysis, please see run_preranked_gsea function to change
#parameters
#Returns 0 if successful run
run_stdout = self.call_preranked_gsea()
self.run_stdout = run_stdout
last_written_dir = os.popen('ls -lht ' + self.output_dir_path).readlines()
last_written_dir = last_written_dir[1].split()[-1]
self.last_written_dir = last_written_dir #likely the path to the output files
return 0
def open_GSEA_html(self):
'''
Polls the user-specified output directory for the last written
file and opens the index.html file in that directory
'''
#Opening GSEA HTML Results Report
os.system('open ' + self.output_dir_path + self.last_written_dir + '/index.html')
return 0