In [3]:
import pandas as pd
import numpy as np
This is our main abstraction for dealing with files from CGhub. Here we store a number of attributes that are important for retreiving, and cleaning up files from CGHub. The process for downloading and cleaning up files is different using the standard GTDownload mechansim or the proprietary GTFuse functionality available on the Annai machine. This object is meant to abtract away the housekeeping required for different methods of downloading the data.
In [1]:
class CGFile(object):
gt_fuse = None
cghub = None
cache = None
def __init__(self, cg_vec):
self.barcode = cg_vec.name[0]
self.sample_type = cg_vec.name[1]
self.library_type = cg_vec.name[2]
self.cg_vec = cg_vec
self.analysis_id = cg_vec['analysis_id']
self.filename = cg_vec['filename']
self.mount_loc = None
self.bam = None
def mount(self, directory='.'):
self.mount_loc = '{}/{}_{}_{}'.format(directory, self.barcode,
self.sample_type, self.library_type)
self.bam = '{}/{}/{}'.format(self.mount_loc, self.analysis_id, self.filename)
init_folder = 'mkdir {}'.format(self.mount_loc)
mount_file = '{} {}/{} {}'.format(self.gt_fuse, self.cghub, self.analysis_id,
self.mount_loc)
return [init_folder, mount_file]
def unmount(self):
unmount_folder = 'fusermount -u {}'.format(self.mount_loc)
clear_cache = 'rm -rf {}/{}*'.format(self.cache, self.analysis_id)
clean_folder = 'rm -rf {}'.format(self.mount_loc)
return [unmount_folder, clear_cache, clean_folder]
def download(self, directory='.'):
self.file_loc = '{}/{}_{}_{}'.format(directory, self.barcode,
self.sample_type, self.library_type)
self.bam = '{}/{}/{}'.format(self.file_loc, self.analysis_id, self.filename)
init_folder = 'mkdir {}'.format(self.file_loc)
download_file = '{} {}/{} -p {}'.format(self.gt_download, self.cghub,
self.analysis_id, self.file_loc)
return [init_folder, download_file]
def clean_download(self):
clean_folder = 'rm -rf {}'.format(self.file_loc)
return [clean_folder]
This is an object to handing doing variant calling on matched tumor-normal genomes. We run MuTect and SomaticIndelDetector for SNVs and indels, respectively.
In [2]:
def sort_regions(regions):
'''
Helper function to sort regions in the order expected by the Broad tools. Takes
in a list of regions and returns them in sorted order, whith X and Y at the end.
'''
ss = map(lambda s: (int(s.replace('X', '24').replace('Y','25').split(':')[0]),
int(s.split(':')[1].split('-')[0])), regions)
ss = pd.Series(ss)
regions = list(np.array(regions)[list(ss.argsort())])
return regions
In [4]:
class MutationCalling(object):
def __init__(self, java, mutect_jar, indel_jar, reference, dbsnp, cosmic):
self.mutect = '{} -Xmx32g -jar {}'.format(java, mutect_jar)
self.mutect_args = {'analysis_type': 'MuTect',
'reference_sequence': reference,
'dbsnp' : dbsnp,
'cosmic': cosmic,
}
self.somaticIndelDetector = '{} -Xmx32g -jar {}'.format(java, indel_jar)
self.indel_args = {'analysis_type': 'SomaticIndelDetector',
'reference_sequence': reference,
}
def call_snps(self, tumor, normal, intervals, directory='.', out='tmp'):
args = self.mutect_args
out_stub = '{}/{}/{}_'.format(directory, out, tumor.barcode)
args['out'] = out_stub + 'call_stats.txt'
args['performanceLog'] = out_stub + 'mutect_log.txt'
args['coverage_file'] = out_stub + 'coverage.wig.txt'
arglist = [''] + [k + ' ' + v for k,v in args.iteritems()]
arglist += ['input_file:normal ' + normal.bam, 'input_file:tumor ' +
tumor.bam]
arglist += ['intervals {}'.format(r) for r in intervals]
mutect_call = self.mutect + ' --'.join(arglist)
return [mutect_call]
def call_indels(self, tumor, normal, intervals, directory='.', out='tmp'):
args = self.indel_args
out_stub = '{}/{}/{}_'.format(directory, out, tumor.barcode)
args['out'] = out_stub + 'indels.txt'
args['performanceLog'] = out_stub + 'indel_log.txt'
arglist = [''] + [k + ' ' + v for k,v in args.iteritems()]
arglist += ['input_file:normal ' + normal.bam, 'input_file:tumor ' +
tumor.bam]
arglist += ['intervals {}'.format(r) for r in intervals]
indel_call = self.somaticIndelDetector + ' --'.join(arglist)
return [indel_call]
def mutation_caller(self, manifest, regions=[], normal_code='NB',
tumor_code='TP', library_type='WGS', directory='',
out='', local_dir=''):
regions = sort_regions(regions)
def call_wrapper(pat):
normal = CGFile(manifest.ix[pat, normal_code, library_type])
tumor = CGFile(manifest.ix[pat, tumor_code, library_type])
call = ['echo running patient {} at `date`'.format(pat)]
call += normal.download(directory)
call += tumor.download(directory)
call += self.call_snps(tumor, normal, regions, directory, out)
call += self.call_indels(tumor, normal, regions, directory, out)
call += ['rm -rf {}'.format(normal.file_loc)]
call += ['rm -rf {}'.format(tumor.file_loc)]
call += ['mv {}/{}*.txt* {}'.format(directory, pat, local_dir)]
call += ['echo finished patient {} at `date`'.format(pat)]
call = ';\n'.join(call)
return call
return call_wrapper
We don't really have any globals here, so I am not putting these functions into a class.
In [7]:
def get_sam_call(data, sub_maf, directory='.', out='.'):
sam_call = 'samtools view {}'.format(data.bam)
chr_prefix = 'chr' if data.cg_vec.ix['assembly'] in ['HG19'] else ''
for g,mutation in sub_maf.iterrows():
c = str(mutation['Chromosome'])
if c not in map(str, range(1,23)) + ['X']:
continue
p = int(mutation['Start_position'])
sam_call = sam_call + ' {}{}:{}-{}'.format(chr_prefix,c,p,p)
sam_call = sam_call + ' > {}/{}/{}_{}'.format(directory, out, data.barcode,
data.library_type)
return [sam_call]
In [8]:
def region_subsetter(maf, manifest, tissue_code='TP', library_type='RNA-Seq',
directory='', out=''):
def call_wrapper(pat, genes=None):
data = CGFile(manifest.ix[pat, tissue_code, library_type])
sub_maf = maf[maf.Tumor_Sample_Barcode.apply(lambda s: s.startswith(pat))]
if genes is not None:
sub_maf = sub_maf[sub_maf.Hugo_Symbol.isin(genes)]
sub_maf['Start_position_f'] = sub_maf['Start_position'].astype(float)
sub_maf['Chr_f'] = sub_maf['Chromosome'].replace('X', 23).astype(int)
sub_maf = sub_maf.sort(['Chr_f','Start_position_f'])
call = ['echo running patient {} at `date`'.format(pat)]
call += data.mount(directory)
call += get_sam_call(data, sub_maf, directory, out)
call += data.unmount()
call = ';\n'.join(call)
return call
return call_wrapper