CGHelpers


In [3]:
import pandas as pd
import numpy as np

CGFile object

This is our main abstraction for dealing with files from CGhub. Here we store a number of attributes that are important for retreiving, and cleaning up files from CGHub. The process for downloading and cleaning up files is different using the standard GTDownload mechansim or the proprietary GTFuse functionality available on the Annai machine. This object is meant to abtract away the housekeeping required for different methods of downloading the data.


In [1]:
class CGFile(object):
    gt_fuse = None
    cghub = None
    cache = None

    def __init__(self, cg_vec):
        self.barcode = cg_vec.name[0]
        
        self.sample_type = cg_vec.name[1]
        self.library_type = cg_vec.name[2]
        self.cg_vec = cg_vec
        
        self.analysis_id = cg_vec['analysis_id']
        self.filename = cg_vec['filename']
        
        self.mount_loc = None
        self.bam = None
        
    def mount(self, directory='.'):
        self.mount_loc = '{}/{}_{}_{}'.format(directory, self.barcode, 
                                              self.sample_type, self.library_type)
        self.bam = '{}/{}/{}'.format(self.mount_loc, self.analysis_id, self.filename)
        
        init_folder = 'mkdir {}'.format(self.mount_loc)
        mount_file = '{} {}/{} {}'.format(self.gt_fuse, self.cghub, self.analysis_id, 
                                          self.mount_loc)
        return [init_folder, mount_file]
    
    def unmount(self):
        unmount_folder = 'fusermount -u {}'.format(self.mount_loc) 
        clear_cache = 'rm -rf {}/{}*'.format(self.cache, self.analysis_id)
        clean_folder = 'rm -rf {}'.format(self.mount_loc)
        return [unmount_folder, clear_cache, clean_folder]
    
    def download(self, directory='.'):
        self.file_loc = '{}/{}_{}_{}'.format(directory, self.barcode, 
                                             self.sample_type, self.library_type)
        self.bam = '{}/{}/{}'.format(self.file_loc, self.analysis_id, self.filename)
        
        init_folder = 'mkdir {}'.format(self.file_loc)
        download_file = '{} {}/{} -p {}'.format(self.gt_download, self.cghub, 
                                                self.analysis_id, self.file_loc)
        return [init_folder, download_file]
    
    def clean_download(self):
        clean_folder = 'rm -rf {}'.format(self.file_loc)
        return [clean_folder]

MutationCalling

This is an object to handing doing variant calling on matched tumor-normal genomes. We run MuTect and SomaticIndelDetector for SNVs and indels, respectively.


In [2]:
def sort_regions(regions):
    '''
    Helper function to sort regions in the order expected by the Broad tools.  Takes
    in a list of regions and returns them in sorted order, whith X and Y at the end.
    '''
    ss = map(lambda s: (int(s.replace('X', '24').replace('Y','25').split(':')[0]),
               int(s.split(':')[1].split('-')[0])), regions)
    ss = pd.Series(ss)
    regions = list(np.array(regions)[list(ss.argsort())])
    return regions

In [4]:
class MutationCalling(object):
    def __init__(self, java, mutect_jar, indel_jar, reference, dbsnp, cosmic):
        self.mutect = '{} -Xmx32g -jar {}'.format(java, mutect_jar)
        self.mutect_args = {'analysis_type': 'MuTect',
                              'reference_sequence': reference,
                              'dbsnp' : dbsnp,
                              'cosmic': cosmic,
                              }

        self.somaticIndelDetector = '{} -Xmx32g -jar {}'.format(java, indel_jar)
        self.indel_args = {'analysis_type': 'SomaticIndelDetector',
                           'reference_sequence': reference,
                           }
        
    def call_snps(self, tumor, normal, intervals, directory='.', out='tmp'):
        args = self.mutect_args
        out_stub = '{}/{}/{}_'.format(directory, out, tumor.barcode)
        args['out'] = out_stub + 'call_stats.txt'
        args['performanceLog'] = out_stub + 'mutect_log.txt'
        args['coverage_file'] = out_stub + 'coverage.wig.txt'
        
        arglist  = [''] + [k + ' ' + v for k,v in args.iteritems()]
        arglist += ['input_file:normal ' + normal.bam, 'input_file:tumor ' + 
                    tumor.bam] 
        arglist += ['intervals {}'.format(r) for r in intervals]
        mutect_call = self.mutect + ' --'.join(arglist)
        return [mutect_call]
    
    def call_indels(self, tumor, normal, intervals, directory='.', out='tmp'):
        args = self.indel_args
        out_stub = '{}/{}/{}_'.format(directory, out, tumor.barcode)
        args['out'] = out_stub + 'indels.txt'
        args['performanceLog'] = out_stub + 'indel_log.txt'
        
        arglist  = [''] + [k + ' ' + v for k,v in args.iteritems()]
        arglist += ['input_file:normal ' + normal.bam, 'input_file:tumor ' + 
                    tumor.bam] 
        arglist += ['intervals {}'.format(r) for r in intervals]
        indel_call = self.somaticIndelDetector + ' --'.join(arglist)
        return [indel_call]
    
    def mutation_caller(self, manifest, regions=[], normal_code='NB', 
                        tumor_code='TP', library_type='WGS', directory='', 
                        out='', local_dir=''):
        regions = sort_regions(regions)
        def call_wrapper(pat):
            normal = CGFile(manifest.ix[pat, normal_code, library_type])
            tumor = CGFile(manifest.ix[pat, tumor_code, library_type])

            call = ['echo running patient {} at `date`'.format(pat)]
            call += normal.download(directory)
            call += tumor.download(directory)
            call += self.call_snps(tumor, normal, regions, directory, out)
            call += self.call_indels(tumor, normal, regions, directory, out)
            call += ['rm -rf {}'.format(normal.file_loc)]
            call += ['rm -rf {}'.format(tumor.file_loc)]
            call += ['mv {}/{}*.txt* {}'.format(directory, pat, local_dir)]
            call += ['echo finished patient {} at `date`'.format(pat)]
            call = ';\n'.join(call)
            return call
        return call_wrapper

Samtools

We don't really have any globals here, so I am not putting these functions into a class.


In [7]:
def get_sam_call(data, sub_maf, directory='.', out='.'):
    sam_call = 'samtools view {}'.format(data.bam)
    chr_prefix = 'chr' if data.cg_vec.ix['assembly'] in ['HG19'] else ''
    for g,mutation in sub_maf.iterrows():
        c = str(mutation['Chromosome'])
        if c not in map(str, range(1,23)) + ['X']:
            continue
        p = int(mutation['Start_position'])
        sam_call = sam_call + ' {}{}:{}-{}'.format(chr_prefix,c,p,p)
    sam_call =  sam_call + ' > {}/{}/{}_{}'.format(directory, out, data.barcode, 
                                                   data.library_type)
    return [sam_call]

In [8]:
def region_subsetter(maf, manifest, tissue_code='TP', library_type='RNA-Seq', 
                     directory='', out=''):
    def call_wrapper(pat, genes=None):
        data = CGFile(manifest.ix[pat, tissue_code, library_type])
        sub_maf = maf[maf.Tumor_Sample_Barcode.apply(lambda s: s.startswith(pat))]
        if genes is not None:
            sub_maf = sub_maf[sub_maf.Hugo_Symbol.isin(genes)]
        sub_maf['Start_position_f'] = sub_maf['Start_position'].astype(float)
        sub_maf['Chr_f'] = sub_maf['Chromosome'].replace('X', 23).astype(int)
        sub_maf = sub_maf.sort(['Chr_f','Start_position_f'])
        
        call = ['echo running patient {} at `date`'.format(pat)]
        call += data.mount(directory)
        call += get_sam_call(data, sub_maf, directory, out)
        call += data.unmount()
        call = ';\n'.join(call)
        return call
    return call_wrapper