Libraries


In [2]:
import pandas as pd
import sys,os
import random
import numpy as np
sys.path.append('../')
from RASLseqTools import *
from RASLseqTools import RASLseqAnalysis_BLAST

RASLseqAnalysis Module Code for Developing


In [32]:
class RASLseqAnalysis_BLAST(object):
    '''
    This class creates a pandas DataFrame for RASLseq fastq sequences.
    
    Attributes of this class annotate RASLseq fastq sequences.
    
    The get_attributes method allows the user to specify what additional
    information to add to the pandas DataFrame.
    
    
    Attributes
    ----------
    
    fastq_file: str 
        path to Fastq file
        
    probes_path: str
        path to Probe file
        
    blastdb: str 
        path to write a blast database
        
    blastn_dir: str
        path to directory holding the blastn executable
    
    well_annot: str
        path to Barcode Annotations file
    
    write: str 
        path to write directory
    
    print_on: boolean, default=False
        Whether to print information during data processing
    
    read_df: pandas DataFrame
        index: (PlateBarcode, sequence)
        columns: ['PlateBarcode', 'seq', 'seq_count']
            PlateBarcode - Index read from fastq file
            seq - fastq read sequence
            seq_count - number of occurrences of seq in fastq
    
    
    APRIL 14 2014: ToDos:  
        2) Refactor external methods ( raslblast) 
        3) Package everything
        4) Consider GUI
    
    '''

    def __init__(self, fastq_path, sequencer_id, probes_path, blastdb_path, blastn_dir, well_annot, write_path, print_on=False,\
                 offset_5p=24, offset_3p=22, wellbc_start=0, wellbc_end=8):
        #create a probe with name name
        
        self.RASLseqReads_obj = RASLseqReads.RASLseqReads(fastq_path, sequencer_id, print_on)
        
        self.RASLseqProbes_obj = RASLseqProbes.RASLseqProbes(probes_path, blastdb_path, blastn_dir, aligner='blast')   
        
        self.RASLseqBCannot_obj = RASLseqBCannot.RASLseqBCannot(well_annot)
        
        self.RASLseqBCannot_obj.well_bc = self.RASLseqBCannot_obj.well_bc
        
        self.sequencer_id = sequencer_id
        
        self.print_on = print_on
        
        self.write_path = write_path
        
        self.offset_5p = int(offset_5p)
        
        self.offset_3p = int(offset_3p)
        
        self.wellbc_start = int(wellbc_start)
        
        self.wellbc_end = int(wellbc_end)
        
        self.read_df = self.RASLseqReads_obj.get_blast_read_df()
        
        self.fastq_read_count = self.read_df.seq_count.sum()  #number of total reads found in fastq input    
        
        
        #filter thresholds
        self.bc_edit_dist_filter = 2
        
        self.blast_results_filter = {'length':30, 'qstart':6, 'obs_wellbc_len_max':10, 'obs_wellbc_len_min':6}
    
    
    
    def _get_probe_well_read_counts(self, collapsed_read_counts):
        '''
        This function aggregates probe-specific read counts for each plate and well
        
        Parameters
        ----------
        collapsed_read_counts: pandas dataframe
            must possess cols: ['plate_barcode','mapped_bc','probe']
            
        Returns
        -------
        Pandas Dataframe
            index: ['plate_barcode','WellBarcode']
            columns: Probe-specific read count sum (sum of counts across reads
            mapping by BLAST to probe)
        '''
        
        #grouping reads by plate barcode, well barcode, and probe
        collapsed_read_counts_group = collapsed_read_counts.groupby(['plate_barcode','mapped_bc','probe'])
        
        #aggregating probe counts for each well
        counts = collapsed_read_counts_group.seq_count.aggregate(np.sum)
        counts_df = counts.unstack(level=2)  #creating matrix of aggregated probe counts indexed on 'plate_barcode','mapped_bc'
        
        
        #id and removal of off-target ligation hits found by ! character in blast sseq name
        on_target_col = [i for i in counts_df.columns if "!" not in i]  
        counts_df = counts_df[on_target_col]  #removing off-target ligation hits from df
        counts_df.index.names= ['PlateBarcode', 'WellBarcode']
        return counts_df
    
    
    
    def _merge_plate_well_annot(self, probe_counts_df, well_annot_df):
        '''
        This function merges gene_counts_df with well annotations
        
        Parameters
        ----------
        probe_counts_df: Pandas DataFrame
            Requires pandas index: ('plate_barcode','WellBarcode')
        
        well_annot_path: Pandas DataFrame
            Requires pandas index: ('plate_barcode','WellBarcode')
        
        Returns
        -------
        Pandas DataFrame
            well_annot_df right joined to gene_counts_df
            index: ('plate_barcode','WellBarcode')
            
        '''
        return well_annot_df.join(probe_counts_df,how='right')
    
    
    

    def get_target_counts_df(self):
        '''
        This method uses a functional programming design to 
        produce a pandas DataFrame describing the RASLseq probe-specific
        read counts prepended with well specific annotations.
        
        '''
        
        #Add observed wellbc
        #self.read_df = RASLseqWellbc.get_rasl_probe_and_wellbc_exact_df(self.read_df, print_on=self.print_on)
        self.read_df['observed_wellbc'] = self.read_df['seq'].str[self.wellbc_start:self.wellbc_end]
         
        #add fuzzy matched wellbc (mapped_bc)
        self.read_df = RASLseqWellbc.get_rasl_probe_and_wellbc_fuzzy_df(self.read_df, self.RASLseqBCannot_obj.well_bc, \
                            print_on=self.print_on)         
        
        #filter fuzzy wellbc mappings
        self.read_df = self.read_df[(self.read_df.bc_edit_dist.astype(float) < self.bc_edit_dist_filter)]
        
        #add observed rasl_probe_seq
        #self.read_df = RASLseqSeq.get_rasl_probe_exact_df(self.read_df)
        self.read_df['rasl_probe'] = self.read_df['seq'].str[self.offset_5p : -self.offset_3p]
        
        #blast alignment
        self.read_df = RASLseqAlign.get_rasl_aligned_df(self.read_df, self.RASLseqProbes_obj.aligner_dir, \
                            self.RASLseqProbes_obj.probedb_file, print_on=self.print_on, aligner='blast')
        
        #filtering blast results
        self.read_df = self.read_df[(self.read_df.length > self.blast_results_filter['length']) & \
                                    (self.read_df.qstart < self.blast_results_filter['qstart']) & \
                                    (self.read_df.observed_wellbc.map(len) < \
                                            self.blast_results_filter['obs_wellbc_len_max']) & \
                                    (self.read_df.observed_wellbc.map(len) > \
                                            self.blast_results_filter['obs_wellbc_len_min'])]     
        
        #SUM PROBE-SPECIFIC READ COUNTS
        self.read_df_counts = self._get_probe_well_read_counts(self.read_df)
        
        self.read_count_mapped = self.read_df_counts.sum().sum()
        
        
        #MERGING WELL ANNOTATIONS AND PROBE READ COUNTS
        #return self.read_df_counts, self.RASLseqBCannot_obj.well_annot_df
        self.RASLseqAnalysis_df = self._merge_plate_well_annot(self.read_df_counts, self.RASLseqBCannot_obj.well_annot_df)     
        
        
        self.RASLseqAnalysis_df.to_csv(self.write_path, sep="\t")
        self.read_df_counts.to_csv(self.write_path+'.read_df_counts', sep="\t")
        self.read_df.to_csv(self.write_path+'.read_df', sep="\t")
        
        os.system('gzip ' + self.write_path+'.read_df')
        
        return

Example Data File Paths


In [3]:
cwd = os.getcwd()
data_dir = os.path.split(cwd)[0] + '/data/'
print data_dir


/Users/ers_vader/git/RASLseqTools/data/

In [4]:
#FASTQ file
fq_path = data_dir +  'sample.fastq.gz'  

#Sequencer ID in FASTQ reads
seq_id = '@HISEQ'

#RASL probe information
probes = data_dir + 'sample.probes'

#Path to write blast database; NOTE: THIS WILL BE DELETED BEFORE THE END OF THE RUN
db_path = data_dir + 'blast_database'

#Path to blastn executable
blastn_path = '/usr/local/ncbi/blast/bin/' #this may be different for you

#Well annotations
well_annots = data_dir +  'sample.bc'  

#Path to write output files
write_path = data_dir + 'sample_output.txt'

In [5]:
!ls $data_dir


sample.bc       sample.fastq.gz sample.probes

In [8]:
#Generates the FASTQ read, Probe, & Well Annotation dataframes
test = RASLseqAnalysis_BLAST.RASLseqAnalysis_BLAST(fq_path, seq_id, probes, db_path, blastn_path, well_annots, write_path)

In [37]:
#Runs Blastn alignment, WellBarcode Mapping, Read Summarization, and Well Annotation 
test.get_target_counts_df()

#Generates 3 files:
#sample_output.txt : Contains the probe reads counts for each well
#sample_output.txt.read_df.gz : contains the blast alignment results for each unique read sequence
#sample_output.txt.read_df_counts : Contains probe read counts for each well with no well metadata 

#NOTE: if this command is run twice an addition sample_output.txt.read_df (19Mbytes file will be written, but not gzipped)

In [17]:
test.RASLseqAnalysis_df


Out[17]:
Concentration NM_000569.6_FCGR3A_2014///NM_000569.6_FCGR3A_2014 NM_000569.6_FCGR3A_2119///NM_000569.6_FCGR3A_2119 NM_000572.2_IL10_1007///NM_000572.2_IL10_1007 NM_000572.2_IL10_787///NM_000572.2_IL10_787 NM_000594.3_TNF_1327///NM_000594.3_TNF_1327 NM_000594.3_TNF_1389///NM_000594.3_TNF_1389 NM_001002_RPLP0_647///NM_001002_RPLP0_647 NM_001002_RPLP0_909///NM_001002_RPLP0_909 NM_001081455.1_P2RY14_2328///NM_001081455.1_P2RY14_2328 NM_001081455.1_P2RY14_2526///NM_001081455.1_P2RY14_2526
PlateBarcode WellBarcode
CATATTG AGCTCAGA NaN 629 614 591 601 616 601 602 599 624 618
ATCGATCG NaN 594 585 598 628 645 614 629 618 605 601
CTAGCTAG NaN 623 598 588 564 609 621 612 639 618 622
GACTGACT 0.1 588 561 588 606 600 619 605 625 640 588
GAGATTC AGCTCAGA NaN 609 632 586 594 562 616 621 623 588 660
ATCGATCG 0.1 631 616 654 643 592 642 599 606 624 620
CTAGCTAG NaN 611 623 608 614 602 628 590 617 635 612
GACTGACT NaN 607 623 647 593 652 638 644 610 581 635
GAGCATG AGCTCAGA NaN 648 593 605 622 620 602 605 635 637 642
ATCGATCG NaN 615 631 609 598 634 629 657 616 610 589
CTAGCTAG 0.1 612 580 621 635 614 596 635 630 679 675
GACTGACT NaN 661 624 645 620 641 615 585 620 612 601
TGATAGT AGCTCAGA 0.1 612 642 618 604 603 612 633 618 636 649
ATCGATCG NaN 599 616 595 603 626 639 640 606 611 630
CTAGCTAG NaN 628 593 608 619 628 632 632 570 623 585
GACTGACT NaN 599 608 572 651 620 641 590 593 626 657

Expected Results for sample_output.txt


In [38]:
df = pd.read_table(write_path, sep='\t')
df


Out[38]:
PlateBarcode WellBarcode Concentration NM_000569.6_FCGR3A_2014///NM_000569.6_FCGR3A_2014 NM_000569.6_FCGR3A_2119///NM_000569.6_FCGR3A_2119 NM_000572.2_IL10_1007///NM_000572.2_IL10_1007 NM_000572.2_IL10_787///NM_000572.2_IL10_787 NM_000594.3_TNF_1327///NM_000594.3_TNF_1327 NM_000594.3_TNF_1389///NM_000594.3_TNF_1389 NM_001002_RPLP0_647///NM_001002_RPLP0_647 NM_001002_RPLP0_909///NM_001002_RPLP0_909 NM_001081455.1_P2RY14_2328///NM_001081455.1_P2RY14_2328 NM_001081455.1_P2RY14_2526///NM_001081455.1_P2RY14_2526
0 CATATTG AGCTCAGA NaN 626 605 587 593 612 599 596 590 618 614
1 CATATTG ATCGATCG NaN 587 580 592 622 640 607 621 611 601 596
2 CATATTG CTAGCTAG NaN 618 591 581 556 604 620 610 630 608 620
3 CATATTG GACTGACT 0.1 584 554 585 601 595 614 598 615 629 583
4 GAGATTC AGCTCAGA NaN 605 629 578 590 556 613 615 615 586 651
5 GAGATTC ATCGATCG 0.1 627 612 649 638 587 635 593 598 617 615
6 GAGATTC CTAGCTAG NaN 604 616 603 608 598 618 584 613 629 610
7 GAGATTC GACTGACT NaN 604 611 642 589 647 632 639 606 574 625
8 GAGCATG AGCTCAGA NaN 645 586 602 617 617 597 598 633 629 637
9 GAGCATG ATCGATCG NaN 606 626 603 592 627 620 653 609 604 582
10 GAGCATG CTAGCTAG 0.1 607 579 618 628 610 591 627 628 673 667
11 GAGCATG GACTGACT NaN 650 619 638 613 637 608 582 614 607 595
12 TGATAGT AGCTCAGA 0.1 602 635 614 593 598 605 625 610 628 643
13 TGATAGT ATCGATCG NaN 592 605 588 591 618 633 629 597 606 624
14 TGATAGT CTAGCTAG NaN 622 589 602 610 623 618 624 567 617 580
15 TGATAGT GACTGACT NaN 599 601 565 647 616 637 584 587 618 655
The NaN values for the Concentration column are expected because the sample.bc file only contains information for 4 combinations of PlateBarcodes & WellBarcodes