Libraries


In [30]:
from RASLseqAligner import *
import pandas as pd
import os
import random
import numpy as np

RASLseqAnalysis Module Code for Developing


In [32]:
class RASLseqAnalysis_obj_heur(object):
    '''
    This class creates a pandas DataFrame for RASLseq fastq sequences.
    
    Attributes of this class annotate RASLseq fastq sequences.
    
    The get_attributes method allows the user to specify what additional
    information to add to the pandas DataFrame.
    
    
    Attributes
    ----------
    
    fastq_file: str 
        path to Fastq file
        
    probes_path: str
        path to Probe file
        
    blastdb: str 
        path to write a blast database
        
    blastn_dir: str
        path to directory holding the blastn executable
    
    well_annot: str
        path to Barcode Annotations file
    
    write: str 
        path to write directory
    
    print_on: boolean, default=False
        Whether to print information during data processing
    
    read_df: pandas DataFrame
        index: (PlateBarcode, sequence)
        columns: ['PlateBarcode', 'seq', 'seq_count']
            PlateBarcode - Index read from fastq file
            seq - fastq read sequence
            seq_count - number of occurrences of seq in fastq
    
    
    APRIL 14 2014: ToDos:  
        2) Refactor external methods ( raslblast) 
        3) Package everything
        4) Consider GUI
    
    '''

    def __init__(self, fastq_path, sequencer_id, probes_path, blastdb_path, blastn_dir, well_annot, write_path, print_on=False ):
        #create a probe with name name
        
        self.RASLseqReads_obj = RASLseqReads.RASLseqReads(fastq_path, sequencer_id, print_on)
        
        self.RASLseqProbes_obj = RASLseqProbes.RASLseqProbes(probes_path, blastdb_path, blastn_dir)   
        
        self.RASLseqBCannot_obj = RASLseqBCannot.RASLseqBCannot(well_annot)
        
        self.RASLseqBCannot_obj.well_bc = self.RASLseqBCannot_obj.well_bc
        
        self.sequencer_id = sequencer_id
        
        self.print_on = print_on
        
        self.write_path = write_path
        
        #self.write_obj = open(write_path,'w')
        
        self.read_df = self.RASLseqReads_obj.read_df
        
        self.fastq_read_count = self.read_df.seq_count.sum()  #number of total reads found in fastq input    
        
        
        #filter thresholds
        self.bc_edit_dist_filter = 2
        
        self.blast_results_filter = {'length':30, 'qstart':6, 'obs_wellbc_len_max':10, 'obs_wellbc_len_min':6}
    
    
    
    def _get_probe_well_read_counts(self, collapsed_read_counts):
        '''
        This function aggregates probe-specific read counts for each plate and well
        
        Parameters
        ----------
        collapsed_read_counts: pandas dataframe
            must possess cols: ['plate_barcode','mapped_bc','probe']
            
        Returns
        -------
        Pandas Dataframe
            index: ['plate_barcode','WellBarcode']
            columns: Probe-specific read count sum (sum of counts across reads
            mapping by BLAST to probe)
        '''
        
        #grouping reads by plate barcode, well barcode, and probe
        collapsed_read_counts_group = collapsed_read_counts.groupby(['plate_barcode','mapped_bc','probe'])
        
        #aggregating probe counts for each well
        counts = collapsed_read_counts_group.seq_count.aggregate(np.sum)
        counts_df = counts.unstack(level=2)  #creating matrix of aggregated probe counts indexed on 'plate_barcode','mapped_bc'
        
        
        #id and removal of off-target ligation hits found by ! character in blast sseq name
        on_target_col = [i for i in counts_df.columns if "!" not in i]  
        counts_df = counts_df[on_target_col]  #removing off-target ligation hits from df
        counts_df.index.names= ['PlateBarcode', 'WellBarcode']
        return counts_df
    
    
    
    def _merge_plate_well_annot(self, probe_counts_df, well_annot_df):
        '''
        This function merges gene_counts_df with well annotations
        
        Parameters
        ----------
        probe_counts_df: Pandas DataFrame
            Requires pandas index: ('plate_barcode','WellBarcode')
        
        well_annot_path: Pandas DataFrame
            Requires pandas index: ('plate_barcode','WellBarcode')
        
        Returns
        -------
        Pandas DataFrame
            well_annot_df right joined to gene_counts_df
            index: ('plate_barcode','WellBarcode')
            
        '''
        return well_annot_df.join(probe_counts_df,how='right')
    
    
    

    def get_target_counts_df(self):
        '''
        This method uses a functional programming design to 
        produce a pandas DataFrame describing the RASLseq probe-specific
        read counts prepended with well specific annotations.
        

        '''
        
        #Add observed wellbc
        #self.read_df = RASLseqWellbc.get_rasl_probe_and_wellbc_exact_df(self.read_df, print_on=self.print_on)
        self.read_df['observed_wellbc'] = self.read_df['seq'].str[:8]
         
        #add fuzzy matched wellbc (mapped_bc)
        self.read_df = RASLseqWellbc.get_rasl_probe_and_wellbc_fuzzy_df(self.read_df, self.RASLseqBCannot_obj.well_bc, \
                            print_on=self.print_on)         
        
        #filter fuzzy wellbc mappings
        self.read_df = self.read_df[(self.read_df.bc_edit_dist.astype(float) < self.bc_edit_dist_filter)]
        
        #add observed rasl_probe_seq
        #self.read_df = RASLseqSeq.get_rasl_probe_exact_df(self.read_df)
        self.read_df['rasl_probe'] = self.read_df['seq'].str[23:65]
        
        #blast alignment
        self.read_df = RASLseqAlign.get_rasl_blast_df(self.read_df, self.RASLseqProbes_obj.blastn_dir, \
                            self.RASLseqProbes_obj.blastdb_file, print_on=self.print_on)
        
        #filtering blast results
        self.read_df = self.read_df[(self.read_df.length > self.blast_results_filter['length']) & \
                                    (self.read_df.qstart < self.blast_results_filter['qstart']) & \
                                    (self.read_df.observed_wellbc.map(len) < \
                                            self.blast_results_filter['obs_wellbc_len_max']) & \
                                    (self.read_df.observed_wellbc.map(len) > \
                                            self.blast_results_filter['obs_wellbc_len_min'])]     
        
        #SUM PROBE-SPECIFIC READ COUNTS
        self.read_df_counts = self._get_probe_well_read_counts(self.read_df)
        
        self.read_count_mapped = self.read_df_counts.sum().sum()
        
        
        #MERGING WELL ANNOTATIONS AND PROBE READ COUNTS
        #return self.read_df_counts, self.RASLseqBCannot_obj.well_annot_df
        self.RASLseqAnalysis_df = self._merge_plate_well_annot(self.read_df_counts, self.RASLseqBCannot_obj.well_annot_df)     
        
        
        self.RASLseqAnalysis_df.to_csv(self.write_path, sep="\t")
        self.read_df_counts.to_csv(self.write_path+'.read_df_counts', sep="\t")
        self.read_df.to_csv(self.write_path+'.read_df', sep="\t")
        
        os.system('gzip ' + self.write_path+'.read_df')
        
        return

Example Data File Paths


In [ ]:
cwd = os.getcwd()
data_dir = os.path.split(cwd)[0] + '/data/'
print data_dir

In [34]:
#FASTQ file
fq_path = data_dir +  'sample.fastq.gz'  

#Sequencer ID in FASTQ reads
seq_id = '@HISEQ'

#RASL probe information
probes = data_dir + 'sample.probes'

#Path to write blast database; NOTE: THIS WILL BE DELETED BEFORE THE END OF THE RUN
db_path = data_dir + 'blast_database'

#Path to blastn executable
blastn_path = '/usr/local/ncbi/blast/bin/' #this may be different for you

#Well annotations
well_annots = data_dir +  'sample.bc'  

#Path to write output files
write_path = data_dir + 'sample_output.txt'

In [35]:
!ls $data_dir


sample.bc                        sample.probes                    sample_output.txt.read_df_counts
sample.fastq.gz                  sample_output.txt

In [36]:
#Generates the FASTQ read, Probe, & Well Annotation dataframes
test = RASLseqAnalysis_obj_heur(fq_path, seq_id, probes, db_path, blastn_path, well_annots, write_path)

In [37]:
#Runs Blastn alignment, WellBarcode Mapping, Read Summarization, and Well Annotation 
test.get_target_counts_df()

#Generates 3 files:
#sample_output.txt : Contains the probe reads counts for each well
#sample_output.txt.read_df.gz : contains the blast alignment results for each unique read sequence
#sample_output.txt.read_df_counts : Contains probe read counts for each well with no well metadata 

#NOTE: if this command is run twice an addition sample_output.txt.read_df (19Mbytes file will be written, but not gzipped)

Expected Results for sample_output.txt


In [38]:
df = pd.read_table(write_path, sep='\t')
df


Out[38]:
PlateBarcode WellBarcode Concentration NM_000569.6_FCGR3A_2014///NM_000569.6_FCGR3A_2014 NM_000569.6_FCGR3A_2119///NM_000569.6_FCGR3A_2119 NM_000572.2_IL10_1007///NM_000572.2_IL10_1007 NM_000572.2_IL10_787///NM_000572.2_IL10_787 NM_000594.3_TNF_1327///NM_000594.3_TNF_1327 NM_000594.3_TNF_1389///NM_000594.3_TNF_1389 NM_001002_RPLP0_647///NM_001002_RPLP0_647 NM_001002_RPLP0_909///NM_001002_RPLP0_909 NM_001081455.1_P2RY14_2328///NM_001081455.1_P2RY14_2328 NM_001081455.1_P2RY14_2526///NM_001081455.1_P2RY14_2526
0 CATATTG AGCTCAGA NaN 626 605 587 593 612 599 596 590 618 614
1 CATATTG ATCGATCG NaN 587 580 592 622 640 607 621 611 601 596
2 CATATTG CTAGCTAG NaN 618 591 581 556 604 620 610 630 608 620
3 CATATTG GACTGACT 0.1 584 554 585 601 595 614 598 615 629 583
4 GAGATTC AGCTCAGA NaN 605 629 578 590 556 613 615 615 586 651
5 GAGATTC ATCGATCG 0.1 627 612 649 638 587 635 593 598 617 615
6 GAGATTC CTAGCTAG NaN 604 616 603 608 598 618 584 613 629 610
7 GAGATTC GACTGACT NaN 604 611 642 589 647 632 639 606 574 625
8 GAGCATG AGCTCAGA NaN 645 586 602 617 617 597 598 633 629 637
9 GAGCATG ATCGATCG NaN 606 626 603 592 627 620 653 609 604 582
10 GAGCATG CTAGCTAG 0.1 607 579 618 628 610 591 627 628 673 667
11 GAGCATG GACTGACT NaN 650 619 638 613 637 608 582 614 607 595
12 TGATAGT AGCTCAGA 0.1 602 635 614 593 598 605 625 610 628 643
13 TGATAGT ATCGATCG NaN 592 605 588 591 618 633 629 597 606 624
14 TGATAGT CTAGCTAG NaN 622 589 602 610 623 618 624 567 617 580
15 TGATAGT GACTGACT NaN 599 601 565 647 616 637 584 587 618 655
The NaN values for the Concentration column are expected because the sample.bc file only contains information for 4 combinations of PlateBarcodes & WellBarcodes