Libraries



In [2]:

    
import pandas as pd
import sys,os
import random
import numpy as np
sys.path.append('../')
from RASLseqTools import *
from RASLseqTools import RASLseqAnalysis_BLAST

RASLseqAnalysis Module Code for Developing



In [32]:

    
class RASLseqAnalysis_BLAST(object):
    '''
    This class creates a pandas DataFrame for RASLseq fastq sequences.
    
    Attributes of this class annotate RASLseq fastq sequences.
    
    The get_attributes method allows the user to specify what additional
    information to add to the pandas DataFrame.
    
    
    Attributes
    ----------
    
    fastq_file: str 
        path to Fastq file
        
    probes_path: str
        path to Probe file
        
    blastdb: str 
        path to write a blast database
        
    blastn_dir: str
        path to directory holding the blastn executable
    
    well_annot: str
        path to Barcode Annotations file
    
    write: str 
        path to write directory
    
    print_on: boolean, default=False
        Whether to print information during data processing
    
    read_df: pandas DataFrame
        index: (PlateBarcode, sequence)
        columns: ['PlateBarcode', 'seq', 'seq_count']
            PlateBarcode - Index read from fastq file
            seq - fastq read sequence
            seq_count - number of occurrences of seq in fastq
    
    
    APRIL 14 2014: ToDos:  
        2) Refactor external methods ( raslblast) 
        3) Package everything
        4) Consider GUI
    
    '''

    def __init__(self, fastq_path, sequencer_id, probes_path, blastdb_path, blastn_dir, well_annot, write_path, print_on=False,\
                 offset_5p=24, offset_3p=22, wellbc_start=0, wellbc_end=8):
        #create a probe with name name
        
        self.RASLseqReads_obj = RASLseqReads.RASLseqReads(fastq_path, sequencer_id, print_on)
        
        self.RASLseqProbes_obj = RASLseqProbes.RASLseqProbes(probes_path, blastdb_path, blastn_dir, aligner='blast')   
        
        self.RASLseqBCannot_obj = RASLseqBCannot.RASLseqBCannot(well_annot)
        
        self.RASLseqBCannot_obj.well_bc = self.RASLseqBCannot_obj.well_bc
        
        self.sequencer_id = sequencer_id
        
        self.print_on = print_on
        
        self.write_path = write_path
        
        self.offset_5p = int(offset_5p)
        
        self.offset_3p = int(offset_3p)
        
        self.wellbc_start = int(wellbc_start)
        
        self.wellbc_end = int(wellbc_end)
        
        self.read_df = self.RASLseqReads_obj.get_blast_read_df()
        
        self.fastq_read_count = self.read_df.seq_count.sum()  #number of total reads found in fastq input    
        
        
        #filter thresholds
        self.bc_edit_dist_filter = 2
        
        self.blast_results_filter = {'length':30, 'qstart':6, 'obs_wellbc_len_max':10, 'obs_wellbc_len_min':6}
    
    
    
    def _get_probe_well_read_counts(self, collapsed_read_counts):
        '''
        This function aggregates probe-specific read counts for each plate and well
        
        Parameters
        ----------
        collapsed_read_counts: pandas dataframe
            must possess cols: ['plate_barcode','mapped_bc','probe']
            
        Returns
        -------
        Pandas Dataframe
            index: ['plate_barcode','WellBarcode']
            columns: Probe-specific read count sum (sum of counts across reads
            mapping by BLAST to probe)
        '''
        
        #grouping reads by plate barcode, well barcode, and probe
        collapsed_read_counts_group = collapsed_read_counts.groupby(['plate_barcode','mapped_bc','probe'])
        
        #aggregating probe counts for each well
        counts = collapsed_read_counts_group.seq_count.aggregate(np.sum)
        counts_df = counts.unstack(level=2)  #creating matrix of aggregated probe counts indexed on 'plate_barcode','mapped_bc'
        
        
        #id and removal of off-target ligation hits found by ! character in blast sseq name
        on_target_col = [i for i in counts_df.columns if "!" not in i]  
        counts_df = counts_df[on_target_col]  #removing off-target ligation hits from df
        counts_df.index.names= ['PlateBarcode', 'WellBarcode']
        return counts_df
    
    
    
    def _merge_plate_well_annot(self, probe_counts_df, well_annot_df):
        '''
        This function merges gene_counts_df with well annotations
        
        Parameters
        ----------
        probe_counts_df: Pandas DataFrame
            Requires pandas index: ('plate_barcode','WellBarcode')
        
        well_annot_path: Pandas DataFrame
            Requires pandas index: ('plate_barcode','WellBarcode')
        
        Returns
        -------
        Pandas DataFrame
            well_annot_df right joined to gene_counts_df
            index: ('plate_barcode','WellBarcode')
            
        '''
        return well_annot_df.join(probe_counts_df,how='right')
    
    
    

    def get_target_counts_df(self):
        '''
        This method uses a functional programming design to 
        produce a pandas DataFrame describing the RASLseq probe-specific
        read counts prepended with well specific annotations.
        
        '''
        
        #Add observed wellbc
        #self.read_df = RASLseqWellbc.get_rasl_probe_and_wellbc_exact_df(self.read_df, print_on=self.print_on)
        self.read_df['observed_wellbc'] = self.read_df['seq'].str[self.wellbc_start:self.wellbc_end]
         
        #add fuzzy matched wellbc (mapped_bc)
        self.read_df = RASLseqWellbc.get_rasl_probe_and_wellbc_fuzzy_df(self.read_df, self.RASLseqBCannot_obj.well_bc, \
                            print_on=self.print_on)         
        
        #filter fuzzy wellbc mappings
        self.read_df = self.read_df[(self.read_df.bc_edit_dist.astype(float) < self.bc_edit_dist_filter)]
        
        #add observed rasl_probe_seq
        #self.read_df = RASLseqSeq.get_rasl_probe_exact_df(self.read_df)
        self.read_df['rasl_probe'] = self.read_df['seq'].str[self.offset_5p : -self.offset_3p]
        
        #blast alignment
        self.read_df = RASLseqAlign.get_rasl_aligned_df(self.read_df, self.RASLseqProbes_obj.aligner_dir, \
                            self.RASLseqProbes_obj.probedb_file, print_on=self.print_on, aligner='blast')
        
        #filtering blast results
        self.read_df = self.read_df[(self.read_df.length > self.blast_results_filter['length']) & \
                                    (self.read_df.qstart < self.blast_results_filter['qstart']) & \
                                    (self.read_df.observed_wellbc.map(len) < \
                                            self.blast_results_filter['obs_wellbc_len_max']) & \
                                    (self.read_df.observed_wellbc.map(len) > \
                                            self.blast_results_filter['obs_wellbc_len_min'])]     
        
        #SUM PROBE-SPECIFIC READ COUNTS
        self.read_df_counts = self._get_probe_well_read_counts(self.read_df)
        
        self.read_count_mapped = self.read_df_counts.sum().sum()
        
        
        #MERGING WELL ANNOTATIONS AND PROBE READ COUNTS
        #return self.read_df_counts, self.RASLseqBCannot_obj.well_annot_df
        self.RASLseqAnalysis_df = self._merge_plate_well_annot(self.read_df_counts, self.RASLseqBCannot_obj.well_annot_df)     
        
        
        self.RASLseqAnalysis_df.to_csv(self.write_path, sep="\t")
        self.read_df_counts.to_csv(self.write_path+'.read_df_counts', sep="\t")
        self.read_df.to_csv(self.write_path+'.read_df', sep="\t")
        
        os.system('gzip ' + self.write_path+'.read_df')
        
        return

Example Data File Paths



In [3]:

    
cwd = os.getcwd()
data_dir = os.path.split(cwd)[0] + '/data/'
print data_dir









    



/Users/ers_vader/git/RASLseqTools/data/



In [4]:

    
#FASTQ file
fq_path = data_dir +  'sample.fastq.gz'  

#Sequencer ID in FASTQ reads
seq_id = '@HISEQ'

#RASL probe information
probes = data_dir + 'sample.probes'

#Path to write blast database; NOTE: THIS WILL BE DELETED BEFORE THE END OF THE RUN
db_path = data_dir + 'blast_database'

#Path to blastn executable
blastn_path = '/usr/local/ncbi/blast/bin/' #this may be different for you

#Well annotations
well_annots = data_dir +  'sample.bc'  

#Path to write output files
write_path = data_dir + 'sample_output.txt'



In [5]:

    
!ls $data_dir









    



sample.bc       sample.fastq.gz sample.probes



In [8]:

    
#Generates the FASTQ read, Probe, & Well Annotation dataframes
test = RASLseqAnalysis_BLAST.RASLseqAnalysis_BLAST(fq_path, seq_id, probes, db_path, blastn_path, well_annots, write_path)



In [37]:

    
#Runs Blastn alignment, WellBarcode Mapping, Read Summarization, and Well Annotation 
test.get_target_counts_df()

#Generates 3 files:
#sample_output.txt : Contains the probe reads counts for each well
#sample_output.txt.read_df.gz : contains the blast alignment results for each unique read sequence
#sample_output.txt.read_df_counts : Contains probe read counts for each well with no well metadata 

#NOTE: if this command is run twice an addition sample_output.txt.read_df (19Mbytes file will be written, but not gzipped)



In [17]:

    
test.RASLseqAnalysis_df









    Out[17]:






  
    
      
      
      Concentration
      NM_000569.6_FCGR3A_2014///NM_000569.6_FCGR3A_2014
      NM_000569.6_FCGR3A_2119///NM_000569.6_FCGR3A_2119
      NM_000572.2_IL10_1007///NM_000572.2_IL10_1007
      NM_000572.2_IL10_787///NM_000572.2_IL10_787
      NM_000594.3_TNF_1327///NM_000594.3_TNF_1327
      NM_000594.3_TNF_1389///NM_000594.3_TNF_1389
      NM_001002_RPLP0_647///NM_001002_RPLP0_647
      NM_001002_RPLP0_909///NM_001002_RPLP0_909
      NM_001081455.1_P2RY14_2328///NM_001081455.1_P2RY14_2328
      NM_001081455.1_P2RY14_2526///NM_001081455.1_P2RY14_2526
    
    
      PlateBarcode
      WellBarcode
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      CATATTG
      AGCTCAGA
       NaN
       629
       614
       591
       601
       616
       601
       602
       599
       624
       618
    
    
      ATCGATCG
       NaN
       594
       585
       598
       628
       645
       614
       629
       618
       605
       601
    
    
      CTAGCTAG
       NaN
       623
       598
       588
       564
       609
       621
       612
       639
       618
       622
    
    
      GACTGACT
       0.1
       588
       561
       588
       606
       600
       619
       605
       625
       640
       588
    
    
      GAGATTC
      AGCTCAGA
       NaN
       609
       632
       586
       594
       562
       616
       621
       623
       588
       660
    
    
      ATCGATCG
       0.1
       631
       616
       654
       643
       592
       642
       599
       606
       624
       620
    
    
      CTAGCTAG
       NaN
       611
       623
       608
       614
       602
       628
       590
       617
       635
       612
    
    
      GACTGACT
       NaN
       607
       623
       647
       593
       652
       638
       644
       610
       581
       635
    
    
      GAGCATG
      AGCTCAGA
       NaN
       648
       593
       605
       622
       620
       602
       605
       635
       637
       642
    
    
      ATCGATCG
       NaN
       615
       631
       609
       598
       634
       629
       657
       616
       610
       589
    
    
      CTAGCTAG
       0.1
       612
       580
       621
       635
       614
       596
       635
       630
       679
       675
    
    
      GACTGACT
       NaN
       661
       624
       645
       620
       641
       615
       585
       620
       612
       601
    
    
      TGATAGT
      AGCTCAGA
       0.1
       612
       642
       618
       604
       603
       612
       633
       618
       636
       649
    
    
      ATCGATCG
       NaN
       599
       616
       595
       603
       626
       639
       640
       606
       611
       630
    
    
      CTAGCTAG
       NaN
       628
       593
       608
       619
       628
       632
       632
       570
       623
       585
    
    
      GACTGACT
       NaN
       599
       608
       572
       651
       620
       641
       590
       593
       626
       657

Expected Results for sample_output.txt



In [38]:

    
df = pd.read_table(write_path, sep='\t')
df









    Out[38]:






  
    
      
      PlateBarcode
      WellBarcode
      Concentration
      NM_000569.6_FCGR3A_2014///NM_000569.6_FCGR3A_2014
      NM_000569.6_FCGR3A_2119///NM_000569.6_FCGR3A_2119
      NM_000572.2_IL10_1007///NM_000572.2_IL10_1007
      NM_000572.2_IL10_787///NM_000572.2_IL10_787
      NM_000594.3_TNF_1327///NM_000594.3_TNF_1327
      NM_000594.3_TNF_1389///NM_000594.3_TNF_1389
      NM_001002_RPLP0_647///NM_001002_RPLP0_647
      NM_001002_RPLP0_909///NM_001002_RPLP0_909
      NM_001081455.1_P2RY14_2328///NM_001081455.1_P2RY14_2328
      NM_001081455.1_P2RY14_2526///NM_001081455.1_P2RY14_2526
    
  
  
    
      0 
       CATATTG
       AGCTCAGA
       NaN
       626
       605
       587
       593
       612
       599
       596
       590
       618
       614
    
    
      1 
       CATATTG
       ATCGATCG
       NaN
       587
       580
       592
       622
       640
       607
       621
       611
       601
       596
    
    
      2 
       CATATTG
       CTAGCTAG
       NaN
       618
       591
       581
       556
       604
       620
       610
       630
       608
       620
    
    
      3 
       CATATTG
       GACTGACT
       0.1
       584
       554
       585
       601
       595
       614
       598
       615
       629
       583
    
    
      4 
       GAGATTC
       AGCTCAGA
       NaN
       605
       629
       578
       590
       556
       613
       615
       615
       586
       651
    
    
      5 
       GAGATTC
       ATCGATCG
       0.1
       627
       612
       649
       638
       587
       635
       593
       598
       617
       615
    
    
      6 
       GAGATTC
       CTAGCTAG
       NaN
       604
       616
       603
       608
       598
       618
       584
       613
       629
       610
    
    
      7 
       GAGATTC
       GACTGACT
       NaN
       604
       611
       642
       589
       647
       632
       639
       606
       574
       625
    
    
      8 
       GAGCATG
       AGCTCAGA
       NaN
       645
       586
       602
       617
       617
       597
       598
       633
       629
       637
    
    
      9 
       GAGCATG
       ATCGATCG
       NaN
       606
       626
       603
       592
       627
       620
       653
       609
       604
       582
    
    
      10
       GAGCATG
       CTAGCTAG
       0.1
       607
       579
       618
       628
       610
       591
       627
       628
       673
       667
    
    
      11
       GAGCATG
       GACTGACT
       NaN
       650
       619
       638
       613
       637
       608
       582
       614
       607
       595
    
    
      12
       TGATAGT
       AGCTCAGA
       0.1
       602
       635
       614
       593
       598
       605
       625
       610
       628
       643
    
    
      13
       TGATAGT
       ATCGATCG
       NaN
       592
       605
       588
       591
       618
       633
       629
       597
       606
       624
    
    
      14
       TGATAGT
       CTAGCTAG
       NaN
       622
       589
       602
       610
       623
       618
       624
       567
       617
       580
    
    
      15
       TGATAGT
       GACTGACT
       NaN
       599
       601
       565
       647
       616
       637
       584
       587
       618
       655

The NaN values for the Concentration column are expected because the sample.bc file only contains information for 4 combinations of PlateBarcodes & WellBarcodes

		Concentration	NM_000569.6_FCGR3A_2014///NM_000569.6_FCGR3A_2014	NM_000569.6_FCGR3A_2119///NM_000569.6_FCGR3A_2119	NM_000572.2_IL10_1007///NM_000572.2_IL10_1007	NM_000572.2_IL10_787///NM_000572.2_IL10_787	NM_000594.3_TNF_1327///NM_000594.3_TNF_1327	NM_000594.3_TNF_1389///NM_000594.3_TNF_1389	NM_001002_RPLP0_647///NM_001002_RPLP0_647	NM_001002_RPLP0_909///NM_001002_RPLP0_909	NM_001081455.1_P2RY14_2328///NM_001081455.1_P2RY14_2328	NM_001081455.1_P2RY14_2526///NM_001081455.1_P2RY14_2526
PlateBarcode	WellBarcode
CATATTG	AGCTCAGA	NaN	629	614	591	601	616	601	602	599	624	618
	ATCGATCG	NaN	594	585	598	628	645	614	629	618	605	601
	CTAGCTAG	NaN	623	598	588	564	609	621	612	639	618	622
	GACTGACT	0.1	588	561	588	606	600	619	605	625	640	588
GAGATTC	AGCTCAGA	NaN	609	632	586	594	562	616	621	623	588	660
	ATCGATCG	0.1	631	616	654	643	592	642	599	606	624	620
	CTAGCTAG	NaN	611	623	608	614	602	628	590	617	635	612
	GACTGACT	NaN	607	623	647	593	652	638	644	610	581	635
GAGCATG	AGCTCAGA	NaN	648	593	605	622	620	602	605	635	637	642
	ATCGATCG	NaN	615	631	609	598	634	629	657	616	610	589
	CTAGCTAG	0.1	612	580	621	635	614	596	635	630	679	675
	GACTGACT	NaN	661	624	645	620	641	615	585	620	612	601
TGATAGT	AGCTCAGA	0.1	612	642	618	604	603	612	633	618	636	649
	ATCGATCG	NaN	599	616	595	603	626	639	640	606	611	630
	CTAGCTAG	NaN	628	593	608	619	628	632	632	570	623	585
	GACTGACT	NaN	599	608	572	651	620	641	590	593	626	657