Libraries



In [30]:

    
from RASLseqAligner import *
import pandas as pd
import os
import random
import numpy as np

RASLseqAnalysis Module Code for Developing



In [32]:

    
class RASLseqAnalysis_obj_heur(object):
    '''
    This class creates a pandas DataFrame for RASLseq fastq sequences.
    
    Attributes of this class annotate RASLseq fastq sequences.
    
    The get_attributes method allows the user to specify what additional
    information to add to the pandas DataFrame.
    
    
    Attributes
    ----------
    
    fastq_file: str 
        path to Fastq file
        
    probes_path: str
        path to Probe file
        
    blastdb: str 
        path to write a blast database
        
    blastn_dir: str
        path to directory holding the blastn executable
    
    well_annot: str
        path to Barcode Annotations file
    
    write: str 
        path to write directory
    
    print_on: boolean, default=False
        Whether to print information during data processing
    
    read_df: pandas DataFrame
        index: (PlateBarcode, sequence)
        columns: ['PlateBarcode', 'seq', 'seq_count']
            PlateBarcode - Index read from fastq file
            seq - fastq read sequence
            seq_count - number of occurrences of seq in fastq
    
    
    APRIL 14 2014: ToDos:  
        2) Refactor external methods ( raslblast) 
        3) Package everything
        4) Consider GUI
    
    '''

    def __init__(self, fastq_path, sequencer_id, probes_path, blastdb_path, blastn_dir, well_annot, write_path, print_on=False ):
        #create a probe with name name
        
        self.RASLseqReads_obj = RASLseqReads.RASLseqReads(fastq_path, sequencer_id, print_on)
        
        self.RASLseqProbes_obj = RASLseqProbes.RASLseqProbes(probes_path, blastdb_path, blastn_dir)   
        
        self.RASLseqBCannot_obj = RASLseqBCannot.RASLseqBCannot(well_annot)
        
        self.RASLseqBCannot_obj.well_bc = self.RASLseqBCannot_obj.well_bc
        
        self.sequencer_id = sequencer_id
        
        self.print_on = print_on
        
        self.write_path = write_path
        
        #self.write_obj = open(write_path,'w')
        
        self.read_df = self.RASLseqReads_obj.read_df
        
        self.fastq_read_count = self.read_df.seq_count.sum()  #number of total reads found in fastq input    
        
        
        #filter thresholds
        self.bc_edit_dist_filter = 2
        
        self.blast_results_filter = {'length':30, 'qstart':6, 'obs_wellbc_len_max':10, 'obs_wellbc_len_min':6}
    
    
    
    def _get_probe_well_read_counts(self, collapsed_read_counts):
        '''
        This function aggregates probe-specific read counts for each plate and well
        
        Parameters
        ----------
        collapsed_read_counts: pandas dataframe
            must possess cols: ['plate_barcode','mapped_bc','probe']
            
        Returns
        -------
        Pandas Dataframe
            index: ['plate_barcode','WellBarcode']
            columns: Probe-specific read count sum (sum of counts across reads
            mapping by BLAST to probe)
        '''
        
        #grouping reads by plate barcode, well barcode, and probe
        collapsed_read_counts_group = collapsed_read_counts.groupby(['plate_barcode','mapped_bc','probe'])
        
        #aggregating probe counts for each well
        counts = collapsed_read_counts_group.seq_count.aggregate(np.sum)
        counts_df = counts.unstack(level=2)  #creating matrix of aggregated probe counts indexed on 'plate_barcode','mapped_bc'
        
        
        #id and removal of off-target ligation hits found by ! character in blast sseq name
        on_target_col = [i for i in counts_df.columns if "!" not in i]  
        counts_df = counts_df[on_target_col]  #removing off-target ligation hits from df
        counts_df.index.names= ['PlateBarcode', 'WellBarcode']
        return counts_df
    
    
    
    def _merge_plate_well_annot(self, probe_counts_df, well_annot_df):
        '''
        This function merges gene_counts_df with well annotations
        
        Parameters
        ----------
        probe_counts_df: Pandas DataFrame
            Requires pandas index: ('plate_barcode','WellBarcode')
        
        well_annot_path: Pandas DataFrame
            Requires pandas index: ('plate_barcode','WellBarcode')
        
        Returns
        -------
        Pandas DataFrame
            well_annot_df right joined to gene_counts_df
            index: ('plate_barcode','WellBarcode')
            
        '''
        return well_annot_df.join(probe_counts_df,how='right')
    
    
    

    def get_target_counts_df(self):
        '''
        This method uses a functional programming design to 
        produce a pandas DataFrame describing the RASLseq probe-specific
        read counts prepended with well specific annotations.
        

        '''
        
        #Add observed wellbc
        #self.read_df = RASLseqWellbc.get_rasl_probe_and_wellbc_exact_df(self.read_df, print_on=self.print_on)
        self.read_df['observed_wellbc'] = self.read_df['seq'].str[:8]
         
        #add fuzzy matched wellbc (mapped_bc)
        self.read_df = RASLseqWellbc.get_rasl_probe_and_wellbc_fuzzy_df(self.read_df, self.RASLseqBCannot_obj.well_bc, \
                            print_on=self.print_on)         
        
        #filter fuzzy wellbc mappings
        self.read_df = self.read_df[(self.read_df.bc_edit_dist.astype(float) < self.bc_edit_dist_filter)]
        
        #add observed rasl_probe_seq
        #self.read_df = RASLseqSeq.get_rasl_probe_exact_df(self.read_df)
        self.read_df['rasl_probe'] = self.read_df['seq'].str[23:65]
        
        #blast alignment
        self.read_df = RASLseqAlign.get_rasl_blast_df(self.read_df, self.RASLseqProbes_obj.blastn_dir, \
                            self.RASLseqProbes_obj.blastdb_file, print_on=self.print_on)
        
        #filtering blast results
        self.read_df = self.read_df[(self.read_df.length > self.blast_results_filter['length']) & \
                                    (self.read_df.qstart < self.blast_results_filter['qstart']) & \
                                    (self.read_df.observed_wellbc.map(len) < \
                                            self.blast_results_filter['obs_wellbc_len_max']) & \
                                    (self.read_df.observed_wellbc.map(len) > \
                                            self.blast_results_filter['obs_wellbc_len_min'])]     
        
        #SUM PROBE-SPECIFIC READ COUNTS
        self.read_df_counts = self._get_probe_well_read_counts(self.read_df)
        
        self.read_count_mapped = self.read_df_counts.sum().sum()
        
        
        #MERGING WELL ANNOTATIONS AND PROBE READ COUNTS
        #return self.read_df_counts, self.RASLseqBCannot_obj.well_annot_df
        self.RASLseqAnalysis_df = self._merge_plate_well_annot(self.read_df_counts, self.RASLseqBCannot_obj.well_annot_df)     
        
        
        self.RASLseqAnalysis_df.to_csv(self.write_path, sep="\t")
        self.read_df_counts.to_csv(self.write_path+'.read_df_counts', sep="\t")
        self.read_df.to_csv(self.write_path+'.read_df', sep="\t")
        
        os.system('gzip ' + self.write_path+'.read_df')
        
        return

Example Data File Paths



In [ ]:

    
cwd = os.getcwd()
data_dir = os.path.split(cwd)[0] + '/data/'
print data_dir



In [34]:

    
#FASTQ file
fq_path = data_dir +  'sample.fastq.gz'  

#Sequencer ID in FASTQ reads
seq_id = '@HISEQ'

#RASL probe information
probes = data_dir + 'sample.probes'

#Path to write blast database; NOTE: THIS WILL BE DELETED BEFORE THE END OF THE RUN
db_path = data_dir + 'blast_database'

#Path to blastn executable
blastn_path = '/usr/local/ncbi/blast/bin/' #this may be different for you

#Well annotations
well_annots = data_dir +  'sample.bc'  

#Path to write output files
write_path = data_dir + 'sample_output.txt'



In [35]:

    
!ls $data_dir









    



sample.bc                        sample.probes                    sample_output.txt.read_df_counts
sample.fastq.gz                  sample_output.txt



In [36]:

    
#Generates the FASTQ read, Probe, & Well Annotation dataframes
test = RASLseqAnalysis_obj_heur(fq_path, seq_id, probes, db_path, blastn_path, well_annots, write_path)



In [37]:

    
#Runs Blastn alignment, WellBarcode Mapping, Read Summarization, and Well Annotation 
test.get_target_counts_df()

#Generates 3 files:
#sample_output.txt : Contains the probe reads counts for each well
#sample_output.txt.read_df.gz : contains the blast alignment results for each unique read sequence
#sample_output.txt.read_df_counts : Contains probe read counts for each well with no well metadata 

#NOTE: if this command is run twice an addition sample_output.txt.read_df (19Mbytes file will be written, but not gzipped)

Expected Results for sample_output.txt



In [38]:

    
df = pd.read_table(write_path, sep='\t')
df









    Out[38]:






  
    
      
      PlateBarcode
      WellBarcode
      Concentration
      NM_000569.6_FCGR3A_2014///NM_000569.6_FCGR3A_2014
      NM_000569.6_FCGR3A_2119///NM_000569.6_FCGR3A_2119
      NM_000572.2_IL10_1007///NM_000572.2_IL10_1007
      NM_000572.2_IL10_787///NM_000572.2_IL10_787
      NM_000594.3_TNF_1327///NM_000594.3_TNF_1327
      NM_000594.3_TNF_1389///NM_000594.3_TNF_1389
      NM_001002_RPLP0_647///NM_001002_RPLP0_647
      NM_001002_RPLP0_909///NM_001002_RPLP0_909
      NM_001081455.1_P2RY14_2328///NM_001081455.1_P2RY14_2328
      NM_001081455.1_P2RY14_2526///NM_001081455.1_P2RY14_2526
    
  
  
    
      0 
       CATATTG
       AGCTCAGA
       NaN
       626
       605
       587
       593
       612
       599
       596
       590
       618
       614
    
    
      1 
       CATATTG
       ATCGATCG
       NaN
       587
       580
       592
       622
       640
       607
       621
       611
       601
       596
    
    
      2 
       CATATTG
       CTAGCTAG
       NaN
       618
       591
       581
       556
       604
       620
       610
       630
       608
       620
    
    
      3 
       CATATTG
       GACTGACT
       0.1
       584
       554
       585
       601
       595
       614
       598
       615
       629
       583
    
    
      4 
       GAGATTC
       AGCTCAGA
       NaN
       605
       629
       578
       590
       556
       613
       615
       615
       586
       651
    
    
      5 
       GAGATTC
       ATCGATCG
       0.1
       627
       612
       649
       638
       587
       635
       593
       598
       617
       615
    
    
      6 
       GAGATTC
       CTAGCTAG
       NaN
       604
       616
       603
       608
       598
       618
       584
       613
       629
       610
    
    
      7 
       GAGATTC
       GACTGACT
       NaN
       604
       611
       642
       589
       647
       632
       639
       606
       574
       625
    
    
      8 
       GAGCATG
       AGCTCAGA
       NaN
       645
       586
       602
       617
       617
       597
       598
       633
       629
       637
    
    
      9 
       GAGCATG
       ATCGATCG
       NaN
       606
       626
       603
       592
       627
       620
       653
       609
       604
       582
    
    
      10
       GAGCATG
       CTAGCTAG
       0.1
       607
       579
       618
       628
       610
       591
       627
       628
       673
       667
    
    
      11
       GAGCATG
       GACTGACT
       NaN
       650
       619
       638
       613
       637
       608
       582
       614
       607
       595
    
    
      12
       TGATAGT
       AGCTCAGA
       0.1
       602
       635
       614
       593
       598
       605
       625
       610
       628
       643
    
    
      13
       TGATAGT
       ATCGATCG
       NaN
       592
       605
       588
       591
       618
       633
       629
       597
       606
       624
    
    
      14
       TGATAGT
       CTAGCTAG
       NaN
       622
       589
       602
       610
       623
       618
       624
       567
       617
       580
    
    
      15
       TGATAGT
       GACTGACT
       NaN
       599
       601
       565
       647
       616
       637
       584
       587
       618
       655

The NaN values for the Concentration column are expected because the sample.bc file only contains information for 4 combinations of PlateBarcodes & WellBarcodes

	PlateBarcode	WellBarcode	Concentration	NM_000569.6_FCGR3A_2014///NM_000569.6_FCGR3A_2014	NM_000569.6_FCGR3A_2119///NM_000569.6_FCGR3A_2119	NM_000572.2_IL10_1007///NM_000572.2_IL10_1007	NM_000572.2_IL10_787///NM_000572.2_IL10_787	NM_000594.3_TNF_1327///NM_000594.3_TNF_1327	NM_000594.3_TNF_1389///NM_000594.3_TNF_1389	NM_001002_RPLP0_647///NM_001002_RPLP0_647	NM_001002_RPLP0_909///NM_001002_RPLP0_909	NM_001081455.1_P2RY14_2328///NM_001081455.1_P2RY14_2328	NM_001081455.1_P2RY14_2526///NM_001081455.1_P2RY14_2526
0	CATATTG	AGCTCAGA	NaN	626	605	587	593	612	599	596	590	618	614
1	CATATTG	ATCGATCG	NaN	587	580	592	622	640	607	621	611	601	596
2	CATATTG	CTAGCTAG	NaN	618	591	581	556	604	620	610	630	608	620
3	CATATTG	GACTGACT	0.1	584	554	585	601	595	614	598	615	629	583
4	GAGATTC	AGCTCAGA	NaN	605	629	578	590	556	613	615	615	586	651
5	GAGATTC	ATCGATCG	0.1	627	612	649	638	587	635	593	598	617	615
6	GAGATTC	CTAGCTAG	NaN	604	616	603	608	598	618	584	613	629	610
7	GAGATTC	GACTGACT	NaN	604	611	642	589	647	632	639	606	574	625
8	GAGCATG	AGCTCAGA	NaN	645	586	602	617	617	597	598	633	629	637
9	GAGCATG	ATCGATCG	NaN	606	626	603	592	627	620	653	609	604	582
10	GAGCATG	CTAGCTAG	0.1	607	579	618	628	610	591	627	628	673	667
11	GAGCATG	GACTGACT	NaN	650	619	638	613	637	608	582	614	607	595
12	TGATAGT	AGCTCAGA	0.1	602	635	614	593	598	605	625	610	628	643
13	TGATAGT	ATCGATCG	NaN	592	605	588	591	618	633	629	597	606	624
14	TGATAGT	CTAGCTAG	NaN	622	589	602	610	623	618	624	567	617	580
15	TGATAGT	GACTGACT	NaN	599	601	565	647	616	637	584	587	618	655