Author: stefan.m.janssen@gmail.com
Date: 27 Dec 2016
language: Python 3.5
conda environment: micronota
license: unlicensed

summarize_observation_counts.ipynb


In [ ]:
import pandas as pd
import os.path

In [ ]:
path_root = '/path/to/data/' 
resultsFile = '../../data/otu-picking/observations.tsv'
masterMappingFile = '../../data/mapping-files/emp_qiime_mapping_release1.tsv'

bioms = {
    'gg'       :   {'label': 'cr_gg_seqs',
                    'biomsummary': path_root + '03-otus/01-closed-ref-greengenes/emp_cr_gg_13_8.summary.txt',
                    'biomfile':    path_root + '03-otus/01-closed-ref-greengenes/emp_cr_gg_13_8.biom'},
    'silva'    :   {'label': 'cr_silva_seqs',
                    'biomsummary': path_root + '03-otus/01-closed-ref-silva-16S/emp_cr_silva_16S_123.summary.txt',
                    'biomfile':    path_root + '03-otus/01-closed-ref-silva-16S/emp_cr_silva_16S_123.biom'},                
    'openref'  :   {'label': 'or_gg_seqs',
                    'biomsummary': path_root + '03-otus/02-open-ref-greengenes/emp_or_gg_13_8.summary.txt',
                    'biomfile':    path_root + '03-otus/02-open-ref-greengenes/emp_or_gg_13_8.biom'},                  
    'deblur90' :   {'label': 'deblur_90nt',
                    'biomsummary': path_root + '03-otus/04-deblur/emp.90.min25.deblur.withtax.summary.txt',
                    'biomfile':    path_root + '03-otus/04-deblur/emp.90.min25.deblur.withtax.biom'},                  
    'deblur100':   {'label': 'deblur_100nt',
                    'biomsummary': path_root + '03-otus/04-deblur/emp.100.min25.deblur.withtax.summary.txt',
                    'biomfile':    path_root + '03-otus/04-deblur/emp.100.min25.deblur.withtax.biom'},                  
    'deblur150':   {'label': 'deblur_150nt',
                    'biomsummary': path_root + '03-otus/04-deblur/emp.150.min25.deblur.withtax.summary.txt',
                    'biomfile':    path_root + '03-otus/04-deblur/emp.150.min25.deblur.withtax.biom'},
    'deblur100ot': {'label': 'deblur_100nt_onlytree',
                    'biomsummary': path_root + '03-otus/04-deblur/emp.100.min25.deblur.withtax.onlytree.summary.txt',
                    'biomfile':    path_root + '03-otus/04-deblur/emp.100.min25.deblur.withtax.onlytree.biom'},                  
    'deblur150ot': {'label': 'deblur_150nt_onlytree',
                    'biomsummary': path_root + '03-otus/04-deblur/emp.150.min25.deblur.withtax.onlytree.summary.txt',
                    'biomfile':    path_root + '03-otus/04-deblur/emp.150.min25.deblur.withtax.onlytree.biom'},
    'deblur90ot' : {'label': 'deblur_90nt_onlytree',
                    'biomsummary': path_root + '03-otus/04-deblur/emp.90.min25.deblur.withtax.onlytree.summary.txt',
                    'biomfile':    path_root + '03-otus/04-deblur/emp.90.min25.deblur.withtax.onlytree.biom'},
}
fasta = {
    'split-libraries' : {'label': 'split-libraries',
                         'dir': path_root + '01-split-libraries',  
                         'fnaFile': 'seqs.fna',          
                         'logfile': 'split_library_log.txt'},
    'adapter-clean-up': {'label': 'adapter-clean-up',
                         'dir': path_root + '02-adapter-clean-up', 
                         'fnaFile': 'filtered_seqs.fna', 
                         'filterFnaFile': 'seqs_to_filter.fna',
                         'logfile': ''},
}

In [ ]:
# Gathering all necessary information from the original files themselves is not feasible, because of the IO and CPU time this would take.
# The following method instead makes use of additional files that describe the BIOM tables and fasta files.
# This is fine, as long as those descriptors are up to date. Otherwise, we are screwed!
# These assumptions must hold for proper operation of this script:
#     1. each BIOM file is accompanied by a *.summary.txt file which holds the per-sample sequence counts after a 15-line header
#     2. each seqs.fna file has a split_library_log.txt which gives the sequence counts per sample. A study can comprise several runs, and counts might be split across those runs.
#     3. to avoid reading the filtered_seqs.fna files, we subtract the number of sequences to be filtered from the per-sample counts of the previous step. Thus, it is necessary that we have up-to-date 'seqs_to_filter.fna' files.
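# For illustration only (hypothetical content, not actual data), the descriptor files are expected to look roughly like this:
#     *.summary.txt          -> 15 header lines, then one "sampleID: count" line per sample
#     split_library_log.txt  -> per run, tab-separated "sampleID<TAB>count" lines
#     seqs_to_filter.fna     -> fasta headers of the form ">sampleID_readNumber ...", where the sampleID starts with the study ID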

############ STEP 1/4 ###########
print("Step 1/4: parsing summary information about BIOM " + str(len(bioms)) + " tables: ", end="")
o = {}
for name in bioms:
    print(".", end="")
    o[name] = pd.read_csv(bioms[name]['biomsummary'], sep=":", skiprows=15, index_col=0, names=["sampleID",bioms[name]['label']])
    o[name].index = o[name].index.map(str.upper) # use upper case 
observations = pd.concat([ o[name] for name in o], axis=1, join='outer')
print(" done.")


############ STEP 2/4 ###########
field = 'split-libraries'
logs = []
path = fasta[field]['dir']
filename= fasta[field]['logfile']
logfiles = !find "$path" -name "$filename"
tmpfile = "/tmp/sampleIDs"
print("Step 2/4: parsing " + str(len(logfiles)) + " '" + filename + "' files: ", end="")
for logfile in logfiles:
    print(".", end="")
    studyID = logfile.split("/")[-2]
    #a logfile can contain several entries, thus it is necessary to grep only the sampleID lines in a pre-processing step
    !grep "^$studyID" "$logfile" > "$tmpfile"
    log = pd.read_csv(tmpfile, sep="\t", index_col=0, names=['sampleID', fasta[field]['label']])
    log.index = log.index.map(str.upper) # use upper case 
    log = log.reset_index().groupby(log.index).sum() #counts might be spread over several runs, thus we need to sum them here
    logs.append(log)
logs = pd.concat(logs, axis=0)
observations = observations.merge(logs, left_index=True, right_index=True, how="outer")
print(" done.")


############ STEP 3/4 ###########
if fasta['split-libraries']['label'] in observations:
    field = 'adapter-clean-up'
    path = fasta[field]['dir']
    filename = fasta[field]['filterFnaFile']
    tmpfile = "/tmp/sampleIDs"
    filenames_filterSeqs = !find "$path" -name "$filename"
    pdsFiltered = []
    print("Step 3/4: parsing " + str(len(filenames_filterSeqs)) + " '" + filename + "' files: ", end="")
    for file in filenames_filterSeqs:
        print(".", end="")
        studyID = file.split("/")[-2]
        !grep "^>" "$file" | cut -d "_" -f 1 | sort | tr -d ">" | uniq -c | sed "s/^[ \t]*//" > "$tmpfile" 
        nrs = pd.read_csv(tmpfile, sep=" ", index_col=1, names=[fasta[field]['label'] + "-remove", 'sampleID'])
        nrs.index = nrs.index.map(str.upper) # use upper case 
        pdsFiltered.append( nrs )

    pdsFiltered = pd.concat(pdsFiltered, axis=0)

    #use the counted numbers of filtered sequences to compute the number of remaining sequences per sample
    x = observations.merge(pdsFiltered, left_index=True, right_index=True, how="outer")
    x.fillna(0, inplace=True)
    observations[fasta['adapter-clean-up']['label']] = observations[fasta['split-libraries']['label']] - x[fasta['adapter-clean-up']['label']+"-remove"]
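    # Worked example (hypothetical counts): a sample with 10,000 reads after split-libraries, of which 120
    # appear in seqs_to_filter.fna, ends up with an adapter-clean-up count of 10,000 - 120 = 9,880.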
    print(" done.")
else:
    print("Field '" + fasta['split-libraries']['label'] + "' is missing in the observations table. First compute this field, before computing the number of filtered sequences!")
    

############ STEP 4/4 ###########
print("Step 4/4: add sample names identical to master mapping file '" + str(masterMappingFile) + "': ", end="")
metadata = pd.read_csv(masterMappingFile, sep="\t", usecols=[0])
metadata['UC'] = metadata['#SampleID'].map(str.upper) # use upper case
metadata.set_index("UC", inplace=True)
observations = observations.merge(metadata, left_index=True, right_index=True, how="inner")
observations.set_index('#SampleID', inplace=True)
print(" done.")


#store results in one file
print("\nResults have been written to '" + resultsFile + "'.")
observations.to_csv(resultsFile, sep="\t")

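In [ ]:
# Optional sanity check (a minimal sketch, not part of the original pipeline): by construction, the
# adapter-clean-up counts are the split-libraries counts minus the filtered sequences, so they should
# never exceed the split-libraries counts. Column names are the labels defined in `fasta` above.
sl = fasta['split-libraries']['label']
ac = fasta['adapter-clean-up']['label']
if (sl in observations.columns) and (ac in observations.columns):
    check = observations[[sl, ac]].dropna()
    suspicious = check[check[ac] > check[sl]]
    print(str(len(suspicious)) + " of " + str(len(check)) + " samples have more sequences after adapter clean-up than before.")
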
In [ ]:
# The slow method: go through the content of the fasta files and aggregate their header lines. Takes a few hours, even on barnacle!
print("Execute the following commands on 'barnacle' and transfer the resulting files somewhere you have access to! If you do this on your local machine, ~0.5 TB must be transferred over the network.")
for field in ['split-libraries', 'adapter-clean-up']:
    path = fasta[field]['dir']
    file = fasta[field]['fnaFile']
    tmpFile = '/tmp/' + field + ".counts"
    cmd = "\tfor f in `find '" + path + "/' -name '" + file + "' | cut -d '_' -f 1 | sort | uniq -c | tr -d '>' | sed 's/^[ ]*//' >> '" + tmpFile + "'; done"
    print(cmd + "\n")

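# Each resulting *.counts file is expected to contain "count sampleID" lines (uniq -c output with leading
# blanks stripped), e.g. the hypothetical line "12345 STUDY.SAMPLE.1"; this is the format parsed below.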
obs = observations
for field in ['split-libraries', 'adapter-clean-up']:
    tmpFile = '/path/to/output/' + field + ".counts"
    if os.path.isfile(tmpFile):
        x = pd.read_csv(tmpFile, sep=" ", index_col=1, names=["INFASTA_" + field, "sampleID"])
        x.index = x.index.map(str.upper) # use upper case
        obs = obs.merge(x, left_index=True, right_index=True, how="outer")
obs.to_csv(resultsFile, sep="\t")