In [ ]:
import pandas as pd
import os.path
In [ ]:
path_root = '/path/to/data/'
resultsFile = '../../data/otu-picking/observations.tsv'
masterMappingFile = '../../data/mapping-files/emp_qiime_mapping_release1.tsv'
bioms = {
    'gg':          {'label': 'cr_gg_seqs',
                    'biomsummary': path_root + '03-otus/01-closed-ref-greengenes/emp_cr_gg_13_8.summary.txt',
                    'biomfile': path_root + '03-otus/01-closed-ref-greengenes/emp_cr_gg_13_8.biom'},
    'silva':       {'label': 'cr_silva_seqs',
                    'biomsummary': path_root + '03-otus/01-closed-ref-silva-16S/emp_cr_silva_16S_123.summary.txt',
                    'biomfile': path_root + '03-otus/01-closed-ref-silva-16S/emp_cr_silva_16S_123.biom'},
    'openref':     {'label': 'or_gg_seqs',
                    'biomsummary': path_root + '03-otus/02-open-ref-greengenes/emp_or_gg_13_8.summary.txt',
                    'biomfile': path_root + '03-otus/02-open-ref-greengenes/emp_or_gg_13_8.biom'},
    'deblur90':    {'label': 'deblur_90nt',
                    'biomsummary': path_root + '03-otus/04-deblur/emp.90.min25.deblur.withtax.summary.txt',
                    'biomfile': path_root + '03-otus/04-deblur/emp.90.min25.deblur.withtax.biom'},
    'deblur100':   {'label': 'deblur_100nt',
                    'biomsummary': path_root + '03-otus/04-deblur/emp.100.min25.deblur.withtax.summary.txt',
                    'biomfile': path_root + '03-otus/04-deblur/emp.100.min25.deblur.withtax.biom'},
    'deblur150':   {'label': 'deblur_150nt',
                    'biomsummary': path_root + '03-otus/04-deblur/emp.150.min25.deblur.withtax.summary.txt',
                    'biomfile': path_root + '03-otus/04-deblur/emp.150.min25.deblur.withtax.biom'},
    'deblur100ot': {'label': 'deblur_100nt_onlytree',
                    'biomsummary': path_root + '03-otus/04-deblur/emp.100.min25.deblur.withtax.onlytree.summary.txt',
                    'biomfile': path_root + '03-otus/04-deblur/emp.100.min25.deblur.withtax.onlytree.biom'},
    'deblur150ot': {'label': 'deblur_150nt_onlytree',
                    'biomsummary': path_root + '03-otus/04-deblur/emp.150.min25.deblur.withtax.onlytree.summary.txt',
                    'biomfile': path_root + '03-otus/04-deblur/emp.150.min25.deblur.withtax.onlytree.biom'},
    'deblur90ot':  {'label': 'deblur_90nt_onlytree',
                    'biomsummary': path_root + '03-otus/04-deblur/emp.90.min25.deblur.withtax.onlytree.summary.txt',
                    'biomfile': path_root + '03-otus/04-deblur/emp.90.min25.deblur.withtax.onlytree.biom'},
}
fasta = {
    'split-libraries':  {'label': 'split-libraries',
                         'dir': path_root + '01-split-libraries',
                         'fnaFile': 'seqs.fna',
                         'logfile': 'split_library_log.txt'},
    'adapter-clean-up': {'label': 'adapter-clean-up',
                         'dir': path_root + '02-adapter-clean-up',
                         'fnaFile': 'filtered_seqs.fna',
                         'filterFnaFile': 'seqs_to_filter.fna',
                         'logfile': ''},
}
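# Note: the steps below assume a per-study layout under these directories
# (hypothetical example; adjust path_root to your setup):
#   01-split-libraries/<studyID>/seqs.fna and split_library_log.txt
#   02-adapter-clean-up/<studyID>/filtered_seqs.fna and seqs_to_filter.fna
# The studyID is recovered from the parent directory name of each file found by `find`.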
In [ ]:
# Gathering all necessary information from the original files themselves is not feasible because of IO and CPU time.
# Instead, the following approach uses additional descriptor files that summarize the BIOM tables and fasta files.
# This is fine as long as those descriptors are up to date. Otherwise, the resulting counts will be wrong!
# These assumptions must hold for this script to work properly:
# 1. Each BIOM file is accompanied by a *.summary.txt file which holds the per-sample counts after a 15-line header.
# 2. Each seqs.fna file has a split_library_log.txt which gives per-sample sequence counts. A study can have several runs, and counts may be split across those runs.
# 3. To avoid reading the filtered_seqs.fna files, we subtract the number of sequences to be filtered from the per-sample counts of the previous step. It is therefore necessary that the 'seqs_to_filter.fna' files are up to date.
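# Hypothetical excerpts of the descriptor files parsed below (IDs and counts are made up):
#
# *.summary.txt -- per-sample counts follow a 15-line header, one "sampleID: count" per line:
#   1001.SAMPLE.A: 12345.0
#   1001.SAMPLE.B: 6789.0
#
# split_library_log.txt -- tab-separated "sampleID<TAB>count" lines whose sample IDs start
# with the study ID; a sample sequenced in several runs appears on several lines:
#   1001.SAMPLE.A	1000
#   1001.SAMPLE.A	250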
############ STEP 1/4 ###########
print("Step 1/4: parsing summary information for " + str(len(bioms)) + " BIOM tables: ", end="")
o = {}
for name in bioms:
    print(".", end="")
    o[name] = pd.read_csv(bioms[name]['biomsummary'], sep=":", skiprows=15, index_col=0, names=["sampleID", bioms[name]['label']])
    o[name].index = o[name].index.map(str.upper)  # use upper case sample IDs throughout
observations = pd.concat([o[name] for name in o], axis=1, join='outer')
print(" done.")
############ STEP 2/4 ###########
field = 'split-libraries'
logs = []
path = fasta[field]['dir']
filename = fasta[field]['logfile']
logfiles = !find "$path" -name "$filename"
tmpfile = "/tmp/sampleIDs"
print("Step 2/4: parsing " + str(len(logfiles)) + " '" + filename + "' files: ", end="")
for logfile in logfiles:
    print(".", end="")
    studyID = logfile.split("/")[-2]
    # a logfile can contain several kinds of entries, thus we grep only the per-sample count lines (those starting with the study ID) in a pre-processing step
    !grep "^$studyID" "$logfile" > "$tmpfile"
    log = pd.read_csv(tmpfile, sep="\t", index_col=0, names=['sampleID', fasta[field]['label']])
    log.index = log.index.map(str.upper)  # use upper case sample IDs throughout
    log = log.groupby(level=0).sum()  # counts might be spread over several runs, thus we sum them per sample
    logs.append(log)
logs = pd.concat(logs, axis=0)
observations = observations.merge(logs, left_index=True, right_index=True, how="outer")
print(" done.")
############ STEP 3/4 ###########
if fasta['split-libraries']['label'] in observations:
    field = 'adapter-clean-up'
    path = fasta[field]['dir']
    filename = fasta[field]['filterFnaFile']
    tmpfile = "/tmp/sampleIDs"
    filenames_filterSeqs = !find "$path" -name "$filename"
    pdsFiltered = []
    print("Step 3/4: parsing " + str(len(filenames_filterSeqs)) + " '" + filename + "' files: ", end="")
    for file in filenames_filterSeqs:
        print(".", end="")
        # count the to-be-filtered sequences per sample from the fasta headers
        !grep "^>" "$file" | cut -d "_" -f 1 | sort | tr -d ">" | uniq -c | sed "s/^[ \t]*//" > "$tmpfile"
        nrs = pd.read_csv(tmpfile, sep=" ", index_col=1, names=[fasta[field]['label'] + "-remove", 'sampleID'])
        nrs.index = nrs.index.map(str.upper)  # use upper case sample IDs throughout
        pdsFiltered.append(nrs)
    pdsFiltered = pd.concat(pdsFiltered, axis=0)
    # use the counted numbers of filtered sequences to compute the number of remaining sequences
    x = observations.merge(pdsFiltered, left_index=True, right_index=True, how="outer")
    x.fillna(0, inplace=True)
    observations[fasta['adapter-clean-up']['label']] = observations[fasta['split-libraries']['label']] - x[fasta['adapter-clean-up']['label'] + "-remove"]
    print(" done.")
else:
    print("Field '" + fasta['split-libraries']['label'] + "' is missing in the observations table. Compute this field first, before computing the number of filtered sequences!")
############ STEP 4/4 ###########
print("Step 4/4: add sample names identical to master mapping file '" + str(masterMappingFile) + "': ", end="")
metadata = pd.read_csv(masterMappingFile, sep="\t", usecols=[0])
metadata['UC'] = metadata['#SampleID'].map(str.upper) # use upper case
metadata.set_index("UC", inplace=True)
observations = observations.merge(metadata, left_index=True, right_index=True, how="inner")
observations.set_index('#SampleID', inplace=True)
print(" done.")
# store the results in one file
observations.to_csv(resultsFile, sep="\t")
print("\nResults have been written to '" + resultsFile + "'.")
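# Hypothetical excerpt of the resulting observations.tsv (tab-separated; one row per sample,
# one column per OTU table / processing stage; the values shown here are made up):
#   #SampleID	cr_gg_seqs	cr_silva_seqs	...	split-libraries	adapter-clean-up
#   1001.SAMPLE.A	10000	10100	...	12000	11800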
In [ ]:
# The slow method: go through the content of the fasta files and aggregate the header lines. This takes a few hours, even on barnacle!
print("Execute the following commands on 'barnacle' and transfer the resulting files somewhere you have access to! If you do this on your local machine, ~0.5 TB would have to be transferred over the network.")
for field in ['split-libraries', 'adapter-clean-up']:
    path = fasta[field]['dir']
    file = fasta[field]['fnaFile']
    tmpFile = '/tmp/' + field + ".counts"
    cmd = "\tfor f in `find '" + path + "/' -name '" + file + "'`; do grep '^>' \"$f\" | cut -d '_' -f 1 | sort | uniq -c | tr -d '>' | sed 's/^[ ]*//' >> '" + tmpFile + "'; done"
    print(cmd + "\n")
obs = observations
for field in ['split-libraries', 'adapter-clean-up']:
    tmpFile = '/path/to/output/' + field + ".counts"
    if os.path.isfile(tmpFile):
        x = pd.read_csv(tmpFile, sep=" ", index_col=1, names=["INFASTA_" + field, "sampleID"])
        x.index = x.index.map(str.upper)  # use upper case sample IDs throughout
        obs = obs.merge(x, left_index=True, right_index=True, how="outer")
obs.to_csv(resultsFile, sep="\t")