In [7]:
import glob
import os
import yaml
import re
import copy
from io import StringIO
import pandas as pd
import numpy as np
In [2]:
def open_sample_sheet(sample_sheet_fp, lanes=None, skiprows=18):
    """Read an IGM sample sheet and return a pandas DataFrame with the primary data table"""
    sample_sheet = pd.read_excel(sample_sheet_fp, skiprows=skiprows, header=1)
    # Strip whitespace from column names and homogenize case
    sample_sheet.columns = [x.strip().lower() for x in sample_sheet.columns]
    if lanes:
        sample_sheet = sample_sheet.loc[sample_sheet['lane'].isin(lanes)]
    if 'sample name' not in sample_sheet.columns and 'sample_id' in sample_sheet.columns:
        sample_sheet['sample name'] = sample_sheet['sample_id']
    return sample_sheet
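A minimal usage sketch, assuming a real IGM sample sheet at a hypothetical path (the filepath and lane values here are illustrative only):
In [ ]:
# Hypothetical filepath for illustration; lanes assumes an integer 'lane' column
ss = open_sample_sheet('./example/igm_sample_sheet.xlsx', lanes=[1, 2])
ss.head()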
In [3]:
def open_sequencing_manifest(manifest_fp, lanes=None, sheet_name='Sample Information', skiprows=18):
    """Read an IGM sequencing manifest and return a pandas DataFrame with the primary data table"""
    sample_sheet = pd.read_excel(manifest_fp, sheet_name=sheet_name, skiprows=skiprows, header=1)
    # Strip whitespace from column names and homogenize case
    sample_sheet.columns = [x.strip().lower() for x in sample_sheet.columns]
    if lanes:
        sample_sheet = sample_sheet.loc[sample_sheet['lane'].isin(lanes)]
    return sample_sheet
In [4]:
def get_read(sample, seq_dir, read):
    """Pull a given read based on sample name from the reads directory"""
    reads = glob.glob(os.path.join(seq_dir, "{0}_*_{1}_*.fastq.gz".format(sample, read)))
    if len(reads) == 1:
        return reads[0]
    elif len(reads) > 1:
        raise ValueError('Too many reads found for {0} in {1}:\n'
                         'read_str: {2}'.format(sample, seq_dir, reads))
    else:
        raise ValueError('No reads found for {0} in {1}'.format(sample, seq_dir))
In [5]:
def make_sample_dict(sample_sheet, seq_dir,
forward = 'R1',
reverse = 'R2',
adaptor = '$CONDA_ENV_PATH/share/trimmomatic-*/adapters/TruSeq3-PE-2.fa',
phred = 'phred33',
sample_header = 'Sample_Prefix',
sample_name = 'Sample',
tolerate_losses = True):
samples_pe = {'samples_pe': {}}
for x in sample_sheet.index:
try:
samples_pe['samples_pe'][sample_sheet.loc[x, sample_name]] = {
'forward': get_read(sample_sheet.loc[x, sample_header], seq_dir, forward),
'reverse': get_read(sample_sheet.loc[x, sample_header], seq_dir, reverse),
'adaptor': adaptor,
'phred': phred
}
        except ValueError:
            if tolerate_losses:
                print('No sequences found for %s' % sample_sheet.loc[x, sample_name])
            else:
                raise  # re-raise the original error with its message intact
return(samples_pe)
In [6]:
def format_yaml(RUN, samples_pe,
TMP_DIR_ROOT = '/localscratch',
host_db = '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
envs = {'KNEAD_ENV': 'source activate kneaddata',
'BOWTIE_ENV': 'source activate kneaddata',
'QC_ENV': 'source activate kneaddata',
'TRIM_ENV': 'source activate kneaddata',
'HUMANN2_ENV': 'source activate kneaddata',
'METAPHLAN_ENV': 'source activate kneaddata'},
software = {'gzip': 'gzip',
'trimmomatic': 'trimmomatic',
'mash': '/home/jgsanders/git_sw/git_bin/mash-Linux64-v1.1.1/mash',
'seqtk': '/home/jgsanders/git_sw/seqtk/seqtk'},
params = {'TRIMMOMATIC': {'QUAL': 'LEADING:20 TRAILING:20 AVGQUAL:30 MINLEN:32 TOPHRED33',
'ILLUMINACLIP': '2:30:10'},
'MASH': {'REFSEQ_DB': '/home/jgsanders/ref_data/mash/RefSeqSketchesDefaults.msh',
'OTHER': '-r -m 2 -k 21 -s 1000'},
'HUMANN2': {'METAPHLAN_DIR': '/home/jgsanders/share/metaphlan2',
'HUMANN2_NT_DB': '/home/jgsanders/ref_data/humann2/chocophlan',
'HUMANN2_AA_DB': '/home/jgsanders/ref_data/humann2/uniref',
'NORMS': ['cpm','relab'],
'OTHER': ''}},
default_flow_style = False):
config_str = ''
config_str += yaml.dump({'TMP_DIR_ROOT': TMP_DIR_ROOT}, default_flow_style = default_flow_style)
config_str += yaml.dump({'RUN': RUN}, default_flow_style = default_flow_style)
config_str += yaml.dump({'HOST_DB': host_db}, default_flow_style = default_flow_style)
config_str += yaml.dump({'ENVS': envs}, default_flow_style = default_flow_style)
config_str += yaml.dump({'SOFTWARE': software}, default_flow_style = default_flow_style)
config_str += yaml.dump({'PARAMS': params}, default_flow_style = default_flow_style)
config_str += yaml.dump(samples_pe, default_flow_style = default_flow_style)
return(config_str)
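Putting the helpers together, an end-to-end sketch; the filepaths and column names below are hypothetical (note that open_sample_sheet lowercases column names, so the defaults 'Sample_Prefix'/'Sample' must be overridden accordingly):
In [ ]:
# Hypothetical inputs for illustration
sample_sheet = open_sample_sheet('./example/igm_sample_sheet.xlsx')
samples_pe = make_sample_dict(sample_sheet, './example/reads',
                              sample_header='sample_prefix',
                              sample_name='sample name')
print(format_yaml('Run001', samples_pe))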
In [19]:
fps = ['Bs_S106_L001_R1_001.fastq.gz',
'Bs_S106_L001_R2_001.fastq.gz',
'Bs_S106_L002_R1_001.fastq.gz',
'Bs_S106_L002_R2_001.fastq.gz',
'Vf_S104_L001_R1_001.fastq.gz',
'Vf_S104_L001_R2_001.fastq.gz',
'Vf_S104_L002_R1_001.fastq.gz',
'Vf_S104_L002_R2_001.fastq.gz']
exp_df = pd.DataFrame.from_dict({'Extension': {0: 'fastq.gz',
1: 'fastq.gz',
2: 'fastq.gz',
3: 'fastq.gz',
4: 'fastq.gz',
5: 'fastq.gz',
6: 'fastq.gz',
7: 'fastq.gz'},
'File': {0: 'Bs_S106_L001_R1_001.fastq.gz',
1: 'Bs_S106_L001_R2_001.fastq.gz',
2: 'Bs_S106_L002_R1_001.fastq.gz',
3: 'Bs_S106_L002_R2_001.fastq.gz',
4: 'Vf_S104_L001_R1_001.fastq.gz',
5: 'Vf_S104_L001_R2_001.fastq.gz',
6: 'Vf_S104_L002_R1_001.fastq.gz',
7: 'Vf_S104_L002_R2_001.fastq.gz'},
'Index': {0: 'S106',
1: 'S106',
2: 'S106',
3: 'S106',
4: 'S104',
5: 'S104',
6: 'S104',
7: 'S104'},
'Lane': {0: 'L001',
1: 'L001',
2: 'L002',
3: 'L002',
4: 'L001',
5: 'L001',
6: 'L002',
7: 'L002'},
'Read': {0: 'R1',
1: 'R2',
2: 'R1',
3: 'R2',
4: 'R1',
5: 'R2',
6: 'R1',
7: 'R2'},
'Run': {0: '001',
1: '001',
2: '001',
3: '001',
4: '001',
5: '001',
6: '001',
7: '001'},
'Sample': {0: 'Bs',
1: 'Bs',
2: 'Bs',
3: 'Bs',
4: 'Vf',
5: 'Vf',
6: 'Vf',
7: 'Vf'}})
def parse_ilm_fps_to_df(fps,
                        pattern = r'^((.+?)_(S\d+)_(L\d+)_(R[12])_(\d+)\.(.+))$',
                        pattern_names = ['File','Sample','Index','Lane','Read','Run','Extension']):
    p = re.compile(pattern)
    # Collect matches as dicts and build the DataFrame once
    # (DataFrame.append was removed in pandas 2.0)
    rows = [dict(zip(pattern_names, m.groups()))
            for m in (p.match(f) for f in fps) if m]
    return pd.DataFrame(rows, columns = pattern_names)
obs_df = parse_ilm_fps_to_df(fps)
pd.testing.assert_frame_equal(obs_df.sort_index(axis=1), exp_df.sort_index(axis=1))
df = exp_df
seq_dir = './example/reads'
exp_dict = {'Bs': {'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
'./example/reads/Bs_S106_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
'./example/reads/Bs_S106_L002_R2_001.fastq.gz']},
'Vf': {'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
'./example/reads/Vf_S104_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
'./example/reads/Vf_S104_L002_R2_001.fastq.gz']}}
def get_sample_reads_df(df, seq_dir):
sample_reads_dict = {}
samples = list(set(df['Sample']))
for s in samples:
f_fps = sorted(list(df.loc[(df['Sample'] == s) & (df['Read'] == 'R1'),'File'].values))
r_fps = sorted(list(df.loc[(df['Sample'] == s) & (df['Read'] == 'R2'),'File'].values))
sample_reads_dict[s] = {'forward': [os.path.join(seq_dir, x) for x in f_fps],
'reverse': [os.path.join(seq_dir, x) for x in r_fps]}
return(sample_reads_dict)
obs_dict = get_sample_reads_df(df, seq_dir)
assert(exp_dict == obs_dict)
def get_sample_paths(seq_dir):
fps = os.listdir(seq_dir)
files_df = parse_ilm_fps_to_df(fps)
sample_reads_dict = get_sample_reads_df(files_df, seq_dir)
return(sample_reads_dict)
exp_db_dict = {'Bs': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
'./example/reads/Bs_S106_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
'./example/reads/Bs_S106_L002_R2_001.fastq.gz']},
'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
'./example/reads/Vf_S104_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
'./example/reads/Vf_S104_L002_R2_001.fastq.gz']}}
exp_db_dict_update = {'Bs': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
'./example/reads/Bs_S106_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
'./example/reads/Bs_S106_L002_R2_001.fastq.gz']},
'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/mouse/mouse',
'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
'./example/reads/Vf_S104_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
'./example/reads/Vf_S104_L002_R2_001.fastq.gz']}}
exp_db_dict_partial = {'Bs': {'filter_db': None,
'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
'./example/reads/Bs_S106_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
'./example/reads/Bs_S106_L002_R2_001.fastq.gz']},
'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/mouse/mouse',
'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
'./example/reads/Vf_S104_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
'./example/reads/Vf_S104_L002_R2_001.fastq.gz']}}
def add_filter_db(sample_fp_dict, db_fp, samples = None):
if samples is None:
samples = sample_fp_dict.keys()
samples_dict = copy.deepcopy(sample_fp_dict)
for s in samples_dict:
if s in samples:
samples_dict[s]['filter_db'] = db_fp
elif 'filter_db' in samples_dict[s]:
continue
else:
samples_dict[s]['filter_db'] = None
return(samples_dict)
obs_dict = add_filter_db(exp_dict, '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens')
assert(obs_dict == exp_db_dict)
obs_dict = add_filter_db(exp_db_dict, '/home/jgsanders/ref_data/genomes/mouse/mouse', samples = ['Vf'])
assert(obs_dict == exp_db_dict_update)
obs_dict = add_filter_db(exp_dict, '/home/jgsanders/ref_data/genomes/mouse/mouse', samples = ['Vf'])
assert(obs_dict == exp_db_dict_partial)
Steps:
First, check that the reads directory contains the expected files:
In [44]:
seq_dir = './example/reads'
!ls -l {seq_dir} | head
If that looks correct, execute the following cell. It should produce a dictionary associating samples with filepaths.
To validate, the dictionary is converted to YAML and printed below:
In [45]:
sample_paths = get_sample_paths(seq_dir)
print(yaml.dump(sample_paths, default_flow_style = False))
In [30]:
# read in sample sheet
def read_sample_sheet(f, sep='\t'):
data = False
data_lines = ''
for line in f:
if data:
if line.startswith('[') or line.strip() == '':
data = False
continue
data_lines += line
elif line.startswith('[Data]'):
data = True
continue
else:
continue
    data_df = pd.read_csv(StringIO(data_lines), sep=sep, comment='#')
return(data_df)
with open('./example/example_sample_sheet.txt', 'r') as f:
ss_df = read_sample_sheet(f)
# column for read prefixes
# column for sample names
# aggregate into dict
ss_df.head()
Out[30]:
In [31]:
seq_dir = './example/reads/'
fps = os.listdir(seq_dir)
files_df = parse_ilm_fps_to_df(fps)
files_df
Out[31]:
In [47]:
def get_sample_reads_df_from_sample_sheet(ss_df, seq_dir, sample_col='Description', prefix_col='Sample_ID'):
sample_reads_dict = {}
samples = list(set(ss_df[sample_col]))
fps = os.listdir(seq_dir)
files_df = parse_ilm_fps_to_df(fps)
for s in samples:
# get the subset of sample sheet rows for that sample
        sample_rows = ss_df.loc[ss_df[sample_col] == s, :]
f_fps = []
r_fps = []
# iter across subset table and find file corresponding to each row
for idx, row in sample_rows.iterrows():
lane = 'L{0:03d}'.format(row['Lane'])
            f_fps.extend(files_df.loc[(files_df['Sample'] == row[prefix_col]) &
                                      (files_df['Lane'] == lane) &
                                      (files_df['Read'] == 'R1'), 'File'].values)
            r_fps.extend(files_df.loc[(files_df['Sample'] == row[prefix_col]) &
                                      (files_df['Lane'] == lane) &
                                      (files_df['Read'] == 'R2'), 'File'].values)
f_fps = sorted(f_fps)
r_fps = sorted(r_fps)
sample_reads_dict[s] = {'forward': [os.path.join(seq_dir, x) for x in f_fps],
'reverse': [os.path.join(seq_dir, x) for x in r_fps]}
return(sample_reads_dict)
sample_reads_dict = get_sample_reads_df_from_sample_sheet(ss_df, seq_dir, sample_col='Description', prefix_col='Sample_ID')
print(yaml.dump(sample_reads_dict, default_flow_style = False))
The remaining cells assemble the following config entries (a sketch of the resulting structure follows this list):
- filter db per sample
- samples for binning
- samples for abundance profiling
- tmp dir
- envs
- software
- assembler
- trimmer
- params
  * atropos
  * maxbin
  * humann2
  * metaphlan
  * kraken
  * shogun?
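A minimal sketch of the structure those cells build up; every value below is a placeholder, not a real path or setting:
In [ ]:
# Placeholder values only -- the cells below fill in the real ones
config_sketch = {
    'samples': {},                 # per-sample fastq paths plus filter_db
    'binning_samples': [],         # samples to include in binning
    'abundance_samples': [],       # samples for abundance profiling
    'tmp_dir_root': '/tmp',
    'env': 'source activate <env>',
    'software': {},
    'assemblers': ['megahit'],
    'trimmer': 'atropos',
    'params': {},                  # atropos, maxbin, humann2, metaphlan, kraken, shogun
}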
In [48]:
config_dict = {}
These are the filepaths to the databases used for host filtering.
The add_filter_db function adds the provided filepath to the sample dictionary you created above.
By default, all samples get the same database.
To set different filter dbs for some samples, re-execute add_filter_db with a list of specific sample names to update; existing filter_db paths for samples not in that list remain unchanged.
Example:
filter_db1 = '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens'
filter_db2 = '/home/jgsanders/ref_data/genomes/mouse/mouse'
samples_dict = add_filter_db(sample_paths, filter_db1)
samples_dict = add_filter_db(samples_dict, filter_db2, samples = ['Vf'])
In [49]:
filter_db = '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens'
samples_dict = add_filter_db(sample_paths, filter_db)
print(yaml.dump(samples_dict, default_flow_style = False))
In [50]:
# binning_samples = ['Vf','Bs']
binning_samples = list(samples_dict.keys())
config_dict['binning_samples'] = binning_samples
In [51]:
#abundance_samples = ['Vf','Bs']
# Keep whichever of the next two assignments you want; the second overrides the first.
# This line filters out BLANK* samples from abundance mapping
abundance_samples = [x for x in samples_dict if not x.startswith('BLANK')]
# This line uses all samples for abundance mapping
abundance_samples = list(samples_dict.keys())
config_dict['abundance_samples'] = abundance_samples
In [52]:
config_dict['trimmer'] = 'atropos'
In [53]:
assemblers = ['megahit','metaspades']
config_dict['assemblers'] = assemblers
In [54]:
config_dict['mapping_assembler'] = 'metaspades'
In [55]:
tmp_dir_root = '/localscratch'
config_dict['tmp_dir_root'] = tmp_dir_root
In [56]:
config_dict['env'] = 'source activate snakemake_assemble'
config_dict['anvi_env'] = 'source activate sn_anvio'
config_dict['metaphlan_env'] = 'source activate humann2'
config_dict['humann2_env'] = 'source activate humann2'
config_dict['kraken_env'] = 'source activate kraken'
config_dict['shogun_env'] = 'source activate shogun'
In [57]:
software = {'snakemake_folder': '/home/jgsanders/git_sw/snakemake_assemble',
'seqtk': 'seqtk'}
config_dict['software'] = software
In [58]:
resources = {'centrifuge_base': '/home/jgsanders/miniconda/envs/anvio2/centrifuge',
'centrifuge_models': '/home/jgsanders/miniconda/envs/anvio2/centrifuge/b+h+v/b+h+v'}
config_dict['resources'] = resources
In [59]:
params = {}
Sets quality and adapter trimming parameters.
Some suggested defaults:
Nextera: -a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT -q 15 --minimum-length 100 --pair-filter any
KapaHyperPlus: -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any
In [60]:
params['atropos'] = ' -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any'
In [61]:
params['maxbin'] = '-plotmarker'
In [62]:
params['kraken'] = {'kraken_db': '/home/qiz173/Databases/Kraken/stdb'}
In [63]:
params['shogun'] = "--utree_indx /home/qiz173/Databases/SHOGUN/annotated/utree/stdb.ctr"
In [64]:
params['metaphlan'] = {'metaphlan_dir': '/home/jgsanders/git_sw/metaphlan2'}
Sets various parameters for HUMAnN2 functional profiling.
humann2_aa_db
: amino acid database for translated search
humann2_nt_db
: ChocoPhlAn database for nucleotide search
metaphlan_dir
: path to the MetaPhlAn2 install directory, which should also contain the default MetaPhlAn2 db
norms
: normalizations for which to output tables
other
: any other HUMAnN2 parameters to use
In [65]:
humann2 = {'humann2_aa_db': '/home/jgsanders/ref_data/humann2/uniref',
'humann2_nt_db': '/home/jgsanders/ref_data/humann2/chocophlan',
'metaphlan_dir': '/home/jgsanders/share/metaphlan2',
'norms': ['cpm','relab'],
'other': ''}
params['humann2'] = humann2
In [66]:
params['cram'] = ''
params['skewer'] = '-x ./adapters/HyperPlus.fa -n -l 100 -m any -q 15'
In [67]:
config_dict['params'] = params
config_dict['samples'] = samples_dict
Format as YAML and write to a filepath:
In [68]:
config_yaml = yaml.dump(config_dict, default_flow_style = False)
with open('config.yaml', 'w') as f:
f.write(config_yaml)
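As an optional sanity check, the file on disk can be loaded back with yaml.safe_load to confirm it round-trips to the same dictionary:
In [ ]:
# Optional sanity check: the written file should parse back to the same dict
with open('config.yaml') as f:
    assert yaml.safe_load(f) == config_dict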
This is the complete config YAML:
In [69]:
print(config_yaml)