In [7]:
import glob
import os
import yaml
import re
import copy
from io import StringIO
import pandas as pd
import numpy as np

In [2]:
def open_sample_sheet(sample_sheet_fp, lanes=False, skiprows=18):
    """Read in an IGM sample sheet and return a pandas DF with primary data table"""
    sample_sheet = pd.read_excel(sample_sheet_fp, skiprows = skiprows, header=1)
    
    # Remove trailing whitespace from column names and homogenize case
    sample_sheet.columns = [x.strip().lower() for x in sample_sheet.columns]
    
    if lanes:
        sample_sheet = sample_sheet.loc[sample_sheet['lane'].isin(lanes)]
    if 'sample name' not in sample_sheet.columns and 'sample_id' in sample_sheet.columns:
        sample_sheet['sample name'] = sample_sheet['sample_id']

    return(sample_sheet)

In [3]:
def open_sequencing_manifest(manifest_fp, lanes='False', sheetname='Sample Information', skiprows=18):
    """Read in an IGM sequencing manifest and return a pandas DF with primary data table"""
    sample_sheet = pd.read_excel(manifest_fp, sheetname=sheetname, skiprows=skiprows, header=1)
    
    # Remove trailing whitespace from column names and homogenize case
    sample_sheet.columns = [x.strip().lower() for x in sample_sheet.columns]
    
    if lanes:
        sample_sheet = sample_sheet.loc[sample_sheet['lane'].isin(lanes)]
        
    return(sample_sheet)

In [4]:
def get_read(sample, seq_dir, read):
    """Function to pull a given read based on sample name from the reads directory"""
    reads = glob.glob(os.path.join(seq_dir, "{0}_*_{1}_*.fastq.gz".format(sample, read)))
    if len(reads) == 1:
        return(reads[0])
    elif len(reads) > 1:
        raise ValueError('Too many reads found for {0} in {1}:\n'
                         'read_str: {2}'.format(sample, seq_dir, reads))
    elif len(reads) < 1:
        raise ValueError('Too few reads found for {0} in {1}:\n'
                         'read_str: {1}\n'.format(sample, seq_dir, reads))

In [5]:
def make_sample_dict(sample_sheet, seq_dir,
                        forward = 'R1',
                        reverse = 'R2',
                        adaptor = '$CONDA_ENV_PATH/share/trimmomatic-*/adapters/TruSeq3-PE-2.fa',
                        phred = 'phred33',
                        sample_header = 'Sample_Prefix',
                        sample_name = 'Sample',
                        tolerate_losses = True):
    
    samples_pe = {'samples_pe': {}}
    for x in sample_sheet.index:
        try:
            samples_pe['samples_pe'][sample_sheet.loc[x, sample_name]] = {
               'forward': get_read(sample_sheet.loc[x, sample_header], seq_dir, forward),
               'reverse': get_read(sample_sheet.loc[x, sample_header], seq_dir, reverse),
               'adaptor': adaptor,
               'phred': phred
            }
        except ValueError:
            if tolerate_losses:
                print('No sequences found for %s' % sample_sheet.loc[x, sample_name])
            else:
                raise ValueError
    
    return(samples_pe)

In [6]:
def format_yaml(RUN, samples_pe,
                   TMP_DIR_ROOT = '/localscratch',
                   host_db = '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
                   envs = {'KNEAD_ENV': 'source activate kneaddata',
                           'BOWTIE_ENV': 'source activate kneaddata',
                           'QC_ENV': 'source activate kneaddata',
                           'TRIM_ENV': 'source activate kneaddata',
                           'HUMANN2_ENV': 'source activate kneaddata',
                           'METAPHLAN_ENV': 'source activate kneaddata'},
                   software = {'gzip': 'gzip',
                               'trimmomatic': 'trimmomatic',
                               'mash': '/home/jgsanders/git_sw/git_bin/mash-Linux64-v1.1.1/mash',
                               'seqtk': '/home/jgsanders/git_sw/seqtk/seqtk'},
                   params = {'TRIMMOMATIC': {'QUAL': 'LEADING:20 TRAILING:20 AVGQUAL:30 MINLEN:32 TOPHRED33',
                                              'ILLUMINACLIP': '2:30:10'},
                             'MASH': {'REFSEQ_DB': '/home/jgsanders/ref_data/mash/RefSeqSketchesDefaults.msh',
                                      'OTHER': '-r -m 2 -k 21 -s 1000'},
                             'HUMANN2': {'METAPHLAN_DIR': '/home/jgsanders/share/metaphlan2',
                                         'HUMANN2_NT_DB': '/home/jgsanders/ref_data/humann2/chocophlan',
                                         'HUMANN2_AA_DB': '/home/jgsanders/ref_data/humann2/uniref',
                                         'NORMS': ['cpm','relab'],
                                         'OTHER': ''}},
                   default_flow_style = False):
    
    config_str = ''

    config_str += yaml.dump({'TMP_DIR_ROOT': TMP_DIR_ROOT}, default_flow_style = default_flow_style)

    config_str += yaml.dump({'RUN': RUN}, default_flow_style = default_flow_style)
    
    config_str += yaml.dump({'HOST_DB': host_db}, default_flow_style = default_flow_style)
    
    config_str += yaml.dump({'ENVS': envs}, default_flow_style = default_flow_style)            
    
    config_str += yaml.dump({'SOFTWARE': software}, default_flow_style = default_flow_style)

    config_str += yaml.dump({'PARAMS': params}, default_flow_style = default_flow_style)

    config_str += yaml.dump(samples_pe, default_flow_style = default_flow_style)
    
    return(config_str)

In [19]:
fps = ['Bs_S106_L001_R1_001.fastq.gz',
       'Bs_S106_L001_R2_001.fastq.gz',
       'Bs_S106_L002_R1_001.fastq.gz',
       'Bs_S106_L002_R2_001.fastq.gz',
       'Vf_S104_L001_R1_001.fastq.gz',
       'Vf_S104_L001_R2_001.fastq.gz',
       'Vf_S104_L002_R1_001.fastq.gz',
       'Vf_S104_L002_R2_001.fastq.gz']

exp_df = pd.DataFrame.from_dict({'Extension': {0: 'fastq.gz',
  1: 'fastq.gz',
  2: 'fastq.gz',
  3: 'fastq.gz',
  4: 'fastq.gz',
  5: 'fastq.gz',
  6: 'fastq.gz',
  7: 'fastq.gz'},
 'File': {0: 'Bs_S106_L001_R1_001.fastq.gz',
  1: 'Bs_S106_L001_R2_001.fastq.gz',
  2: 'Bs_S106_L002_R1_001.fastq.gz',
  3: 'Bs_S106_L002_R2_001.fastq.gz',
  4: 'Vf_S104_L001_R1_001.fastq.gz',
  5: 'Vf_S104_L001_R2_001.fastq.gz',
  6: 'Vf_S104_L002_R1_001.fastq.gz',
  7: 'Vf_S104_L002_R2_001.fastq.gz'},
 'Index': {0: 'S106',
  1: 'S106',
  2: 'S106',
  3: 'S106',
  4: 'S104',
  5: 'S104',
  6: 'S104',
  7: 'S104'},
 'Lane': {0: 'L001',
  1: 'L001',
  2: 'L002',
  3: 'L002',
  4: 'L001',
  5: 'L001',
  6: 'L002',
  7: 'L002'},
 'Read': {0: 'R1',
  1: 'R2',
  2: 'R1',
  3: 'R2',
  4: 'R1',
  5: 'R2',
  6: 'R1',
  7: 'R2'},
 'Run': {0: '001',
  1: '001',
  2: '001',
  3: '001',
  4: '001',
  5: '001',
  6: '001',
  7: '001'},
 'Sample': {0: 'Bs',
  1: 'Bs',
  2: 'Bs',
  3: 'Bs',
  4: 'Vf',
  5: 'Vf',
  6: 'Vf',
  7: 'Vf'}})

def parse_ilm_fps_to_df(fps,
                        pattern = '^((.+?)_(S\d+)_(L\d+)_(R[12])_(\d+)\.(.+))$',
                        pattern_names = ['File','Sample','Index','Lane','Read','Run','Extension']):
    
    p = re.compile(pattern)

    df = pd.DataFrame(columns = pattern_names)

    for f in fps:
        m = p.match(f)

        if m:
            df = df.append(dict(zip(pattern_names, m.groups())), ignore_index = True)

    return(df)

obs_df = parse_ilm_fps_to_df(fps)
pd.util.testing.assert_frame_equal(obs_df.sort_index(axis=1), exp_df.sort_index(axis=1))

df = exp_df
seq_dir = './example/reads'
exp_dict = {'Bs': {'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
                               './example/reads/Bs_S106_L002_R1_001.fastq.gz'],
                   'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
                               './example/reads/Bs_S106_L002_R2_001.fastq.gz']},
            'Vf': {'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
                               './example/reads/Vf_S104_L002_R1_001.fastq.gz'],
                   'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
                               './example/reads/Vf_S104_L002_R2_001.fastq.gz']}}

def get_sample_reads_df(df, seq_dir):
    sample_reads_dict = {}
    
    samples = list(set(df['Sample']))
    
    for s in samples:
        f_fps = sorted(list(df.loc[(df['Sample'] == s) & (df['Read'] == 'R1'),'File'].values))
        r_fps = sorted(list(df.loc[(df['Sample'] == s) & (df['Read'] == 'R2'),'File'].values))

        sample_reads_dict[s] = {'forward': [os.path.join(seq_dir, x) for x in f_fps],
                                'reverse': [os.path.join(seq_dir, x) for x in r_fps]}
    
    return(sample_reads_dict)

obs_dict = get_sample_reads_df(df, seq_dir)
assert(exp_dict == obs_dict)

def get_sample_paths(seq_dir):
    fps = os.listdir(seq_dir)
    
    files_df = parse_ilm_fps_to_df(fps)
    sample_reads_dict = get_sample_reads_df(files_df, seq_dir)
    
    return(sample_reads_dict)

exp_db_dict = {'Bs': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
                      'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
                                  './example/reads/Bs_S106_L002_R1_001.fastq.gz'],
                      'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
                                  './example/reads/Bs_S106_L002_R2_001.fastq.gz']},
               'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
                      'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
                                  './example/reads/Vf_S104_L002_R1_001.fastq.gz'],
                      'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
                                  './example/reads/Vf_S104_L002_R2_001.fastq.gz']}}

exp_db_dict_update = {'Bs': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
                      'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
                                  './example/reads/Bs_S106_L002_R1_001.fastq.gz'],
                      'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
                                  './example/reads/Bs_S106_L002_R2_001.fastq.gz']},
               'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/mouse/mouse',
                      'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
                                  './example/reads/Vf_S104_L002_R1_001.fastq.gz'],
                      'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
                                  './example/reads/Vf_S104_L002_R2_001.fastq.gz']}}

exp_db_dict_partial = {'Bs': {'filter_db': None,
                      'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
                                  './example/reads/Bs_S106_L002_R1_001.fastq.gz'],
                      'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
                                  './example/reads/Bs_S106_L002_R2_001.fastq.gz']},
               'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/mouse/mouse',
                      'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
                                  './example/reads/Vf_S104_L002_R1_001.fastq.gz'],
                      'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
                                  './example/reads/Vf_S104_L002_R2_001.fastq.gz']}}

def add_filter_db(sample_fp_dict, db_fp, samples = None):
    
    if samples is None:
        samples = sample_fp_dict.keys()
    
    samples_dict = copy.deepcopy(sample_fp_dict)
    
    for s in samples_dict:
        if s in samples:
            samples_dict[s]['filter_db'] = db_fp
        elif 'filter_db' in samples_dict[s]:
            continue
        else:
            samples_dict[s]['filter_db'] = None
        

        
    return(samples_dict)

obs_dict = add_filter_db(exp_dict, '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens')

assert(obs_dict == exp_db_dict)

obs_dict = add_filter_db(exp_db_dict, '/home/jgsanders/ref_data/genomes/mouse/mouse', samples = ['Vf'])

assert(obs_dict == exp_db_dict_update)

obs_dict = add_filter_db(exp_dict, '/home/jgsanders/ref_data/genomes/mouse/mouse', samples = ['Vf'])

assert(obs_dict == exp_db_dict_partial)

Steps:

  1. Get samples
    • read in sample sheet
    • guess samples from sequence folder
  2. Get params
    • filter db per sample
    • samples for binning
    • samples for abundance profiling
    • tmp dir
    • envs
    • software
    • assembler
    • trimmer
    • params
      • atropos
      • maxbin
      • humann2
      • metaphlan
      • kraken
      • shogun?
  3. format yaml

Step 1: get samples

First, we need to read in a set of samples for analysis.

There are two options for this: guess the sample names, or read them in from a sample sheet or manifest.

Option 1: guess sample names from sequence output folder.

Enter the correct sequencing directory below, and a list of some example filenames should pop up. Make sure these look correct.


In [44]:
seq_dir = './example/reads'
!ls -l {seq_dir} | head


total 180864
-rw-r--r--+ 1 jonsanders  staff  14312564 Aug  7 05:23 Bs_S106_L001_R1_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff   9523184 Aug  7 05:23 Bs_S106_L001_R2_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff  10451788 Aug  7 05:23 Bs_S106_L002_R1_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff   9305476 Aug  7 05:23 Bs_S106_L002_R2_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff  13733195 Aug  7 05:23 Vf_S104_L001_R1_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff   8845407 Aug  7 05:23 Vf_S104_L001_R2_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff  16096558 Aug  7 05:23 Vf_S104_L002_R1_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff  10323953 Aug  7 05:23 Vf_S104_L002_R2_001.fastq.gz

If that looks correct, execute the following cell. It should result in a dictionary associating samples to filepaths.

To validate, a portion of the dictionary is converted to yaml and printed below:


In [45]:
sample_paths = get_sample_paths(seq_dir)

print(yaml.dump(sample_paths, default_flow_style = False))


Bs:
  forward:
  - ./example/reads/Bs_S106_L001_R1_001.fastq.gz
  - ./example/reads/Bs_S106_L002_R1_001.fastq.gz
  reverse:
  - ./example/reads/Bs_S106_L001_R2_001.fastq.gz
  - ./example/reads/Bs_S106_L002_R2_001.fastq.gz
Vf:
  forward:
  - ./example/reads/Vf_S104_L001_R1_001.fastq.gz
  - ./example/reads/Vf_S104_L002_R1_001.fastq.gz
  reverse:
  - ./example/reads/Vf_S104_L001_R2_001.fastq.gz
  - ./example/reads/Vf_S104_L002_R2_001.fastq.gz

Option 2: read sample names and prefixes from sample sheet

This option is in development


In [30]:
# read in sample sheet

def read_sample_sheet(f, sep='\t'):
    data = False
    data_lines = ''
    for line in f:
        if data:
            if line.startswith('[') or line.strip() == '':
                data = False
                continue
            data_lines += line
        elif line.startswith('[Data]'):
            data = True
            continue
        else:
            continue
            
    data_df = pd.read_csv(StringIO(data_lines), sep = '\t', comment='#')
    return(data_df)

with open('./example/example_sample_sheet.txt', 'r') as f:
    ss_df = read_sample_sheet(f)

# column for read prefixes
# column for sample names
# aggregate into dict

ss_df.head()


Out[30]:
Lane Sample_ID Sample_Name Sample_Plate Sample_Well I7_Index_ID index I5_Index_ID index2 Sample_Project Description
0 1 Bs Bs Example Plate 1 A1 iTru7_101_01 ACGTTACC iTru5_01_A ACCGACAA Example Project Bs.1
1 1 Vf Vf Example Plate 1 B1 iTru7_101_02 CTGTGTTG iTru5_01_B AGTGGCAA Example Project Vf.1
2 2 Bs Bs Example Plate 1 A1 iTru7_101_01 ACGTTACC iTru5_01_A ACCGACAA Example Project Bs.1
3 2 Vf Vf Example Plate 1 B1 iTru7_101_02 CTGTGTTG iTru5_01_B AGTGGCAA Example Project Vf.1

In [31]:
seq_dir = './example/reads/'

fps = os.listdir(seq_dir)

files_df = parse_ilm_fps_to_df(fps)

files_df


Out[31]:
File Sample Index Lane Read Run Extension
0 Bs_S106_L001_R1_001.fastq.gz Bs S106 L001 R1 001 fastq.gz
1 Bs_S106_L001_R2_001.fastq.gz Bs S106 L001 R2 001 fastq.gz
2 Bs_S106_L002_R1_001.fastq.gz Bs S106 L002 R1 001 fastq.gz
3 Bs_S106_L002_R2_001.fastq.gz Bs S106 L002 R2 001 fastq.gz
4 Vf_S104_L001_R1_001.fastq.gz Vf S104 L001 R1 001 fastq.gz
5 Vf_S104_L001_R2_001.fastq.gz Vf S104 L001 R2 001 fastq.gz
6 Vf_S104_L002_R1_001.fastq.gz Vf S104 L002 R1 001 fastq.gz
7 Vf_S104_L002_R2_001.fastq.gz Vf S104 L002 R2 001 fastq.gz

In [47]:
def get_sample_reads_df_from_sample_sheet(ss_df, seq_dir, sample_col='Description', prefix_col='Sample_ID'):
    sample_reads_dict = {}
    
    samples = list(set(ss_df[sample_col]))
    
    fps = os.listdir(seq_dir)

    files_df = parse_ilm_fps_to_df(fps)
    
    for s in samples:
        # get the subset of sample sheet rows for that sample
        sample_rows = ss_df.loc[ss_df[sample_col] == s,]
        
        f_fps = []
        r_fps = []
        
        # iter across subset table and find file corresponding to each row
        for idx, row in sample_rows.iterrows():
            lane = 'L{0:03d}'.format(row['Lane']) 
            
            f_fps.extend(files_df.loc[(files_df['Sample'] == row['Sample_ID']) & 
                                      (files_df['Lane'] == lane) &
                                      (files_df['Read'] == 'R1'), 'File'].values)
            
            r_fps.extend(files_df.loc[(files_df['Sample'] == row['Sample_ID']) & 
                                      (files_df['Lane'] == lane) &
                                      (files_df['Read'] == 'R2'), 'File'].values)
        
        f_fps = sorted(f_fps)
        r_fps = sorted(r_fps)
        
        sample_reads_dict[s] = {'forward': [os.path.join(seq_dir, x) for x in f_fps],
                                'reverse': [os.path.join(seq_dir, x) for x in r_fps]}
    
    return(sample_reads_dict)

sample_reads_dict = get_sample_reads_df_from_sample_sheet(ss_df, seq_dir,  sample_col='Description', prefix_col='Sample_ID')
print(yaml.dump(sample_reads_dict, default_flow_style = False))


Bs.1:
  forward:
  - ./example/reads/Bs_S106_L001_R1_001.fastq.gz
  - ./example/reads/Bs_S106_L002_R1_001.fastq.gz
  reverse:
  - ./example/reads/Bs_S106_L001_R2_001.fastq.gz
  - ./example/reads/Bs_S106_L002_R2_001.fastq.gz
Vf.1:
  forward:
  - ./example/reads/Vf_S104_L001_R1_001.fastq.gz
  - ./example/reads/Vf_S104_L002_R1_001.fastq.gz
  reverse:
  - ./example/reads/Vf_S104_L001_R2_001.fastq.gz
  - ./example/reads/Vf_S104_L002_R2_001.fastq.gz

Step 2: define parameters

- filter db per sample
- samples for binning
- samples for abundance profiling
- tmp dir
- envs
- software
- assembler
- trimmer
- params
    * atropos
    * maxbin 
    * humann2
    * metaphlan
    * kraken
    * shogun?

In [48]:
config_dict = {}

Add filter databases per sample

These are the filepaths to the databases used for host filtering.

The method 'add host filter db' adds the provided filepath to the sample dictionary you created above.

By default, all samples get same database.

You can also provide a list of specific sample names to update if you want to set different filter dbs for some samples. To do this, just re-execute add_filter_db providing a list of samples to update; the existing filter_db paths will remain unchanged if they are not in this list.

Example:

filter_db1 = '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens'
filter_db2 = '/home/jgsanders/ref_data/genomes/mouse/mouse'

samples_dict = add_filter_db(sample_paths, filter_db1)
samples_dict = add_filter_db(samples_dict, filter_db2, samples = ['Vf'])

In [49]:
filter_db = '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens'

samples_dict = add_filter_db(sample_paths, filter_db)

print(yaml.dump(samples_dict, default_flow_style = False))


Bs:
  filter_db: /home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens
  forward:
  - ./example/reads/Bs_S106_L001_R1_001.fastq.gz
  - ./example/reads/Bs_S106_L002_R1_001.fastq.gz
  reverse:
  - ./example/reads/Bs_S106_L001_R2_001.fastq.gz
  - ./example/reads/Bs_S106_L002_R2_001.fastq.gz
Vf:
  filter_db: /home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens
  forward:
  - ./example/reads/Vf_S104_L001_R1_001.fastq.gz
  - ./example/reads/Vf_S104_L002_R1_001.fastq.gz
  reverse:
  - ./example/reads/Vf_S104_L001_R2_001.fastq.gz
  - ./example/reads/Vf_S104_L002_R2_001.fastq.gz

Add samples for binning

These are the samples from which assembled contigs will be binned into putative draft genomes.

By default, bin all samples.


In [50]:
# binning_samples = ['Vf','Bs']
binning_samples = list(samples_dict.keys())

config_dict['binning_samples'] = binning_samples

Add samples for abundance profiling

These are the samples from which abundance information will be used for binning of binning_samples

By default, use all samples.


In [51]:
#abundance_samples = ['Vf','Bs']

# This line filters out BLANK* samples from abundance mapping
abundance_samples = [x for x in samples_dict if not x.startswith('BLANK')]

# This line uses all samples for abundance mapping
abundance_samples = list(samples_dict.keys())

config_dict['abundance_samples'] = abundance_samples

Add trimmer to use

These are the read trimmers to use for qc.

Currently, the available options are:

- atropos
- skewer

In [52]:
config_dict['trimmer'] = 'atropos'

Add assemblers to use

These are the assemblers to use for assembly.

Currently, the available options are:

- spades
- metaspades
- megahit

In [53]:
assemblers = ['megahit','metaspades']

config_dict['assemblers'] = assemblers

Add assembly to use for binning

This is the assembler to use for binning.

Currently, the available options are:

- spades
- metaspades
- megahit

In [54]:
config_dict['mapping_assembler'] = 'metaspades'

Add temporary directory

This is the directory used during execution of rules. Should ideally be local scratch storage.


In [55]:
tmp_dir_root = '/localscratch'

config_dict['tmp_dir_root'] = tmp_dir_root

Add environments

These are the conda environments to use for various rules.

env is the primary snakemake_assemble environment. Change others as necessary for other rules.


In [56]:
config_dict['env'] = 'source activate snakemake_assemble'
config_dict['anvi_env'] = 'source activate sn_anvio'
config_dict['metaphlan_env'] = 'source activate humann2'
config_dict['humann2_env'] = 'source activate humann2'
config_dict['kraken_env'] = 'source activate kraken'
config_dict['shogun_env'] = 'source activate shogun'

Add software paths

Some software binaries are specified directly.

Also, the installation folder for the snakemake_assemble repository is provided as snakemake_folder


In [57]:
software = {'snakemake_folder': '/home/jgsanders/git_sw/snakemake_assemble',
            'seqtk': 'seqtk'}

config_dict['software'] = software

anvio resources

Sets paths to resources for anvio


In [58]:
resources =  {'centrifuge_base': '/home/jgsanders/miniconda/envs/anvio2/centrifuge',
              'centrifuge_models': '/home/jgsanders/miniconda/envs/anvio2/centrifuge/b+h+v/b+h+v'}
config_dict['resources'] = resources

Add parameters definitions

These are parameters passed to the various tools used.


In [59]:
params = {}

Atropos params

Sets quality and adapter trimming parameters.

Some suggested defaults: Nextera: -a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT -q 15 --minimum-length 100 --pair-filter any KapaHyperPlus: -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any


In [60]:
params['atropos'] = ' -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any'

maxbin params

Sets binning parameters.


In [61]:
params['maxbin'] = '-plotmarker'

Kraken params

Sets parameters for Kraken taxonomic profiling.


In [62]:
params['kraken'] = {'kraken_db': '/home/qiz173/Databases/Kraken/stdb'}

SHOGUN params

Sets parameters for Shogun taxonomic profiling.


In [63]:
params['shogun'] = "--utree_indx /home/qiz173/Databases/SHOGUN/annotated/utree/stdb.ctr"

MetaPhlAn2 params

Sets parameters for MetaPhlAn2 taxonomic profilling.


In [64]:
params['metaphlan'] = {'metaphlan_dir': '/home/jgsanders/git_sw/metaphlan2'}

humann2

Sets various parameters for HUMAnN2 funcitonal profiling.

humann2_aa_db: Amino acid database for translated amino acid search humann2_nt_db: ChocoPhlAn database for nucleotide search metaphan_dir: path to metaphlan2 directory install, should also have the default metaphlan2 db norms: normalizations for which to output tables other: any other HUMAnN2 parameters to use


In [65]:
humann2 = {'humann2_aa_db': 'humann2_aa_db: /home/jgsanders/ref_data/humann2/uniref',
           'humann2_nt_db': '/home/jgsanders/ref_data/humann2/chocophlan',
           'metaphlan_dir': '/home/jgsanders/share/metaphlan2',
           'norms': ['cpm','relab'],
           'other': ''}

params['humann2'] = humann2

other

Various other legacy parameters that I need to fix


In [66]:
params['cram'] = ''
params['skewer'] = '-x ./adapters/HyperPlus.fa -n -l 100 -m any -q 15'

Step 3: format yaml

Now the entire configuration dictionary can be formatted as a yaml file and saved for use in the pipeline.


In [67]:
config_dict['params'] = params
config_dict['samples'] = samples_dict

Format as yaml and write to filepath:


In [68]:
config_yaml = yaml.dump(config_dict, default_flow_style = False)

with open('config.yaml', 'w') as f:
    f.write(config_yaml)

This is the complete config yaml:


In [69]:
print(config_yaml)


abundance_samples:
- Bs
- Vf
anvi_env: source activate sn_anvio
assemblers:
- megahit
- metaspades
binning_samples:
- Bs
- Vf
env: source activate snakemake_assemble
humann2_env: source activate humann2
kraken_env: source activate kraken
mapping_assembler: metaspades
metaphlan_env: source activate humann2
params:
  atropos: ' -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT
    -q 15 --minimum-length 100 --pair-filter any'
  cram: ''
  humann2:
    humann2_aa_db: 'humann2_aa_db: /home/jgsanders/ref_data/humann2/uniref'
    humann2_nt_db: /home/jgsanders/ref_data/humann2/chocophlan
    metaphlan_dir: /home/jgsanders/share/metaphlan2
    norms:
    - cpm
    - relab
    other: ''
  kraken:
    kraken_db: /home/qiz173/Databases/Kraken/stdb
  maxbin: -plotmarker
  metaphlan:
    metaphlan_dir: /home/jgsanders/git_sw/metaphlan2
  shogun: --utree_indx /home/qiz173/Databases/SHOGUN/annotated/utree/stdb.ctr
  skewer: -x ./adapters/HyperPlus.fa -n -l 100 -m any -q 15
resources:
  centrifuge_base: /home/jgsanders/miniconda/envs/anvio2/centrifuge
  centrifuge_models: /home/jgsanders/miniconda/envs/anvio2/centrifuge/b+h+v/b+h+v
samples:
  Bs:
    filter_db: /home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens
    forward:
    - ./example/reads/Bs_S106_L001_R1_001.fastq.gz
    - ./example/reads/Bs_S106_L002_R1_001.fastq.gz
    reverse:
    - ./example/reads/Bs_S106_L001_R2_001.fastq.gz
    - ./example/reads/Bs_S106_L002_R2_001.fastq.gz
  Vf:
    filter_db: /home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens
    forward:
    - ./example/reads/Vf_S104_L001_R1_001.fastq.gz
    - ./example/reads/Vf_S104_L002_R1_001.fastq.gz
    reverse:
    - ./example/reads/Vf_S104_L001_R2_001.fastq.gz
    - ./example/reads/Vf_S104_L002_R2_001.fastq.gz
shogun_env: source activate shogun
software:
  seqtk: seqtk
  snakemake_folder: /home/jgsanders/git_sw/snakemake_assemble
tmp_dir_root: /localscratch
trimmer: atropos


In [ ]:


In [ ]: