In [7]:
import glob
import os
import yaml
import re
import copy
from io import StringIO
import pandas as pd
import numpy as np
In [2]:
def open_sample_sheet(sample_sheet_fp, lanes=None, skiprows=18):
    """Read an IGM sample sheet and return a pandas DataFrame with the primary data table"""
    sample_sheet = pd.read_excel(sample_sheet_fp, skiprows=skiprows, header=1)
    # Strip whitespace from column names and homogenize case
    sample_sheet.columns = [x.strip().lower() for x in sample_sheet.columns]
    if lanes:
        sample_sheet = sample_sheet.loc[sample_sheet['lane'].isin(lanes)]
    if 'sample name' not in sample_sheet.columns and 'sample_id' in sample_sheet.columns:
        sample_sheet['sample name'] = sample_sheet['sample_id']
    return sample_sheet
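A minimal usage sketch, assuming a real IGM sample sheet at a hypothetical path (the filepath and lane values here are illustrative only):
In [ ]:
# Hypothetical filepath for illustration; lanes assumes an integer 'lane' column
ss = open_sample_sheet('./example/igm_sample_sheet.xlsx', lanes=[1, 2])
ss.head()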
In [3]:
def open_sequencing_manifest(manifest_fp, lanes=None, sheet_name='Sample Information', skiprows=18):
    """Read an IGM sequencing manifest and return a pandas DataFrame with the primary data table"""
    sample_sheet = pd.read_excel(manifest_fp, sheet_name=sheet_name, skiprows=skiprows, header=1)
    # Strip whitespace from column names and homogenize case
    sample_sheet.columns = [x.strip().lower() for x in sample_sheet.columns]
    if lanes:
        sample_sheet = sample_sheet.loc[sample_sheet['lane'].isin(lanes)]
    return sample_sheet
In [4]:
def get_read(sample, seq_dir, read):
    """Pull a given read based on sample name from the reads directory"""
    reads = glob.glob(os.path.join(seq_dir, "{0}_*_{1}_*.fastq.gz".format(sample, read)))
    if len(reads) == 1:
        return reads[0]
    elif len(reads) > 1:
        raise ValueError('Too many reads found for {0} in {1}:\n'
                         'read_str: {2}'.format(sample, seq_dir, reads))
    else:
        raise ValueError('No reads found for {0} in {1}'.format(sample, seq_dir))
In [5]:
def make_sample_dict(sample_sheet, seq_dir,
forward = 'R1',
reverse = 'R2',
adaptor = '$CONDA_ENV_PATH/share/trimmomatic-*/adapters/TruSeq3-PE-2.fa',
phred = 'phred33',
sample_header = 'Sample_Prefix',
sample_name = 'Sample',
tolerate_losses = True):
samples_pe = {'samples_pe': {}}
for x in sample_sheet.index:
try:
samples_pe['samples_pe'][sample_sheet.loc[x, sample_name]] = {
'forward': get_read(sample_sheet.loc[x, sample_header], seq_dir, forward),
'reverse': get_read(sample_sheet.loc[x, sample_header], seq_dir, reverse),
'adaptor': adaptor,
'phred': phred
}
        except ValueError:
            if tolerate_losses:
                print('No sequences found for %s' % sample_sheet.loc[x, sample_name])
            else:
                raise  # re-raise the original error with its message intact
return(samples_pe)
In [6]:
def format_yaml(RUN, samples_pe,
TMP_DIR_ROOT = '/localscratch',
host_db = '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
envs = {'KNEAD_ENV': 'source activate kneaddata',
'BOWTIE_ENV': 'source activate kneaddata',
'QC_ENV': 'source activate kneaddata',
'TRIM_ENV': 'source activate kneaddata',
'HUMANN2_ENV': 'source activate kneaddata',
'METAPHLAN_ENV': 'source activate kneaddata'},
software = {'gzip': 'gzip',
'trimmomatic': 'trimmomatic',
'mash': '/home/jgsanders/git_sw/git_bin/mash-Linux64-v1.1.1/mash',
'seqtk': '/home/jgsanders/git_sw/seqtk/seqtk'},
params = {'TRIMMOMATIC': {'QUAL': 'LEADING:20 TRAILING:20 AVGQUAL:30 MINLEN:32 TOPHRED33',
'ILLUMINACLIP': '2:30:10'},
'MASH': {'REFSEQ_DB': '/home/jgsanders/ref_data/mash/RefSeqSketchesDefaults.msh',
'OTHER': '-r -m 2 -k 21 -s 1000'},
'HUMANN2': {'METAPHLAN_DIR': '/home/jgsanders/share/metaphlan2',
'HUMANN2_NT_DB': '/home/jgsanders/ref_data/humann2/chocophlan',
'HUMANN2_AA_DB': '/home/jgsanders/ref_data/humann2/uniref',
'NORMS': ['cpm','relab'],
'OTHER': ''}},
default_flow_style = False):
config_str = ''
config_str += yaml.dump({'TMP_DIR_ROOT': TMP_DIR_ROOT}, default_flow_style = default_flow_style)
config_str += yaml.dump({'RUN': RUN}, default_flow_style = default_flow_style)
config_str += yaml.dump({'HOST_DB': host_db}, default_flow_style = default_flow_style)
config_str += yaml.dump({'ENVS': envs}, default_flow_style = default_flow_style)
config_str += yaml.dump({'SOFTWARE': software}, default_flow_style = default_flow_style)
config_str += yaml.dump({'PARAMS': params}, default_flow_style = default_flow_style)
config_str += yaml.dump(samples_pe, default_flow_style = default_flow_style)
return(config_str)
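Putting the helpers together, an end-to-end sketch; the filepaths and column names below are hypothetical (note that open_sample_sheet lowercases column names, so the defaults 'Sample_Prefix'/'Sample' must be overridden accordingly):
In [ ]:
# Hypothetical inputs for illustration
sample_sheet = open_sample_sheet('./example/igm_sample_sheet.xlsx')
samples_pe = make_sample_dict(sample_sheet, './example/reads',
                              sample_header='sample_prefix',
                              sample_name='sample name')
print(format_yaml('Run001', samples_pe))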
In [19]:
fps = ['Bs_S106_L001_R1_001.fastq.gz',
'Bs_S106_L001_R2_001.fastq.gz',
'Bs_S106_L002_R1_001.fastq.gz',
'Bs_S106_L002_R2_001.fastq.gz',
'Vf_S104_L001_R1_001.fastq.gz',
'Vf_S104_L001_R2_001.fastq.gz',
'Vf_S104_L002_R1_001.fastq.gz',
'Vf_S104_L002_R2_001.fastq.gz']
exp_df = pd.DataFrame.from_dict({'Extension': {0: 'fastq.gz',
1: 'fastq.gz',
2: 'fastq.gz',
3: 'fastq.gz',
4: 'fastq.gz',
5: 'fastq.gz',
6: 'fastq.gz',
7: 'fastq.gz'},
'File': {0: 'Bs_S106_L001_R1_001.fastq.gz',
1: 'Bs_S106_L001_R2_001.fastq.gz',
2: 'Bs_S106_L002_R1_001.fastq.gz',
3: 'Bs_S106_L002_R2_001.fastq.gz',
4: 'Vf_S104_L001_R1_001.fastq.gz',
5: 'Vf_S104_L001_R2_001.fastq.gz',
6: 'Vf_S104_L002_R1_001.fastq.gz',
7: 'Vf_S104_L002_R2_001.fastq.gz'},
'Index': {0: 'S106',
1: 'S106',
2: 'S106',
3: 'S106',
4: 'S104',
5: 'S104',
6: 'S104',
7: 'S104'},
'Lane': {0: 'L001',
1: 'L001',
2: 'L002',
3: 'L002',
4: 'L001',
5: 'L001',
6: 'L002',
7: 'L002'},
'Read': {0: 'R1',
1: 'R2',
2: 'R1',
3: 'R2',
4: 'R1',
5: 'R2',
6: 'R1',
7: 'R2'},
'Run': {0: '001',
1: '001',
2: '001',
3: '001',
4: '001',
5: '001',
6: '001',
7: '001'},
'Sample': {0: 'Bs',
1: 'Bs',
2: 'Bs',
3: 'Bs',
4: 'Vf',
5: 'Vf',
6: 'Vf',
7: 'Vf'}})
def parse_ilm_fps_to_df(fps,
                        pattern = r'^((.+?)_(S\d+)_(L\d+)_(R[12])_(\d+)\.(.+))$',
                        pattern_names = ['File','Sample','Index','Lane','Read','Run','Extension']):
    p = re.compile(pattern)
    # Collect matches as dicts and build the DataFrame once
    # (DataFrame.append was removed in pandas 2.0)
    rows = [dict(zip(pattern_names, m.groups()))
            for m in (p.match(f) for f in fps) if m]
    return pd.DataFrame(rows, columns = pattern_names)
obs_df = parse_ilm_fps_to_df(fps)
pd.testing.assert_frame_equal(obs_df.sort_index(axis=1), exp_df.sort_index(axis=1))
df = exp_df
seq_dir = './example/reads'
exp_dict = {'Bs': {'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
'./example/reads/Bs_S106_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
'./example/reads/Bs_S106_L002_R2_001.fastq.gz']},
'Vf': {'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
'./example/reads/Vf_S104_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
'./example/reads/Vf_S104_L002_R2_001.fastq.gz']}}
def get_sample_reads_df(df, seq_dir):
sample_reads_dict = {}
samples = list(set(df['Sample']))
for s in samples:
f_fps = sorted(list(df.loc[(df['Sample'] == s) & (df['Read'] == 'R1'),'File'].values))
r_fps = sorted(list(df.loc[(df['Sample'] == s) & (df['Read'] == 'R2'),'File'].values))
sample_reads_dict[s] = {'forward': [os.path.join(seq_dir, x) for x in f_fps],
'reverse': [os.path.join(seq_dir, x) for x in r_fps]}
return(sample_reads_dict)
obs_dict = get_sample_reads_df(df, seq_dir)
assert(exp_dict == obs_dict)
def get_sample_paths(seq_dir):
fps = os.listdir(seq_dir)
files_df = parse_ilm_fps_to_df(fps)
sample_reads_dict = get_sample_reads_df(files_df, seq_dir)
return(sample_reads_dict)
exp_db_dict = {'Bs': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
'./example/reads/Bs_S106_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
'./example/reads/Bs_S106_L002_R2_001.fastq.gz']},
'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
'./example/reads/Vf_S104_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
'./example/reads/Vf_S104_L002_R2_001.fastq.gz']}}
exp_db_dict_update = {'Bs': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
'./example/reads/Bs_S106_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
'./example/reads/Bs_S106_L002_R2_001.fastq.gz']},
'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/mouse/mouse',
'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
'./example/reads/Vf_S104_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
'./example/reads/Vf_S104_L002_R2_001.fastq.gz']}}
exp_db_dict_partial = {'Bs': {'filter_db': None,
'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
'./example/reads/Bs_S106_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
'./example/reads/Bs_S106_L002_R2_001.fastq.gz']},
'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/mouse/mouse',
'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
'./example/reads/Vf_S104_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
'./example/reads/Vf_S104_L002_R2_001.fastq.gz']}}
def add_filter_db(sample_fp_dict, db_fp, samples = None):
if samples is None:
samples = sample_fp_dict.keys()
samples_dict = copy.deepcopy(sample_fp_dict)
for s in samples_dict:
if s in samples:
samples_dict[s]['filter_db'] = db_fp
elif 'filter_db' in samples_dict[s]:
continue
else:
samples_dict[s]['filter_db'] = None
return(samples_dict)
obs_dict = add_filter_db(exp_dict, '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens')
assert(obs_dict == exp_db_dict)
obs_dict = add_filter_db(exp_db_dict, '/home/jgsanders/ref_data/genomes/mouse/mouse', samples = ['Vf'])
assert(obs_dict == exp_db_dict_update)
obs_dict = add_filter_db(exp_dict, '/home/jgsanders/ref_data/genomes/mouse/mouse', samples = ['Vf'])
assert(obs_dict == exp_db_dict_partial)
Steps:
First, check that the reads directory contains the expected files:
In [44]:
seq_dir = './example/reads'
!ls -l {seq_dir} | head
If that looks correct, execute the following cell. It should produce a dictionary associating samples with filepaths.
To validate, the dictionary is converted to YAML and printed below:
In [45]:
sample_paths = get_sample_paths(seq_dir)
print(yaml.dump(sample_paths, default_flow_style = False))
In [30]:
# read in sample sheet
def read_sample_sheet(f, sep='\t'):
data = False
data_lines = ''
for line in f:
if data:
if line.startswith('[') or line.strip() == '':
data = False
continue
data_lines += line
elif line.startswith('[Data]'):
data = True
continue
else:
continue
    data_df = pd.read_csv(StringIO(data_lines), sep=sep, comment='#')
return(data_df)
with open('./example/example_sample_sheet.txt', 'r') as f:
ss_df = read_sample_sheet(f)
# column for read prefixes
# column for sample names
# aggregate into dict
ss_df.head()
Out[30]:
In [31]:
seq_dir = './example/reads/'
fps = os.listdir(seq_dir)
files_df = parse_ilm_fps_to_df(fps)
files_df
Out[31]:
In [47]:
def get_sample_reads_df_from_sample_sheet(ss_df, seq_dir, sample_col='Description', prefix_col='Sample_ID'):
sample_reads_dict = {}
samples = list(set(ss_df[sample_col]))
fps = os.listdir(seq_dir)
files_df = parse_ilm_fps_to_df(fps)
for s in samples:
# get the subset of sample sheet rows for that sample
        sample_rows = ss_df.loc[ss_df[sample_col] == s, :]
f_fps = []
r_fps = []
# iter across subset table and find file corresponding to each row
for idx, row in sample_rows.iterrows():
lane = 'L{0:03d}'.format(row['Lane'])
            f_fps.extend(files_df.loc[(files_df['Sample'] == row[prefix_col]) &
                                      (files_df['Lane'] == lane) &
                                      (files_df['Read'] == 'R1'), 'File'].values)
            r_fps.extend(files_df.loc[(files_df['Sample'] == row[prefix_col]) &
                                      (files_df['Lane'] == lane) &
                                      (files_df['Read'] == 'R2'), 'File'].values)
f_fps = sorted(f_fps)
r_fps = sorted(r_fps)
sample_reads_dict[s] = {'forward': [os.path.join(seq_dir, x) for x in f_fps],
'reverse': [os.path.join(seq_dir, x) for x in r_fps]}
return(sample_reads_dict)
sample_reads_dict = get_sample_reads_df_from_sample_sheet(ss_df, seq_dir, sample_col='Description', prefix_col='Sample_ID')
print(yaml.dump(sample_reads_dict, default_flow_style = False))
The remaining cells assemble the following config entries (a sketch of the resulting structure follows this list):
- filter db per sample
- samples for binning
- samples for abundance profiling
- tmp dir
- envs
- software
- assembler
- trimmer
- params
  * atropos
  * maxbin
  * humann2
  * metaphlan
  * kraken
  * shogun?
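A minimal sketch of the structure those cells build up; every value below is a placeholder, not a real path or setting:
In [ ]:
# Placeholder values only -- the cells below fill in the real ones
config_sketch = {
    'samples': {},                 # per-sample fastq paths plus filter_db
    'binning_samples': [],         # samples to include in binning
    'abundance_samples': [],       # samples for abundance profiling
    'tmp_dir_root': '/tmp',
    'env': 'source activate <env>',
    'software': {},
    'assemblers': ['megahit'],
    'trimmer': 'atropos',
    'params': {},                  # atropos, maxbin, humann2, metaphlan, kraken, shogun
}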
In [48]:
config_dict = {}
These are the filepaths to the databases used for host filtering.
The add_filter_db function adds the provided filepath to the sample dictionary you created above.
By default, all samples get the same database.
To set different filter dbs for some samples, re-execute add_filter_db with a list of specific sample names to update; existing filter_db paths for samples not in that list remain unchanged.
Example:
filter_db1 = '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens'
filter_db2 = '/home/jgsanders/ref_data/genomes/mouse/mouse'
samples_dict = add_filter_db(sample_paths, filter_db1)
samples_dict = add_filter_db(samples_dict, filter_db2, samples = ['Vf'])
In [49]:
filter_db = '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens'
samples_dict = add_filter_db(sample_paths, filter_db)
print(yaml.dump(samples_dict, default_flow_style = False))
In [50]:
# binning_samples = ['Vf','Bs']
binning_samples = list(samples_dict.keys())
config_dict['binning_samples'] = binning_samples
In [51]:
#abundance_samples = ['Vf','Bs']
# Keep whichever of the next two assignments you want; the second overrides the first.
# This line filters out BLANK* samples from abundance mapping
abundance_samples = [x for x in samples_dict if not x.startswith('BLANK')]
# This line uses all samples for abundance mapping
abundance_samples = list(samples_dict.keys())
config_dict['abundance_samples'] = abundance_samples
In [52]:
config_dict['trimmer'] = 'atropos'
In [53]:
assemblers = ['megahit','metaspades']
config_dict['assemblers'] = assemblers
In [54]:
config_dict['mapping_assembler'] = 'metaspades'
In [55]:
tmp_dir_root = '/localscratch'
config_dict['tmp_dir_root'] = tmp_dir_root
In [56]:
config_dict['env'] = 'source activate snakemake_assemble'
config_dict['anvi_env'] = 'source activate sn_anvio'
config_dict['metaphlan_env'] = 'source activate humann2'
config_dict['humann2_env'] = 'source activate humann2'
config_dict['kraken_env'] = 'source activate kraken'
config_dict['shogun_env'] = 'source activate shogun'
In [57]:
software = {'snakemake_folder': '/home/jgsanders/git_sw/snakemake_assemble',
'seqtk': 'seqtk'}
config_dict['software'] = software
In [58]:
resources = {'centrifuge_base': '/home/jgsanders/miniconda/envs/anvio2/centrifuge',
'centrifuge_models': '/home/jgsanders/miniconda/envs/anvio2/centrifuge/b+h+v/b+h+v'}
config_dict['resources'] = resources
In [59]:
params = {}
Sets quality and adapter trimming parameters.
Some suggested defaults:
Nextera: -a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT -q 15 --minimum-length 100 --pair-filter any
KapaHyperPlus: -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any
In [60]:
params['atropos'] = ' -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any'
In [61]:
params['maxbin'] = '-plotmarker'
In [62]:
params['kraken'] = {'kraken_db': '/home/qiz173/Databases/Kraken/stdb'}
In [63]:
params['shogun'] = "--utree_indx /home/qiz173/Databases/SHOGUN/annotated/utree/stdb.ctr"
In [64]:
params['metaphlan'] = {'metaphlan_dir': '/home/jgsanders/git_sw/metaphlan2'}
Sets various parameters for HUMAnN2 functional profiling.
humann2_aa_db
: amino acid database for translated search
humann2_nt_db
: ChocoPhlAn database for nucleotide search
metaphlan_dir
: path to the MetaPhlAn2 install directory, which should also contain the default MetaPhlAn2 db
norms
: normalizations for which to output tables
other
: any other HUMAnN2 parameters to use
In [65]:
humann2 = {'humann2_aa_db': '/home/jgsanders/ref_data/humann2/uniref',
'humann2_nt_db': '/home/jgsanders/ref_data/humann2/chocophlan',
'metaphlan_dir': '/home/jgsanders/share/metaphlan2',
'norms': ['cpm','relab'],
'other': ''}
params['humann2'] = humann2
In [66]:
params['cram'] = ''
params['skewer'] = '-x ./adapters/HyperPlus.fa -n -l 100 -m any -q 15'
In [67]:
config_dict['params'] = params
config_dict['samples'] = samples_dict
Format as YAML and write to a filepath:
In [68]:
config_yaml = yaml.dump(config_dict, default_flow_style = False)
with open('config.yaml', 'w') as f:
f.write(config_yaml)
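As an optional sanity check, the file on disk can be loaded back with yaml.safe_load to confirm it round-trips to the same dictionary:
In [ ]:
# Optional sanity check: the written file should parse back to the same dict
with open('config.yaml') as f:
    assert yaml.safe_load(f) == config_dict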
This is the complete config YAML:
In [69]:
print(config_yaml)