Table of Contents


In [13]:
import os
import pandas as pd
# Run-wide parameters shared by every generated kallisto command.
# Root folder that walk_seq_directories() scans for sequencing projects.
directory = '../input/rawseq'
# Value for kallisto's -l option (estimated average fragment length).
length = 180
# Value for kallisto's -s option (estimated fragment-length standard deviation).
sigma = 60
# Value for kallisto's -b option (number of bootstrap samples).
btstrp = 200
# Value for kallisto's -t option (number of threads).
thrds = 6

# Names of the immediate subdirectories of `directory` (the second element of
# the first tuple yielded by os.walk). next() raises StopIteration when
# `directory` does not exist, because os.walk then yields nothing.
# Not referenced again in the code shown here.
seqs = next(os.walk(directory))[1]

In [14]:
# Run-wide parameters shared by every generated kallisto command.
# NOTE(review): this cell repeats the assignments of the previous notebook
# cell verbatim; keep the two in sync or drop one of them.
directory = '../input/rawseq'
# Value for kallisto's -l option (estimated average fragment length).
length = 180
# Value for kallisto's -s option (estimated fragment-length standard deviation).
sigma = 60
# Value for kallisto's -b option (number of bootstrap samples).
btstrp = 200
# Value for kallisto's -t option (number of threads).
thrds = 6

# Names of the immediate subdirectories of `directory` (the second element of
# the first tuple yielded by os.walk).
seqs = next(os.walk(directory))[1]

In [15]:
def explicit_kallisto(directory, files, res_dir):
    """
    Placeholder for projects that ship their own Kallisto_Info file.

    TODO: Make a function that allows you to systematically
    set up each parameter for each sequencing run individually.

    Parameters
    ----------
    directory : str
        Path of the project folder (only validated, not used yet).
    files : list
        File names found in that folder (only validated, not used yet).
    res_dir : str
        Project name; inserted into the returned message.

    Returns
    -------
    tuple of (str, str)
        A '#'-prefixed note saying the project could not be processed,
        followed by an empty command string.

    Raises
    ------
    ValueError
        If `directory` is not a str or `files` is not a list.
    """
    if not isinstance(directory, str):
        raise ValueError('directory must be a str')
    if not isinstance(files, list):
        raise ValueError('files must be a list')

    # Adjacent string literals rather than a backslash continuation inside
    # the literal: the continuation leaked the source indentation into the
    # printed message as a long run of spaces.
    print('This sequence file contains a Kallisto_Info file '
          'and cannot be processed at the moment.')
    return '# {0} could not be processed'.format(res_dir), ''
    
def implicit_kallisto(directory, files, res_dir):
    """
    Write a single-end `kallisto quant` command with the standard
    parameter setup.

    The values for -s, -l, -b and -t are read from the module-level names
    sigma, length, btstrp and thrds.

    Parameters
    ----------
    directory : str
        Path of the project folder. A leading '../' is removed before the
        path is used in the command (presumably so the script can be run
        from one level above the notebook -- confirm against how the
        generated script is launched).
    files : list
        File names inside `directory`. Every entry except
        'SampleSheet.csv' is appended to the command as an input file.
    res_dir : str
        Project name; the output goes to input/kallisto_all/<res_dir>/kallisto.

    Returns
    -------
    tuple of (str, str)
        A '#'-prefixed description line and the kallisto command,
        terminated by ';'.

    Raises
    ------
    ValueError
        If `directory` is not a str or `files` is not a list.
    """
    if not isinstance(directory, str):
        raise ValueError('directory must be a str')
    if not isinstance(files, list):
        raise ValueError('files must be a list')

    # information
    info = '# kallisto command for {0}'.format(directory)
    # transcript file location:
    k_head = 'kallisto quant -i input/transcripts.idx -o '
    # output file location
    k_output = 'input/kallisto_all/' + res_dir + '/kallisto '
    # parameter info:
    k_params = '--single -s {0} -l {1} -b {2} -t {3} --bias --fusion'.format(
        sigma, length, btstrp, thrds)

    # The prefix check does not depend on the individual file, so do it once
    # instead of once per file.
    d = directory[3:] if directory.startswith('../') else directory

    # what files to use: everything except the sample sheet
    k_files = ''.join(
        ' ' + d + '/' + y for y in files if y != 'SampleSheet.csv')

    # all together now:
    kallisto = k_head + k_output + k_params + k_files + ';'
    return info, kallisto

In [19]:
def walk_seq_directories(directory):
    """
    Given a directory, walk through it,
    find all the rna-seq repository folders
    and generate kallisto commands.

    Every folder below `directory` (at any depth) is treated as a project.
    A project containing 'Kallisto_Info.csv' is handed to
    explicit_kallisto and only its explanatory comment is recorded; every
    other project gets a command from implicit_kallisto and its result
    folder '../input/kallisto_all/<project>' is created if missing.

    Parameters
    ----------
    directory : str
        Root folder holding the projects. A trailing path separator is
        tolerated.

    Returns
    -------
    str
        Newline-terminated comment and command lines, ready to be written
        into a shell script. Empty when `directory` has no subfolders.
    """
    lines = []
    # directory contains all the projects, walk through it:
    for path, _subdirs, filenames in os.walk(directory):
        # first directory is always the parent itself; nothing to do there
        if path == directory:
            continue

        # Project name relative to the root. relpath instead of slicing by
        # len(directory) + 1, which dropped the first character of the name
        # whenever `directory` ended with a path separator.
        res_dir = os.path.relpath(path, directory)

        # if this project has attributes explicitly written in
        # use those parameter specs:
        if 'Kallisto_Info.csv' in filenames:
            info, _ = explicit_kallisto(path, filenames, res_dir)
            # Keep the "could not be processed" comment in the output so
            # the skipped project is visible in the generated script.
            lines.append(info + '\n')
            continue

        # otherwise, best guesses:
        info, command = implicit_kallisto(path, filenames, res_dir)
        lines.append(info + '\n' + command + '\n')

        out_dir = '../input/kallisto_all/' + res_dir
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
    return ''.join(lines)

# Write the shell script one level above the notebook: a fixed header that
# builds the transcript index, followed by one command per project.
with open('../kallisto_commands.sh', 'w') as f:
    header = (
        '#!/bin/bash\n'
        '# make transcript index\n'
        'kallisto index -i input/transcripts.idx input/c_elegans_WBcel235.rel79.cdna.all.fa;\n'
    )
    f.write(header)
    # The walk runs after the header is written, as before.
    kallisto = walk_seq_directories(directory)
    f.write(kallisto)

In [ ]:


In [ ]: