NGS Pipeline Using CFNCluster On AWS

Author: Guorong Xu

2016-06-21

This notebook is an example that shows how to configure your project information and run the whole-genome sequencing, RNA sequencing, ChIP sequencing, or miRNA sequencing pipelines on AWS.

Notice: Please open the notebook under /notebooks/awsCluster/BasicCFNClusterSetup.ipynb to install the CFNCluster package on your Jupyter notebook server before running this notebook.

1. Configure AWS key pair, data location on S3 and the project information


In [ ]:
import os
import sys

## Make the project's "src/awsCluster" directory importable: take the current
## working directory, swap "notebooks/awsCluster" for "src/awsCluster", and
## add the result to the module search path.
source_dir = os.getcwd().replace("notebooks/awsCluster", "src/awsCluster")
sys.path.append(source_dir)

from util import DesignFileLoader

## S3 locations of the input files and of the pipeline output.
s3_input_files_address = "s3://path/to/s3_input_files_address"
s3_output_files_address = "s3://path/to/s3_output_files_address"

## Name of the CFNCluster.
your_cluster_name = "cluster_name"

## Path to the private key (.pem) file used for accessing the cluster.
private_key = "/path/to/aws_private_key.pem"

## Project information
project_name = "project_name"

## The use case number. It must be given as a string, because the cells
## below compare it against string values.
# use_case = "1": whole-genome sequencing pipeline
# use_case = "2.1": the workflow "star_htseq_workflow" of the RNA sequencing pipeline
# use_case = "2.2": the workflow "kallisto_deseq_workflow" of the RNA sequencing pipeline
# use_case = "2.3": the workflow "star_gatk_workflow" of the RNA sequencing pipeline
# use_case = "3": ChIP sequencing pipeline
# use_case = "4": miRNA sequencing pipeline
use_case = "1"

## Whether to delete the CFNCluster after the job is done (True or False).
delete_cfncluster = False

2. Create CFNCluster

Notice: The CFNCluster package can only be installed on a Linux box that supports pip installation.


In [ ]:
from cfnCluster import CFNClusterManager, ConnectionManager

## Create a new cluster with the configured name; the returned value is used
## below as the host name of the master node.
master_ip_address = CFNClusterManager.create_cfn_cluster(
    cluster_name=your_cluster_name)

## Connect to the master node as "ec2-user" with the configured private key.
ssh_client = ConnectionManager.connect_master(
    hostname=master_ip_address,
    username="ec2-user",
    private_key_file=private_key)

After you have verified the project information, you can execute the pipeline. When the job is done, you will see the log information returned from the cluster.

Use case 1: Run the Whole-genome sequencing pipeline


In [ ]:
from dnaSeq import WGSPipelineManager

if use_case == "1":
    ## Steps to run. The analysis steps include:
    ## "fastqc", "bwa-alignment", "post-alignment", "gatk-haplotype"
    analysis_steps = ["fastqc", "bwa-alignment"]

    ## Build the path of the example DNA-seq design file from the current
    ## working directory, then load it with DesignFileLoader.load_design_file.
    design_file = os.getcwd().replace(
        "notebooks/awsCluster",
        "data/awsCluster/dnaseq_design_example.txt")
    sample_list, group_list = DesignFileLoader.load_design_file(design_file)

    ## Run the whole-genome sequencing pipeline.
    WGSPipelineManager.execute(
        ssh_client,
        project_name,
        analysis_steps,
        s3_input_files_address,
        sample_list,
        group_list,
        s3_output_files_address)

Use case 2.1: Run the workflow "star_htseq_workflow" of the RNA sequencing pipeline


In [ ]:
from rnaSeq import RNAPipelineManager

if use_case == "2.1":
    ## Workflow selected by this use case.
    workflow = "star_htseq_workflow"

    ## Steps to run; listed for this workflow: "fastqc", "alignment", "counting"
    analysis_steps = ["fastqc", "alignment", "counting"]

    ## Build the path of the example RNA-seq design file from the current
    ## working directory, then load it with DesignFileLoader.load_design_file.
    design_file = os.getcwd().replace(
        "notebooks/awsCluster",
        "data/awsCluster/rnaseq_design_example.txt")
    sample_list, group_list = DesignFileLoader.load_design_file(design_file)

    ## Run the RNA sequencing pipeline.
    RNAPipelineManager.run_analysis(
        ssh_client,
        workflow,
        project_name,
        analysis_steps,
        s3_input_files_address,
        sample_list,
        group_list,
        s3_output_files_address)

Use case 2.2: Run the workflow "kallisto_deseq_workflow" of the RNA sequencing pipeline


In [ ]:
from rnaSeq import RNAPipelineManager

if use_case == "2.2":
    ## Workflow selected by this use case.
    workflow = "kallisto_deseq_workflow"

    ## Steps listed for this workflow:
    ## "fastqc", "alignment", "counting", "differential_calculation"
    ## Only the steps placed in analysis_steps are requested.
    analysis_steps = ["fastqc"]

    ## Build the path of the example RNA-seq design file from the current
    ## working directory, then load it with DesignFileLoader.load_design_file.
    design_file = os.getcwd().replace(
        "notebooks/awsCluster",
        "data/awsCluster/rnaseq_design_example.txt")
    sample_list, group_list = DesignFileLoader.load_design_file(design_file)

    ## Run the RNA sequencing pipeline.
    RNAPipelineManager.run_analysis(
        ssh_client,
        workflow,
        project_name,
        analysis_steps,
        s3_input_files_address,
        sample_list,
        group_list,
        s3_output_files_address)

Use case 2.3: Run the workflow "star_gatk_workflow" of the RNA sequencing pipeline


In [ ]:
from rnaSeq import RNAPipelineManager

if use_case == "2.3":
    ## Workflow selected by this use case.
    workflow = "star_gatk_workflow"

    ## Steps listed for this workflow: "fastqc", "alignment", "variant_calling"
    ## Only the steps placed in analysis_steps are requested.
    analysis_steps = ["fastqc"]

    ## Build the path of the example variant-calling design file from the
    ## current working directory, then load it with
    ## DesignFileLoader.load_design_file.
    design_file = os.getcwd().replace(
        "notebooks/awsCluster",
        "data/awsCluster/rnaseq_vc_design_example.txt")
    sample_list, group_list = DesignFileLoader.load_design_file(design_file)

    ## Run the RNA sequencing pipeline.
    RNAPipelineManager.run_analysis(
        ssh_client,
        workflow,
        project_name,
        analysis_steps,
        s3_input_files_address,
        sample_list,
        group_list,
        s3_output_files_address)

Use case 3: Run the ChIP sequencing pipeline


In [ ]:
from chipSeq import ChipPipelineManager

if use_case == "3":
    ## Workflow selected by this use case.
    workflow = "homer_workflow"

    ## Steps listed for this workflow:
    ## "fastqc", "alignment", "make_tag_directory", "make_UCSC_file",
    ## "find_peaks", "annotate_peaks", "pos2bed", "find_motifs_genome"
    ## Only the steps placed in analysis_steps are requested.
    analysis_steps = ["annotate_peaks", "pos2bed", "find_motifs_genome"]

    ## The style option can be either "factor" or "histone".
    style = "histone"

    ## Currently available genomes: hg18, hg19, mm9, mm10
    genome = "hg19"

    ## Build the path of the example ChIP-seq design file from the current
    ## working directory, then load it with
    ## DesignFileLoader.load_chipseq_design_file.
    design_file = os.getcwd().replace(
        "notebooks/awsCluster",
        "data/awsCluster/chipSeq_design_example.txt")
    sample_list, group_list = DesignFileLoader.load_chipseq_design_file(design_file)

    ## Run the ChIP sequencing pipeline.
    ChipPipelineManager.run_analysis(
        ssh_client,
        workflow,
        project_name,
        analysis_steps,
        s3_input_files_address,
        sample_list,
        group_list,
        style,
        genome,
        s3_output_files_address)

Use case 4: Run the miRNA sequencing pipeline


In [ ]:
from miRNASeq import SmallRNAPipelineManager

if use_case == "4":
    ## Steps to run; listed for this pipeline:
    ## "fastqc", "bowtie-alignment", "counting"
    analysis_steps = ["fastqc", "bowtie-alignment", "counting"]

    ## Build the path of the example small-RNA design file from the current
    ## working directory, then load it with DesignFileLoader.load_design_file.
    design_file = os.getcwd().replace(
        "notebooks/awsCluster",
        "data/awsCluster/smallrnaseq_design_example.txt")
    sample_list, group_list = DesignFileLoader.load_design_file(design_file)

    ## Run the miRNA sequencing pipeline.
    SmallRNAPipelineManager.execute(
        ssh_client,
        project_name,
        analysis_steps,
        s3_input_files_address,
        sample_list,
        group_list,
        s3_output_files_address)

To check the processing status


In [ ]:
from miRNASeq import SmallRNAPipelineManager
from chipSeq import ChipPipelineManager
from dnaSeq import WGSPipelineManager
from rnaSeq import RNAPipelineManager

## Ask the pipeline manager that matches the selected use case for its
## processing status. Every use case starting with "2" (the RNA workflows)
## goes through RNAPipelineManager.
if use_case == "1":
    WGSPipelineManager.check_processing_status(ssh_client)
elif use_case.startswith("2"):
    RNAPipelineManager.check_processing_status(ssh_client)
elif use_case == "3":
    ChipPipelineManager.check_processing_status(ssh_client)
elif use_case == "4":
    SmallRNAPipelineManager.check_processing_status(ssh_client)

To delete the cluster, you just need to set the cluster name and call the function below. Note that the cell only deletes the cluster when delete_cfncluster is set to True.


In [ ]:
from cfnCluster import CFNClusterManager

## Delete the cluster only when deletion was requested in the configuration.
## delete_cfncluster is expected to be a real bool (True or False): any truthy
## value, including a non-empty string, triggers the deletion.
if delete_cfncluster:
    CFNClusterManager.delete_cfn_cluster(cluster_name=your_cluster_name)

TODO: Use the configuration builder to add option settings for each module.


In [ ]:
from rnaSeq import ConfigureBuilder

## Each option list holds [option name, value] pairs, with both entries
## written as strings.

## Star options
star_option_list = [
    ["runThreadN", "1"],
    ["clip3pNbases", "0"],
    ["outFileNamePrefix", "mysam"],
    ["outReadsUnmapped", "None"],
]
ConfigureBuilder.configure_star(star_option_list)

## Kallisto options
kallisto_option_list = [
    ["fragment-length", "50"],
    ["bootstrap-samples", "0"],
    ["seed", "42"],
]
ConfigureBuilder.configure_kallisto(kallisto_option_list)