In [18]:
import os
import sys
import synapseclient
import shutil
from synapseclient import File, Project, Wiki, Folder
import tempfile
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
%run 'synapse_demo_utils_functions.ipynb'
In [19]:
syn = synapseclient.login('synapsedemo','demo')
The subsampled raw dataset have been stored in a public synapse project. The following are the direct links to unique synapse id for the fastq data. This would serv
In [ ]:
brain_reads = 'syn2468554'
adrenal_reads = 'syn2468552'
#lets download the fastq data
brain_reads = syn.get(brain_reads)
adrenal_reads = syn.get(adrenal_reads)
For demo purposed we are using a small region of chr19 (300000-350000 Mb) of the hg19 reference genome. This could also be fetched from directly from synapse.
You can then use mapper of your choice to map these reads. We will use STAR to map the reads
2.1 Setting up the local environment
In [4]:
local_analysis_dir = tempfile.mkdtemp(prefix='temp_Demo_RNA-Seq_workflow')
#the following wud be replaced by the location of the STAR executable
STAR_executable = '/Users/abhishek/softwares/STAR_2.3.0e.OSX_x86_64/STAR'
#dir to store the STAR genome index for the reference genome
ref_genome_dir = create_dir(local_analysis_dir + '/ref_genome/')
2.2 Creating a genome index
In [ ]:
#the reference genome
ref_genome = 'syn2468557'
ref_genome = syn.get(ref_genome)
#create a STAR genome index
star_genome_index = STAR_executable + ' --runMode genomeGenerate --genomeDir {genome_dir} --genomeFastaFiles {genome_fasta}'
star_genome_index = star_genome_index.format(genome_dir=ref_genome_dir, genome_fasta = ref_genome.path)
! $star_genome_index
2.3 Map adrenal & brain tissue reads
In [7]:
#constructing a basic command to run the mapping
#this command could be replaced by any mapper of choice
basic_star_command_template = STAR_executable +' --runThreadN 1 --genomeDir {genome_dir} --outFileNamePrefix {prefix} --outSAMunmapped Within --readFilesIn {read_1}'
In [ ]:
#run the mapping commands
adrenal_star_command = basic_star_command_template.format(genome_dir = ref_genome_dir,
prefix = local_analysis_dir + '/adrenal_',
read_1 = adrenal_reads.path
)
! $adrenal_star_command
brain_star_command = basic_star_command_template.format(genome_dir = ref_genome_dir,
prefix = local_analysis_dir + '/brain_',
read_1 = brain_reads.path
)
! $brain_star_command
Key things required
For help refer to Synapse Getting Started Guide
In [11]:
#create a project
project_name = 'RNA_Seq_Project-%s' % randomword(5) ##mainly to create a random unique project name
myProj = syn.store(Project(project_name))
#create a folder to store the bams
mapped_folder = syn.store(Folder('mapped_files', parent = myProj))
In [12]:
brain_mapped_file = get_FilesList(local_analysis_dir, 'brain.*sam')[0]
brain_mapped_file = File(brain_mapped_file, parent = mapped_folder.id, synapseStore=False)
executed = [pushToSynapse(syn,brain_star_command, parentId=mapped_folder.id)]
used = [brain_reads]
brain_mapped_file = syn.store(brain_mapped_file, used = used, executed= executed)
#################
adrenal_mapped_file = get_FilesList(local_analysis_dir, 'adrenal.*sam')[0]
adrenal_mapped_file = synapseclient.File(adrenal_mapped_file, parent = mapped_folder , synapseStore=False)
executed = [pushToSynapse(syn, adrenal_star_command, parentId=mapped_folder.id)]
used = [adrenal_reads]
adrenal_mapped_file = syn.store(adrenal_mapped_file, used = used, executed= executed)
and this time lets push the provenance as we go to synapse
Key things needed
3.1 fetch the annotation & create a folder to store the counts
In [14]:
annotation_file = 'syn2468559' #pre-stored in synaspe
annotation_file = syn.get(annotation_file)
#lets create a folder to keep the counts
counts_folder = syn.store(synapseclient.Folder('counts', parentId=myProj.id))
In [15]:
counts_files = []
for sam in [adrenal_mapped_file, brain_mapped_file]:
count_file = sam.path.replace('.sam','_htseq-counts.txt')
command = "htseq-count -f sam -s no -q %s %s > %s" % (sam.path, annotation_file.path, count_file)
! $command
#pushing the counts file to synapse and
used = [sam, annotation_file]
executed = pushToSynapse(syn, command, parentId = counts_folder.id)
count_file = File(count_file, parentId=counts_folder.id)
count_file = syn.store(count_file, used=used, executed=executed)
#keeping for later use
counts_files.append(count_file)
In [16]:
cleaned_counts = merge_htseq_counts(counts_files)
cleaned_counts
Out[16]:
In [58]:
cleaned_counts.apply(lambda x: np.log2(x), axis=1).plot(rot=90, title="raw counts(log)", kind="bar")
plt.savefig('raw_read_counts.png')
4.1 Store the final merged counts with provenance
In [103]:
outFile = local_analysis_dir + '/merged_counts.tsv'
cleaned_counts.to_csv(outFile, sep="\t", header=True, index=True )
cleaned_counts_syn = syn.store(File(outFile,parentId = counts_folder.id), used=[x.id for x in counts_files])
In [109]:
wiki = temp_create_wiki(syn, cleaned_counts, 'raw_read_counts.png', myProj, cleaned_counts_syn)
In [110]:
syn.onweb(myProj)
In [17]:
syn.delete(myProj)
shutil.rmtree(local_analysis_dir)
In [ ]: