Omics pipe is an open-source, modular computational platform that automates ‘best practice’ multi-omics data analysis pipelines.
This Jupyter notebook wraps the functionality of Omics Pipe into an easy-to-use interactive Jupyter notebook and parses
the output for genomic interpretation. Read more about Omics Pipe at https://pythonhosted.org/omics_pipe/.
In [ ]:
#Omics Pipe Overview
from IPython.display import Image
Image(filename='/data/chip/2606129465-omics_pipe_overview.png', width=500, height=100)
In [ ]:
#Import Omics pipe and module dependencies
import yaml
from omics_pipe.parameters.default_parameters import default_parameters
from ruffus import *
import sys
import os
import time
import datetime
import drmaa
import csv
from omics_pipe.utils import *
from IPython.display import IFrame
import pandas
import glob
import os
import matplotlib.pyplot as plt
from matplotlib_venn import venn2,venn3, venn3_circles
%matplotlib inline
#%matplotlib notebook
import qgrid
qgrid.nbinstall(overwrite=True)
qgrid.set_defaults(remote_js=True, precision=4)
from IPython.display import HTML
import mygene
#Download scripts from https://github.com/gdavidson/ChIPseq_tools
sys.path.append('/data/chip/ChIPseq_tools-master') #append path to downloaded scripts
import compareGeneLists as compare
now = datetime.datetime.now()
date = now.strftime("%Y-%m-%d %H:%M")
In [ ]:
#Change top directory to locate result files
os.chdir("/data/chip")
In [ ]:
###Customize parameters: Specify sample names and conditions
sample_names = ["1_2percent_input_R1",
"1_h3k4me3_R1",
"1_h3k9ac_R1",
"1_h3k9me3_R1",
"2_2percent_input_R1",
"2_h3k4me3_R1",
"2_h3k9ac_R1",
"2_h3k9me3_R1",
"3_2percent_input_R1",
"3_h3k4me3_R1",
"3_h3k9ac_R1",
"3_h3k9me3_R1",
"4_2percent_input_R1",
"4_h3k4me3_R1",
"4_h3k9ac_R1",
"4_h3k9me3_R1",
"5_2percent_input_R1",
"5_h3k4me3_R1",
"5_h3k9ac_R1",
"5_h3k9me3_R1",
"6_2percent_input_R1",
"6_h3k4me3_R1",
"6_h3k9ac_R1",
"6_h3k9me3_R1"]
condition = ["Control",
"H3K4me3",
"H3K4ac",
"H3K9me3",
"Control",
"H3K4me3",
"H3K4ac",
"H3K9me3",
"Control",
"H3K4me3",
"H3K4ac",
"H3K9me3",
"Control",
"H3K4me3",
"H3K4ac",
"H3K9me3",
"Control",
"H3K4me3",
"H3K4ac",
"H3K9me3",
"Control",
"H3K4me3",
"H3K4ac",
"H3K9me3"
]
lib_type = ["single_end"]*len(condition)
In [ ]:
#Update Metadata File
meta = {'Sample': pandas.Series(sample_names), 'condition': pandas.Series(condition) , 'libType': pandas.Series(lib_type)}
meta_df = pandas.DataFrame(data = meta)
deseq_meta_new = "/data/chip/new_meta.csv"
meta_df.to_csv(deseq_meta_new,index=False)
print meta_df
In [ ]:
#Define pairs for differential peak calling (ChIP-input or Treatment-Control)
pairs = '1_h3k4me3_R1-1_2percent_input_R1 1_h3k9ac_R1-1_2percent_input_R1 1_h3k9me3_R1-1_2percent_input_R1 2_h3k4me3_R1-2_2percent_input_R1 2_h3k9ac_R1-2_2percent_input_R1 2_h3k9me3_R1-2_2percent_input_R1 3_h3k4me3_R1-3_2percent_input_R1 3_h3k9ac_R1-3_2percent_input_R1 3_h3k9me3_R1-3_2percent_input_R1 4_h3k4me3_R1-4_2percent_input_R1 4_h3k9ac_R1-4_2percent_input_R1 4_h3k9me3_R1-4_2percent_input_R1 5_h3k4me3_R1-5_2percent_input_R1 5_h3k9ac_R1-5_2percent_input_R1 5_h3k9me3_R1-5_2percent_input_R1 6_h3k4me3_R1-6_2percent_input_R1 6_h3k9ac_R1-6_2percent_input_R1 6_h3k9me3_R1-6_2percent_input_R1 6_2percent_input_R1-4_2percent_input_R1 6_h3k4me3_R1-4_h3k4me3_R1 6_h3k9ac_R1-4_h3k9ac_R1 6_h3k9me3_R1-4_h3k9me3_R1 5_2percent_input_R1-4_2percent_input_R1 5_h3k4me3_R1-4_h3k4me3_R1 5_h3k9ac_R1-4_h3k9ac_R1 5_h3k9me3_R1-4_h3k9me3_R1 6_2percent_input_R1-3_2percent_input_R1 6_h3k4me3_R1-3_h3k4me3_R1 6_h3k9ac_R1-3_h3k9ac_R1 6_h3k9me3_R1-3_h3k9me3_R1 5_2percent_input_R1-3_2percent_input_R1 5_h3k4me3_R1-3_h3k4me3_R1 5_h3k9ac_R1-3_h3k9ac_R1 5_h3k9me3_R1-3_h3k9me3_R1 1_2percent_input_R1-3_2percent_input_R1 1_h3k4me3_R1-3_h3k4me3_R1 1_h3k9ac_R1-3_h3k9ac_R1 1_h3k9me3_R1-3_h3k9me3_R1 2_2percent_input_R1-3_2percent_input_R1 2_h3k4me3_R1-3_h3k4me3_R1 2_h3k9ac_R1-3_h3k9ac_R1 2_h3k9me3_R1-3_h3k9me3_R1 1_2percent_input_R1-4_2percent_input_R1 1_h3k4me3_R1-4_h3k4me3_R1 1_h3k9ac_R1-4_h3k9ac_R1 1_h3k9me3_R1-4_h3k9me3_R1 2_2percent_input_R1-4_2percent_input_R1 2_h3k4me3_R1-4_h3k4me3_R1 2_h3k9ac_R1-4_h3k9ac_R1 2_h3k9me3_R1-4_h3k9me3_R1'
In [ ]:
#Define pairs of peaks to compare
pairs_to_compare = ['5_h3k4me3_R1_vs_5_2percent_input_R1-3_h3k4me3_R1_vs_3_2percent_input_R1','5_h3k9ac_R1_vs_5_2percent_input_R1-3_h3k9ac_R1_vs_3_2percent_input_R1','5_h3k9me3_R1_vs_5_2percent_input_R1-3_h3k9me3_R1_vs_3_2percent_input_R1','5_h3k4me3_R1_vs_5_2percent_input_R1-4_h3k4me3_R1_vs_4_2percent_input_R1','5_h3k9ac_R1_vs_5_2percent_input_R1-4_h3k9ac_R1_vs_4_2percent_input_R1','5_h3k9me3_R1_vs_5_2percent_input_R1-4_h3k9me3_R1_vs_4_2percent_input_R1','6_h3k4me3_R1_vs_6_2percent_input_R1-3_h3k4me3_R1_vs_3_2percent_input_R1','6_h3k9ac_R1_vs_6_2percent_input_R1-3_h3k9ac_R1_vs_3_2percent_input_R1','6_h3k9me3_R1_vs_6_2percent_input_R1-3_h3k9me3_R1_vs_3_2percent_input_R1','6_h3k4me3_R1_vs_6_2percent_input_R1-4_h3k4me3_R1_vs_4_2percent_input_R1','6_h3k9ac_R1_vs_6_2percent_input_R1-4_h3k9ac_R1_vs_4_2percent_input_R1','6_h3k9me3_R1_vs_6_2percent_input_R1-4_h3k9me3_R1_vs_4_2percent_input_R1','1_h3k4me3_R1_vs_1_2percent_input_R1-3_h3k4me3_R1_vs_3_2percent_input_R1','1_h3k9ac_R1_vs_1_2percent_input_R1-3_h3k9ac_R1_vs_3_2percent_input_R1','1_h3k9me3_R1_vs_1_2percent_input_R1-3_h3k9me3_R1_vs_3_2percent_input_R1','2_h3k4me3_R1_vs_2_2percent_input_R1-4_h3k4me3_R1_vs_4_2percent_input_R1','2_h3k9ac_R1_vs_2_2percent_input_R1-4_h3k9ac_R1_vs_4_2percent_input_R1','2_h3k9me3_R1_vs_2_2percent_input_R1-4_h3k9me3_R1_vs_4_2percent_input_R1','1_h3k4me3_R1_vs_1_2percent_input_R1-3_h3k4me3_R1_vs_3_2percent_input_R1','1_h3k9ac_R1_vs_1_2percent_input_R1-3_h3k9ac_R1_vs_3_2percent_input_R1','1_h3k9me3_R1_vs_1_2percent_input_R1-3_h3k9me3_R1_vs_3_2percent_input_R1','2_h3k4me3_R1_vs_2_2percent_input_R1-4_h3k4me3_R1_vs_4_2percent_input_R1','2_h3k9ac_R1_vs_2_2percent_input_R1-4_h3k9ac_R1_vs_4_2percent_input_R1','2_h3k9me3_R1_vs_2_2percent_input_R1-4_h3k9me3_R1_vs_4_2percent_input_R1','6_h3k4me3_R1_vs_4_h3k4me3_R1-6_h3k9me3_R1_vs_4_h3k9me3_R1','5_h3k4me3_R1_vs_4_h3k4me3_R1-5_h3k9me3_R1_vs_4_h3k9me3_R1','6_h3k4me3_R1_vs_3_h3k4me3_R1-6_h3k9me3_R1_vs_3_h3k9me3_R1','5_h3k4me3_R1_vs_3_h3k4me3_R1-5_h3k9me3_R1_vs_3_h3k9me3_R1','6_h3k9ac_R1_vs_4_h3k9ac_R1-6_h3k9me3_R1_vs_4_h3k9me3_R1','5_h3k9ac_R1_vs_4_h3k9ac_R1-5_h3k9me3_R1_vs_4_h3k9me3_R1','6_h3k9ac_R1_vs_3_h3k9ac_R1-6_h3k9me3_R1_vs_3_h3k9me3_R1','5_h3k9ac_R1_vs_3_h3k9ac_R1-5_h3k9me3_R1_vs_3_h3k9me3_R1','6_h3k4me3_R1_vs_4_h3k4me3_R1-6_h3k9ac_R1_vs_4_h3k9ac_R1','5_h3k4me3_R1_vs_4_h3k4me3_R1-5_h3k9ac_R1_vs_4_h3k9ac_R1','6_h3k4me3_R1_vs_3_h3k4me3_R1-6_h3k9ac_R1_vs_3_h3k9ac_R1','5_h3k4me3_R1_vs_3_h3k4me3_R1-5_h3k9ac_R1_vs_3_h3k9ac_R1']
In [ ]:
###Update parameters, such as GENOME, GTF_FILE, paths, etc
parameters = "/root/src/omics-pipe/tests/test_params_ChIPseq_HOMER_AWS.yaml"
stream = file(parameters, 'r')
params = yaml.load(stream)
params.update({"SAMPLE_LIST": sample_names})
params.update({"PAIR_LIST": pairs})
params.update({"R_VERSION": '3.2.3'})
params.update({"GENOME": '/database/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa'})
params.update({"REF_GENES": '/database/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.gtf'})
params.update({"RAW_DATA_DIR": '/data/data'})
params.update({"TEMP_DIR": '/data/data/tmp'})
params.update({"PIPE_MULTIPROCESS": 100})
params.update({"STAR_VERSION": '2.4.5a'})
params.update({"PARAMS_FILE": '/data/results/updated_params.yaml'})
params.update({"LOG_PATH": ':/data/results/logs'})
params.update({"QC_PATH": "/data/results/QC"})
params.update({"FLAG_PATH": "/data/results/flags"})
params.update({"BOWTIE_RESULTS": "/data/results/bowtie"})
params.update({"HOMER_RESULTS": "/data/results/homer"})
params.update({"BOWTIE_INDEX": "/data/database/Homo_sapiens/UCSC/hg19/Sequence/BowtieIndex/genome"})
params.update({"ENDS": 'SE'})
params.update({"HOMER_VERSION": '4.6'})
params.update({"TRIMMED_DATA_PATH": "/data/results/trimmed"})
params.update({"HOMER_TRIM_OPTIONS": "-3 GATCGGAAGAGCACACGTCT -mis 1 -minMatchLength 6 -min 45"})
params.update({"HOMER_PEAKS_OPTIONS": "-o auto -region -size 1000 -minDist 2500"})
params.update({"HOMER_MOTIFS_OPTIONS": "-start -1000 -end 100 -len 8,10 -p 4"})
params.update({"HOMER_ANNOTATE_OPTIONS":""})
params.update({"HOMER_GENOME": "hg19"})
#update params
default_parameters.update(params)
#write yaml file
stream = file('updated_params.yaml', 'w')
yaml.dump(params,stream)
p = Bunch(default_parameters)
#View Parameters
print "Run Parameters: \n" + str(params)
The following commands execute the Omics Pipe ChIPseq HOMER pipeline http://homer.salk.edu/homer/index.html
In [ ]:
### Omics Pipe Pipelines
from IPython.display import Image
Image(filename='/data/chip/2365251253-omics_pipe_pipelines_20140402.png', width=700, height=250)
In [ ]:
###Run Omics Pipe from the command line
!omics_pipe ChIPseq_HOMER /data/chip/updated_params.yaml
In [ ]:
#Change top directory to locate result files
os.chdir("/data/chip")
In [ ]:
#Display Omics Pipe Pipeline Run Status
#pipeline = './flags/pipeline_combined_%s.pdf' % date
pipeline = './flags/pipeline_combined_2016-05-16 17:41.pdf'
IFrame(pipeline, width=700, height=500)
Quality control of the raw data (fastq files) was assessed using the tool FastQC (http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). The results for all samples are summarized below, and samples are given a PASS/FAIL rating.
In [ ]:
###Summarize FastQC raw data QC results per sample
results_dir = './QC/'
# Below is the complete list of labels in the summary file
summary_labels = ["Basic Statistics", "Per base sequence quality", "Per tile sequence quality",
"Per sequence quality scores", "Per base sequence content", "Per sequence GC content",
"Per base N content", "Sequence Length Distribution", "Sequence Duplication Levels",
"Overrepresented sequences", "Adapter Content", "Kmer Content"]
# Below is the list I anticipate caring about; I leave the full list above in case it turns out later
# I anticipated wrong and need to update this one.
labels_of_interest = ["Basic Statistics", "Per base sequence quality"]
# Look for each file named summary.txt in each subdirectory named *_fastqc in the results directory
summary_wildpath = os.path.join(results_dir, '*/*_fastqc', "summary.txt")
summary_filepaths = [x for x in glob.glob(summary_wildpath)]
#print os.getcwd()
# Examine each of these files to find lines starting with "FAIL" or "WARN"
for curr_summary_path in summary_filepaths:
has_error = False
#print(divider)
with open(curr_summary_path, 'r') as f:
for line in f:
if line.startswith("FAIL") or line.startswith("WARN"):
fields = line.split("\t")
if not has_error:
print(fields[2].strip() + ": PASS") # the file name
has_error = True
if fields[1] in labels_of_interest:
print(fields[0] + "\t" + fields[1])
In [ ]:
#Display QC results for individual samples
sample = "6_h3k9me3_R1"
name = '/data/chip/QC/%s_fastqc/fastqc_report.html' % (sample)
#name = './QC/%s/%s_fastqc/fastqc_report.html' % (sample,sample)
IFrame(name, width=1000, height=600)
The samples were aligned to the genome with the Bowtie aligner (http://bowtie-bio.sourceforge.net/index.shtml). The alignment statistics for all samples are summarized and displayed below. Samples that do not pass the alignment quality filter (Good quality = # aligned reads > 10 million and % aligned > 60%) are excluded from downstream analyses.
In [ ]:
#Run samstat to produce summary statistics from Bowtie output
!samstat ./bowtie/*/*.bam
In [ ]:
##Summarize Alignment QC Statistics
import sys
from io import StringIO
align_dir = './bowtie/'
# Look for each file named summary.txt in each subdirectory named *_fastqc in the results directory
summary_wildpath = os.path.join(align_dir, '*/', "*.bam.samstat.html")
#summary_wildpath = os.path.join(star_dir, "*Log.final.out")
summary_filepaths = [x for x in glob.glob(summary_wildpath)]
#print summary_filepaths
alignment_stats = pandas.DataFrame()
for curr_summary_path in summary_filepaths:
#with open(curr_summary_path, 'r') as f:
filename = curr_summary_path.replace("./bowtie/","")
filename2 = filename.replace(".bam.samstat.html","")
filename3 = filename2.replace("/*","")
dfs = pandas.read_html(curr_summary_path, header =0)
df = dfs[0]
raw_reads1 = df["Number"]
raw_reads = raw_reads1[6]
aligned_reads1 = df["Number"]
aligned_reads = aligned_reads1[0]
percent_aligned1 = df["Percentage"]
percent_aligned = percent_aligned1[0]
d = {"Sample": pandas.Series(filename3), "Raw_Reads": pandas.Series(float(raw_reads)),
"Aligned_Reads": pandas.Series(float(aligned_reads)),
"Percent_Uniquely_Aligned": pandas.Series(percent_aligned)}
p = pandas.DataFrame(data=d)
alignment_stats = alignment_stats.append(p)
#print alignment_stats
alignment_stats.to_csv("alignment_stats_summary.csv",index=False)
#View interactive table
qgrid.show_grid(alignment_stats, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})
In [ ]:
#Barplot of number of aligned reads per sample
plt.figure(figsize=(10,10))
ax = plt.subplot(111)
alignment_stats.plot(ax=ax, kind='barh', title='# of Reads')
ax.axis(x='off')
ax.axvline(x=10000000, linewidth=2, color='Red', zorder=0)
#plt.xlabel('# Aligned Reads',fontsize=16)
for i, x in enumerate(alignment_stats.Sample):
ax.text(0, i + 0, x, ha='right', va= "bottom", fontsize='medium')
plt.savefig('./alignment_stats_%s' %date ,dpi=300) # save figure
In [ ]:
###Flag samples with poor alignment or low numbers of reads
df = alignment_stats
failed_samples = df.loc[(df.Aligned_Reads < 10000000) | (df.Percent_Uniquely_Aligned < 40), ['Sample','Raw_Reads', 'Aligned_Reads', 'Percent_Uniquely_Aligned']]
print failed_samples
#View interactive table
#qgrid.show_grid(failed_samples, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})
In [ ]:
#View Alignment Statistics for failed samples
for failed in failed_samples["Sample"]:
#fname = "/data/results/star/%s/Log.final.out" % failed
fname = "./bowtie/%s.bam.samstat.html" % failed
print fname
IFrame(fname, width=1000, height=600)
In [ ]:
###Samples that passed QC for alignment
passed_samples = df.loc[(df.Aligned_Reads > 10000000) | (df.Percent_Uniquely_Aligned > 40), ['Sample','Raw_Reads', 'Aligned_Reads', 'Percent_Uniquely_Aligned']]
print "Number of samples that passed alignment QC = " + str(len(passed_samples))
#View interactive table
#qgrid.show_grid(passed_samples, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})
In [ ]:
#View Alignment Statistics for passed samples
for passed in passed_samples["Sample"]:
#fname = "/data/results/star/%s/Log.final.out" % passed
fname = "./bowtie/%s.bam.samstat.html" % passed
print passed
IFrame(fname, width=1000, height=600)
In [ ]:
for sample in sample_names:
fi = "./%s/tagCountDistribution.txt" % sample
counts1 = pandas.read_csv(fi, sep="\t")
counts = counts1.head(10)
counts.set_index = 0
counts[[1]].plot.bar().set_title(sample)
plt.savefig('./clonal_distribution_plot_%s' %sample ,dpi=300) # save figure
tagAutocorrelation.txt - The autocorrelation routine creates a distribution of distances between adjacent reads in the genome. If reads are mapped to the same strand, they are added to the first column. If adjacent reads map to different strands, they are added to the 2nd column. The results from autocorrelation analysis are very useful for troubleshooting problems with the experiment, and are used to estimate the fragment length for ChIP-Seq and MNase-Seq.
In [ ]:
for sample in sample_names:
fi = "./%s/tagAutocorrelation.txt" % sample
tags = pandas.read_csv(fi, sep="\t")
#Distance in bp(Fragment Length Estimate: 164)(Peak Width Estimate: 164) Same Strand (+ for Watson strand, - for Crick) Opposite Strand
tags.columns = ['Relative_Distance_Between_Reads(bp)', 'Same_Strand', 'Opposite_Strand']
ax1 = tags.plot(x='Relative_Distance_Between_Reads(bp)', y=['Same_Strand','Opposite_Strand'])
ax1.set_ylim(10000,250000)
ax1.set_xlim(-1000,1000)
ax1.set_title(sample)
plt.savefig('./autocorrelation_plot_%s' %sample ,dpi=300) # save figure
HOMER (Hypergeometric Optimization of Motif EnRichment) is a suite of tools for Motif Discovery and next-gen sequencing analysis. It is a collection of command line programs for unix-style operating systems written in Perl and C++. HOMER was primarily written as a de novo motif discovery algorithm and is well suited for finding 8-20 bp motifs in large scale genomics data. http://homer.salk.edu/homer/index.html
In [ ]:
pairs1 = pairs.replace(" ", ",")
pairs2 = pairs1.replace("-", "_vs_")
pairs3 = pairs2.split(",")
peak_stats = pandas.DataFrame()
for pair in pairs3:
fname = "./%s/regions.txt" % pair
with open(fname, 'r') as fin:
head = [next(fin) for x in xrange(40)]
df = pandas.DataFrame(head)
df.columns=["col"]
df['col'] = df['col'].str.replace('\n','')
df = pandas.DataFrame(df.col.str.split('=',1).tolist(),columns = ['sample',pair])
df_items = df[['sample']]
df_values = df[[pair]]
peak_stats = pandas.concat([peak_stats, df_values],axis=1)
#print pair
peak_stats = pandas.concat([df_items,peak_stats],axis=1)
peak_stats =peak_stats.transpose()
peak_stats =peak_stats.dropna(axis=1)
peak_stats.columns = peak_stats.iloc[0]
peak_stats = peak_stats[1:]
peak_stats.to_csv("peak_stats_summary.csv",index=False)
#View interactive table
qgrid.show_grid(peak_stats, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})
In [ ]:
#Display peak summary graphs
#Barplot of number of peaks per sample
num_peaks = peak_stats.iloc[:,[1]]
num_peaks.columns = ["Number of Peaks"]
num_peaks =num_peaks.convert_objects(convert_numeric=True)
num_peaks = num_peaks.sort_values(["Number of Peaks"],axis=0,ascending=False)
num_peaks.plot.bar(figsize=(15, 5))
plt.savefig('./peaks_summary.png' ,dpi=300) # save figure
Approximate IP effeciency describes the fraction of tags found in peaks versus. genomic background. This provides an estimate of how well the ChIP worked. Certain antibodies like H3K4me3, ERa, or PU.1 will yield very high IP efficiencies (>20%), while most rand in the 1-20% range. Once this number dips below 1% it's a good sign the ChIP didn't work very well and should probably be optimized. http://homer.salk.edu/homer/ngs/peaks.html
In [ ]:
#Display IP efficiency summary graphs, with horizontal line at y=1
IP = peak_stats.iloc[:,[8]]
IP.columns = ["IP_Efficiency"]
IP['IP_Efficiency'] = IP['IP_Efficiency'].replace('%','',regex=True).astype('float')
IP =IP.sort_values(['IP_Efficiency'],axis=0,ascending=False)
IP.plot.bar(figsize=(15, 5))
plt.axhline(y=1, color = "red", linewidth = 2)
plt.savefig('./ipefficiency_summary.png' ,dpi=300) # save figure
In [ ]:
#Summarize annotation stats
annot_stats = pandas.DataFrame()
for pair in pairs3:
fname = "./%s/regions.annotate.txt" % pair
fi = pandas.read_csv(fname, sep="\t")
fi.columns = [c.replace(' ', '_') for c in fi.columns]
fi.Gene_Type.value_counts().plot(kind="pie",figsize=(6, 6))
plt.axis('equal')
plt.title(pair)
plt.savefig('./Peaks_Gene_Type_pie_%s.png' %pair ,dpi=300) # save figure
plt.show()
#qgrid.show_grid(fi, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})
#View interactive table
#qgrid.show_grid(peak_stats, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})
In [ ]:
#Summarize annotation stats
annot_stats = pandas.DataFrame()
for pair in pairs3:
fname = "./%s/regions.annotate.txt" % pair
fi = pandas.read_csv(fname, sep="\t")
fi.columns = [c.replace(' ', '_') for c in fi.columns]
fi['Annotation'] = fi['Annotation'].replace('\(.*?\)','',regex=True)
fi['Annotation'] = fi['Annotation'].replace(' \.*?','',regex=True)
fi['Annotation'] = fi['Annotation'].replace('\..*$','',regex=True)
fi.Annotation.value_counts().plot(kind="pie", figsize=(8, 8))
plt.axis('equal')
plt.title(pair)
plt.savefig('./Peaks_Gene_Type_pie_%s' %pair ,dpi=300) # save figure
plt.show()
In [ ]:
#Download scripts from https://github.com/gdavidson/ChIPseq_tools
import sys
sys.path.append('/data/chip/ChIPseq_tools-master') #append path to downloaded scripts
import getFromAnnotations as gfa
for pair in pairs3:
annotationList = gfa.getAnnotationList('%s/regions.annotate.txt' %pair)
#plot distances
try:
#pie chart
pieChartMap = gfa.getPieChartMap(annotationList)
gfa.pieChart(pieChartMap, pair)
plt.show()
plt.savefig('./Pie_Chart_with_numbers_%s' %pair ,dpi=300) # save figure
except ValueError:
next
In [ ]:
#qgrid.show_grid(fi.sample(200), grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})
In [ ]:
#Summarize annotation stats
kegg_stats = pandas.DataFrame()
for pair in pairs3:
fname = "./%s_GO/kegg.txt" % pair
fi = pandas.read_csv(fname, sep="\t")
fi.columns = [c.replace(' ', '_') for c in fi.columns]
fi = fi.loc[fi["Enrichment"] < 0.05]
fi["comparison"] = pair
kegg_stats = kegg_stats.append(fi)
#write summary to file
kegg_stats.to_csv("kegg_stats_summary.csv",index=False)
#View interactive table
qgrid.show_grid(kegg_stats, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})
View enriched motifs http://homer.salk.edu/homer/ngs/peakMotifs.html
In general, when analyzing ChIP-Seq / ChIP-Chip peaks you should expect to see strong enrichment for a motif resembling the site recognized by the DNA binding domain of the factor you are studying. Enrichment p-values reported by HOMER should be very very significant (i.e. << 1e-50). If this is not the case, there is a strong possibility that the experiment may have failed in one way or another. For example, the peaks could be of low quality because the factor is not expressed very high.
In [ ]:
#Summarize enriched motifs stats
motif_stats = pandas.DataFrame()
for pair in pairs3:
fname = "./%s/MotifOutput/knownResults.txt" % pair
fi = pandas.read_csv(fname, sep="\t")
fi.columns = ["Motif_Name", "Consensus", "P-value", "Log_P-value", "q-value_Benjamini", "#TargetSequenceswithMotif",
"%TargetSequenceswithMotif","#BackgroundSequenceswithMotif", "%BackgroundSequenceswithMotif",]
fi = fi.loc[fi["P-value"] < 1e-50]
fi["comparison"] = pair
motif_stats = motif_stats.append(fi)
#write summary to file
motif_stats.to_csv("motif_stats_summary.csv",index=False)
#View interactive table
qgrid.show_grid(motif_stats, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})
In [ ]:
#Summarize peaks in promoters
promoter_stats = pandas.DataFrame()
for pair in pairs3:
fname = "./%s/regions.annotate.txt" % pair
fi = pandas.read_csv(fname, sep="\t")
fi.columns = [c.replace(' ', '_') for c in fi.columns]
fi['Annotation'] = fi['Annotation'].replace('\(.*?\)','',regex=True)
fi['Annotation'] = fi['Annotation'].replace(' \.*?','',regex=True)
fi['Annotation'] = fi['Annotation'].replace('\..*$','',regex=True)
fi = fi.loc[fi["Annotation"] == "promoter-TSS"]
fi["comparison"] = pair
fi.Gene_Type.value_counts().plot(kind="bar", figsize=(8, 8))
plt.title("Peaks in Promoters by Gene Type -" + pair)
plt.show()
plt.xlabel('Gene Type', fontsize=12)
plt.ylabel('# of Peaks', fontsize=12)
plt.savefig('./Promoter_Peaks_Gene_Type_bar_%s' %pair ,dpi=300) # save figure
promoter_stats = promoter_stats.append(fi)
#write summary to file
promoter_stats.to_csv("promoter_stats_summary.csv",index=False)
In [ ]:
genes_df = pandas.read_csv("./genes_of_interest_validated_junctions.csv")
gene_names = genes_df["gene"]
genes_stats = pandas.DataFrame()
for pair in pairs3:
fname = "./%s/regions.annotate.txt" % pair
fi = pandas.read_csv(fname, sep="\t")
fi.columns = [c.replace(' ', '_') for c in fi.columns]
fi['Annotation'] = fi['Annotation'].replace('\(.*?\)','',regex=True)
fi['Annotation'] = fi['Annotation'].replace(' \.*?','',regex=True)
fi['Annotation'] = fi['Annotation'].replace('\..*$','',regex=True)
fi = fi.loc[fi["Gene_Name"].isin(gene_names)]
fi["comparison"] = pair
if fi.Annotation.empty:
next
else:
fi.Annotation.value_counts().plot(kind="bar", figsize=(8, 8))
plt.title("Peaks in Promoters by Annotation -" + pair)
plt.show()
plt.xlabel('Annotation', fontsize=12)
plt.ylabel('# of Peaks', fontsize=12)
plt.savefig('./Genes_of_Interest_Peaks_Annotation_bar_%s' %pair ,dpi=300) # save figure
genes_stats = genes_stats.append(fi)
#write summary to file
genes_stats.to_csv("genes_stats_summary.csv",index=False)
In [ ]:
#Download scripts from https://github.com/gdavidson/ChIPseq_tools
import sys
sys.path.append('/data/chip/ChIPseq_tools-master') #append path to downloaded scripts
import getFromAnnotations as gfa
for pair in pairs3:
annotationList = gfa.getAnnotationList('%s/regions.annotate.txt' %pair)
#plot distances
try:
distanceList,countMap = gfa.getDistanceList(annotationList)
gfa.histDistances(distanceList, pair)
plt.show()
plt.savefig('./TSS_distance_%s' %pair ,dpi=300) # save figure
gfa.plotDistances(countMap)
plt.show()
plt.savefig('./TSS_distance_bp_%s' %pair ,dpi=300) # save figure
except ValueError:
next
In [ ]:
genes_stats2 = genes_stats[['Gene_Name','comparison']]
genes_stats2.Gene_Name.value_counts().plot(kind="bar", figsize=(15, 8), stacked=True)
plt.xlabel('Genes of Interest', fontsize=12)
plt.ylabel('# of Peaks', fontsize=12)
plt.savefig('./Genes_of_Interest_Peaks_all.png' ,dpi=300) # save figure
In [ ]:
genes_stats2 = genes_stats[['Gene_Name','comparison']]
genes_stats2.comparison.value_counts().plot(kind="barh", figsize=(15, 8), stacked=True)
plt.xlabel('# Peaks', fontsize=12)
plt.ylabel('Comparison', fontsize=12)
plt.savefig('./Genes_of_Interest_Comparison_Peaks_all.png' ,dpi=300) # save figure
In [ ]:
sub_df = genes_stats2.groupby(['Gene_Name']).comparison.value_counts().unstack()
sub_df.plot(kind='bar',stacked=True, figsize=(15, 8)).legend(loc='center left', bbox_to_anchor=(1.0, 0.5) )
plt.xlabel('Genes of Interest', fontsize=12)
plt.ylabel('# of Peaks', fontsize=12)
plt.savefig('./Genes_of_Interest_Peaks_by_comparison.png' ,dpi=300) # save figure
In [ ]:
sub_df = genes_stats2.groupby(['comparison']).Gene_Name.value_counts().unstack()
sub_df.plot(kind='barh',stacked=True, figsize=(15, 8)).legend(loc='center left', bbox_to_anchor=(1.0, 0.5) )
plt.xlabel('# Peaks', fontsize=12)
plt.ylabel('Comparison', fontsize=12)
plt.savefig('./Comparison_by_genes_of_interest.png' ,dpi=300) # save figure
In [ ]:
for pairs in pairs_to_compare:
#print pairs
pairs_split = pairs.split("-")
pair1 = pairs_split[0]
pair2= pairs_split[1]
peaks1 = pandas.read_csv('./%s/regions.annotate.txt' %pair1, sep="\t")
peaks2 = pandas.read_csv('./%s/regions.annotate.txt' %pair2, sep="\t")
peaks1.columns = [c.replace(' ', '_') for c in peaks1.columns]
peaks1.columns.values[0] = "Peak_ID"
peaks2.columns = [c.replace(' ', '_') for c in peaks2.columns]
peaks2.columns.values[0] = "Peak_ID"
peaks1_list = peaks1['Gene_Name'].tolist()
peaks2_list = peaks2['Gene_Name'].tolist()
venn2([set(peaks1_list), set(peaks2_list)], (pair1,pair2))
plt.show()
plt.savefig('./Venn_Analysis_Genes_with_Peaks_%s.png' %pairs ,dpi=300) # save figure
commonGenes, uniqueL1, uniqueL2 = compare.compareLists(peaks1_list, peaks2_list)
commonGenes_df = pandas.DataFrame(commonGenes, columns = ["commonGenes"])
commonGenes_df.to_csv("CommonGenes_%s.csv" %pairs)
uniqueL1_df = pandas.DataFrame(uniqueL1, columns = ["uniqueL1"])
uniqueL1_df.to_csv("uniqueL1_%s_%s.csv" %(pair1, pairs))
uniqueL2_df = pandas.DataFrame(uniqueL2, columns = ["uniqueL2"])
uniqueL2_df.to_csv("uniqueL2_%s_%s.csv" %(pair2, pairs))
Cut and copy the following URLs to create custom tracks for your samples in UCSC Genome Browser
In [ ]:
for sample in sample_names:
url = "http://ccbb-analysis.s3.amazonaws.com/%s/%s.ucsc.bedGraph.gz" %(sample,sample)
print url
In [ ]:
IFrame("https://genome.ucsc.edu/cgi-bin/hgCustom?hgsid=504023239_5efJ2ONTkgrqUm6AcaAkNGcyXKmn", width=900, height=500)
In [ ]: