Call DNA-DNA interactions from raw ChiA-PET data

Define data folders, data, and sample name


In [ ]:
#remember that fasta should have built bowtie2 indices
annotation="/annotation_dir/"
tmp="/input_data/"
input_dir="/output_dir/"
output_dir="/output_dir/"
#WT condition
input_fq_1="rnaseq_mesc_1_1M.fq.gz"
input_fq_2="rnaseq_mesc_2_1M.fq.gz"
sample_name="rnaseq_mesc_1M_wt"

Run ChiA-PET2


In [ ]:
!ChIA-PET2 -g /data/hg19.fa -f /data/fastq/SRR2054934_pass_1.fastq.gz -r /data/fastq/SRR2054934_pass_2.fastq.gz \
    -b /data/hg19.chrom.sizes -A ACGCGATATCTTATCTGACT -B AGTCAGATAAGATATCGCGT \
    -m 1 -s 1 -t 4 -M "-q .1" -k 2  -C 1

Run Origami... kindaaaaa real janky atm


In [ ]:
#where all this stuff came from
##note: some of this was done sequentially, ie before doing final conversion interaction calling has to finish. 
##this means that this file cannot be run from start to finish in one go, instead different segments will have to be commented out or in

cd /nfs/young_ata4/ENHANCER_RNA/Workshop/CENTRALIZED_FILE_FREEZE/20170412

##########################origami##########################
#mkdir Origami
#cd Origami

##lock away the scripts
#mkdir Origami_v1.1_freeze
#cd Origami_v1.1_freeze
#cp -R /home/dsday/projects/ORIGAMI/RunningVersion/origami-v1.1-alpha-2/ origami-v1.1-alpha-2
#cp /nfs/young_ata4/ORIGAMI/Data/whitehead-read-name-adjust.sh whitehead-read-name-adjust.sh
#cp /nfs/young_ata4/ENHANCER_RNA/Workshop/uniform-calling.sh uniform-calling.sh
##cp -R /nfs/young_ata4/ENHANCER_RNA/Workshop/DSD/LoopingAltClassification/ConIntDevel ConInt 
#copy new version of this
#cp -R /nfs/young_ata4/ENHANCER_RNA/Workshop/DSD/LoopingAltClassification/ConIntDevel20170423 20170422_ConInt 

origami_dir="/nfs/young_ata4/ENHANCER_RNA/Workshop/CENTRALIZED_FILE_FREEZE/20170412/Origami/"
origami_vers="Origami_v1.1_freeze/origami-v1.1-alpha-2"
##############################################################################################
########################                YY1                          #########################
##pre-process
cd $origami_dir
##YY1
#mkdir YY1_rep1_rep2_merged
cd YY1_rep1_rep2_merged
##rep1
#ln -s /lab/solexa_public/Young/160122_WIGTC-HISEQ2B_C6E4BANXX/QualityScore/AGGCAG-s_7_1_sequence.txt AGGCAG-s_7_1_sequence.txt
#ln -s /lab/solexa_public/Young/160122_WIGTC-HISEQ2B_C6E4BANXX/QualityScore/AGGCAG-s_7_2_sequence.txt AGGCAG-s_7_2_sequence.txt

##rep2 
#tar --strip-components 5 --to-stdout -xzvf /lab/solexa_public/Young/160308_WIGTC-HISEQA_HKFTJBCXX/QualityScore/AGGCAG-s_1_1_sequence.txt.tar.gz > AGGCAG-s_1_1_sequence.txt
#tar --strip-components 5 --to-stdout -xzvf /lab/solexa_public/Young/160308_WIGTC-HISEQA_HKFTJBCXX/QualityScore/AGGCAG-s_1_2_sequence.txt.tar.gz > AGGCAG-s_1_2_sequence.txt
#tar --strip-components 5 --to-stdout -xzvf /lab/solexa_public/Young/160308_WIGTC-HISEQA_HKFTJBCXX/QualityScore/AGGCAG-s_2_1_sequence.txt.tar.gz > AGGCAG-s_2_1_sequence.txt
#tar --strip-components 5 --to-stdout -xzvf /lab/solexa_public/Young/160308_WIGTC-HISEQA_HKFTJBCXX/QualityScore/AGGCAG-s_2_2_sequence.txt.tar.gz > AGGCAG-s_2_2_sequence.txt

##merge rep1 and rep2
#cat AGGCAG-s_7_1_sequence.txt AGGCAG-s_1_1_sequence.txt AGGCAG-s_2_1_sequence.txt > YY1.merged_1.txt 
#cat AGGCAG-s_7_2_sequence.txt AGGCAG-s_1_2_sequence.txt AGGCAG-s_2_2_sequence.txt > YY1.merged_2.txt 


#call this damn thing
#bsub -q14 -e YY1_rep1_rep2_merged.alignment.err -o YY1_rep1_rep2_merged.alignment.out -J YY1_rep1_rep2_alignment "$origami_dir$origami_vers/bin/origami-alignment --output=$origami_dir/YY1_rep1_rep2_merged --pp=$origami_dir$origami_vers/whitehead-read-name-adjust.sh --macs-gsize=mm /nfs/genomes/mouse_gp_jul_07/bowtie/mm9 YY1.merged_1.txt YY1.merged_2.txt" 
##once the first part (trimming + alignment is done) this gets run, dan says the alignment  can also randomly crash out in the middle and need to be restarted from the beginning...
#bsub -q14 -e YY1_rep1_rep2_merged.analysis.err -o YY1_rep1_rep2_merged.analysis.out -J YY1_rep1_rep2_analysis -w "ended("YY1_rep1_rep2_alignment")" "$origami_dir$origami_vers/uniform-calling.sh YY1_rep1_rep2_merge"

####do some cleaning up once everything worked
#rm -f AGGCAG*
#rm -f YY1.merged*
#rm -f *fq*c

##once analysis is done do some file conversion because CSV files are the dumbest thing ever
#cd analysis
#$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.csv 3 > 20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_all.bedpe
#$origami_dir$origami_vers/bin/origami-conversion -c washu analysis-results.csv 3 > 20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_all.washu
#$origami_dir$origami_vers/bin/origami-conversion bed analysis-results.csv 3 > 20170419.YY1_rep1_rep2_merged.origami_v1.1.all.ints_bed
##find significant (>0.9) interactions
#awk -F "\"*,\"*" '{if ($9 > 0.9) print $0}'  analysis-results.csv  > analysis-results.significant.csv
#$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.csv 3 > 20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.9.bedpe
#$origami_dir$origami_vers/bin/origami-conversion -c washu analysis-results.significant.csv 3 > 20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.9.washu
#$origami_dir$origami_vers/bin/origami-conversion bed analysis-results.significant.csv 3 > 20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.9.bed


awk -F "\"*,\"*" '{if ($9 > 0.8) print $0}'  analysis-results.csv  > analysis-results.significant.0.8.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.8.csv 3 >  20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.8.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.7) print $0}'  analysis-results.csv  > analysis-results.significant.0.7.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.7.csv 3 >  20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.7.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.6) print $0}'  analysis-results.csv  > analysis-results.significant.0.6.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.6.csv 3 >  20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.6.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.5) print $0}'  analysis-results.csv  > analysis-results.significant.0.5.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.5.csv 3 >  20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.5.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.4) print $0}'  analysis-results.csv  > analysis-results.significant.0.4.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.4.csv 3 >  20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.4.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.3) print $0}'  analysis-results.csv  > analysis-results.significant.0.3.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.3.csv 3 >  20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.3.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.2) print $0}'  analysis-results.csv  > analysis-results.significant.0.2.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.2.csv 3 >  20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.2.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.1) print $0}'  analysis-results.csv  > analysis-results.significant.0.1.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.1.csv 3 >  20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.1.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.01) print $0}'  analysis-results.csv  > analysis-results.significant.0.01.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.01.csv 3 >  20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.01.bedpe



##############################################################################################
########################                CTCF                         #########################
cd /nfs/young_ata4/ENHANCER_RNA/Workshop/CENTRALIZED_FILE_FREEZE/20170412/Origami

#mkdir CTCF_rep1
#cd CTCF_rep1

#bsub "tar --strip-components 5 --to-stdout -xzvf /lab/solexa_public/Young/160523_WIGTC-HISEQ2A_C90H5ANXX/QualityScore/TAAGGC-s_7_1_sequence.txt.tar.gz > CTCF_1.txt"
#bsub "tar --strip-components 5 --to-stdout -xzvf /lab/solexa_public/Young/160523_WIGTC-HISEQ2A_C90H5ANXX/QualityScore/TAAGGC-s_7_2_sequence.txt.tar.gz > CTCF_2.txt"
#bsub -q14 -e CTCF_rep1_merged.alignment.err -o CTCF_rep1_merged.alignment.out -J CTCF_rep1_alignment "$origami_dir$origami_vers/bin/origami-alignment --output=$origami_dir/CTCF_rep1 --pp=$origami_dir$origami_vers/whitehead-read-name-adjust.sh --macs-gsize=mm /nfs/genomes/mouse_gp_jul_07/bowtie/mm9 CTCF_1.txt CTCF_2.txt" 
##once the first part (trimming + alignment is done) the analysis gets run, dan says the alignment  can also randomly crash out in the middle and need to be restarted from the beginning...
#bsub -q14 -e CTCF_rep1.analysis.err -o CTCF_rep1.analysis.out -J CTCF_rep1.analysis -w "ended("CTCF_rep1_alignment")" "$origami_dir$origami_vers/uniform-calling.sh CTCF_rep1"

##do some cleaning up once everything worked
#rm -f CTCF_1.txt
#rm -f CTCF_2.txt
#rm -f *fq*

##once analysis is done do some file conversion because CSV files are the dumbest thing ever
#cd analysis

#$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.csv 3 > 20170419.CTCF_rep1.origami_v1.1.ints_all.bedpe
#$origami_dir$origami_vers/bin/origami-conversion -c washu analysis-results.csv 3 > 20170419.CTCF_rep1.origami_v1.1.ints_all.washu
#$origami_dir$origami_vers/bin/origami-conversion bed analysis-results.csv 3 > 20170419.CTCF_rep1.origami_v1.1.ints_all.bed
##find significant (>0.9) interactions
#awk -F "\"*,\"*" '{if ($9 > 0.9) print $0}'  analysis-results.csv  > analysis-results.significant.csv
#$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.csv 3 >  20170419.CTCF_rep1.origami_v1.1.ints_0.9.bedpe
#$origami_dir$origami_vers/bin/origami-conversion -c washu analysis-results.significant.csv 3 > 20170419.CTCF_rep1.origami_v1.1.ints_0.9.washu
#$origami_dir$origami_vers/bin/origami-conversion bed analysis-results.significant.csv 3 > 20170419.CTCF_rep1.origami_v1.1.ints_0.9.bed

awk -F "\"*,\"*" '{if ($9 > 0.8) print $0}'  analysis-results.csv  > analysis-results.significant.0.8.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.8.csv 3 >  20170419.CTCF_rep1.origami_v1.1.ints_0.8.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.7) print $0}'  analysis-results.csv  > analysis-results.significant.0.7.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.7.csv 3 >  20170419.CTCF_rep1.origami_v1.1.ints_0.7.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.6) print $0}'  analysis-results.csv  > analysis-results.significant.0.6.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.6.csv 3 >  20170419.CTCF_rep1.origami_v1.1.ints_0.6.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.5) print $0}'  analysis-results.csv  > analysis-results.significant.0.5.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.5.csv 3 >  20170419.CTCF_rep1.origami_v1.1.ints_0.5.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.4) print $0}'  analysis-results.csv  > analysis-results.significant.0.4.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.4.csv 3 >  20170419.CTCF_rep1.origami_v1.1.ints_0.4.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.3) print $0}'  analysis-results.csv  > analysis-results.significant.0.3.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.3.csv 3 >  20170419.CTCF_rep1.origami_v1.1.ints_0.3.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.2) print $0}'  analysis-results.csv  > analysis-results.significant.0.2.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.2.csv 3 >  20170419.CTCF_rep1.origami_v1.1.ints_0.2.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.1) print $0}'  analysis-results.csv  > analysis-results.significant.0.1.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.1.csv 3 >  20170419.CTCF_rep1.origami_v1.1.ints_0.1.bedpe

awk -F "\"*,\"*" '{if ($9 > 0.01) print $0}'  analysis-results.csv  > analysis-results.significant.0.01.csv
$origami_dir$origami_vers/bin/origami-conversion bedpe analysis-results.significant.0.01.csv 3 >  20170419.CTCF_rep1.origami_v1.1.ints_0.01.bedpe


#Rscript /nfs/young_ata3/abew/R/bedpetowashu.R  -i 20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_all.bedpe  -o 20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_all.centered.washu

##############################################################################################
########################        interaction classification           #########################

cd /nfs/young_ata4/ENHANCER_RNA/Workshop/CENTRALIZED_FILE_FREEZE/20170412/Origami
#mkdir Int_Class
#cd Int_Class
#ln -s ../CTCF_rep1/analysis/20170419.CTCF_rep1.origami_v1.1.ints_0.9.bedpe 20170419.CTCF_rep1.origami_v1.1.ints_0.9.bedpe
#ln -s ../YY1_rep1_rep2_merged/analysis/20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.9.bedpe 20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.9.bedpe
#bsub -q14 -e Int_Class.err -o Int_Class.out "$origami_dir$origami_vers/ConInt/continuous-integration.sh --keep-tmp 20170423.classinfo.xlsx"

mkdir 20170423_DD_Int_Class
cd 20170423_DD_Int_Class
ln -s ../CTCF_rep1/analysis/20170419.CTCF_rep1.origami_v1.1.ints_0.9.bedpe 20170419.CTCF_rep1.origami_v1.1.ints_0.9.bedpe
ln -s ../YY1_rep1_rep2_merged/analysis/20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.9.bedpe 20170419.YY1_rep1_rep2_merged.origami_v1.1.ints_0.9.bedpe
bsub -q14 -e Int_Class.err -o Int_Class.out "$origami_dir$origami_vers/20170422_ConInt/continuous-integration.sh -a -e --keep-tmp 20170423.classinfo.xlsx"


####classify by my script so that we can keep K27Ac that overlap promoters

#cd /nfs/young_ata4/ENHANCER_RNA/Workshop/CENTRALIZED_FILE_FREEZE/20170412/Origami
#mkdir AW_Int_class
#cd AW_Int_class/

bash AW_int_class.txt
Rscript AW.intclass.R 

##########################genomic data##########################
#cd /nfs/young_ata4/ENHANCER_RNA/Workshop/CENTRALIZED_FILE_FREEZE/20170412
#mkdir Genomic_Data
#cd Genomic_Data
#K27Ac from Xiong
#cp /nfs/young_ata4/ENHANCER_RNA/Workshop/Peak_Calls/mES_H3K27ac_wctrl_peaks.bed mES_H3K27ac_wctrl_peaks.bed 
#OSN from Whyte
#cp /nfs/young_ata4/WORKSHOP/SuperEnhancer/mES/mES_HiSeq_OSN_enhs.gff.by_comp mES_HiSeq_OSN_enhs.gff.by_comp
#Transcripts from Brian
#cp /nfs/genomes/mouse_gp_jul_07/gtf/mm9.refseq.02_01_2017.gtf mm9.refseq.02_01_2017.gtf

#Promoters (called by: grep -w transcript /nfs/genomes/mouse_gp_jul_07/gtf/mm9.refseq.02_01_2017.gtf | awk -F'[\t ]' '{print $1 "|" $10 "|" $4 "|" $5 "|" $7}' | sort | uniq | grep -v "random" | sed s/\"//g | sed s/\;//g | awk -F'[\|]' '{
 # if($5=="+")
  #  print $1 "\t" $2 "|" NR "\t\t" $3-2000 "\t" $3+2000 "\t\t+\t\t" $2 "|" NR
 # else if($5=="-")
  #  print $1 "\t" $2 "|" NR "\t\t" $4-2000 "\t" $4+2000 "\t\t-\t\t" $2 "|" NR}' > mm9.refseq.02_01_2017.4kbpromsCollapsed mm9.refseq.02_01_2017.4kbproms.gff

#CTCF
#cp /nfs/young_ata4/ENHANCER_RNA/Workshop/Peak_Calls/mES_CTCF_wctrl_peaks.bed mES_CTCF_wctrl_peaks.bed 

#insulated neighborhoods - downloaded from website 4/22/17 @ 1 pm
#wget http://younglab.wi.mit.edu/INs/mES-data.xlsx
#mv mES-data.xlsx Hnisz.review.murine.ESC.IN.xlsx
#bullshit code that took me too long to write that removes the bullshit lines that denes put in there and also extracts unique anchors
#Rscript reformatIN.R 

##brian then does some voodoo magic to make a new list b/c things are never simple

#intersectBed -u -wa -a /nfs/young_ata4/ENHANCER_RNA/Workshop/Peak_Calls/mES_CTCF_wctrl_peaks.bed -b /nfs/young_ata4/ENHANCER_RNA/Workshop/CENTRALIZED_FILE_FREEZE/20170412/Genomic_Data/Hnisz.review.murine.ESC.IN.uniqueanchors.bed > mES_CTCF_wctrl_peaks.inINanchors.bed
#intersectBed -v -b /nfs/young_ata4/ENHANCER_RNA/Workshop/Peak_Calls/mES_H3K27ac_wctrl_peaks.bed -a mES_CTCF_wctrl_peaks.inINanchors.bed >| temp
#intersectBed -v -a temp -b mm9.refseq.02_01_2017.4kbproms.gff | awk -F\\t '{printf ("%s\t%s\t\t%8d\t%8d\t\t.\t\t%s\n", $1,$4, (($3-$2)/2)+($2-2000), (($3-$2)/2)+($2+2000), $4)}' > mES_CTCF_wctrl_peaks.inINanchors.nonEP.4kb.gff
#intersectBed -v -a temp -b mm9.refseq.02_01_2017.4kbproms.gff > mES_CTCF_wctrl_peaks.inINanchors.nonEP.gff
#rm -f temp.bed

#mm9 chrom size
#cp /nfs/genomes/mouse_gp_jul_07/anno/chromInfo.txt chromInfo.txt

##yy1 
cp /nfs/young_ata4/ENHANCER_RNA/Workshop/Peak_Calls/mES_YY1_abChIP_wctrl_1e-9_peaks.bed mES_YY1_abChIP_wctrl_1e-9_peaks.bed