In [3]:
import sys
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import glob
import math
import re
from collections import defaultdict
from matplotlib.font_manager import FontProperties
import subprocess
import scipy
from scipy.stats.stats import pearsonr
%matplotlib inline
matplotlib.rcParams['savefig.dpi'] = 3 * matplotlib.rcParams['savefig.dpi']
In [11]:
# Get raw data
global outfilepath
sampleName='ddx3mut'
outfilepath=os.getcwd() + '/results/%s/'%sampleName
print outfilepath
In [15]:
def modifyName(filepath,newTag):
# Useage: Modifies the filepath name.
# Input: File path of format <path>/<name>.fastq and a string to add to the name.
# Output: Returns the modified path of type <old path>_<new modifier>.fastq
head, tail = os.path.split(filepath)
oldname = tail.split('.')[0]
newName = head+"/"+oldname+"_"+newTag
return newName
def runBowtie(fastqFiles):
# Useage: Short read mapping to reference (hg19).
# Input: Fastq files of replicates (not paired end).
# Output: Path to samfile for each read.
program = 'bowtie2'
mappedReads=[]
unMappedReads=[]
print "Performing Bowtie..."
# Parameters
k='1'
# In -k mode, Bowtie 2 searches for up to N distinct, valid alignments for each read.
# N equals the integer specified with the -k parameter.
# That is, if -k 2 is specified, Bowtie 2 will search for at most 2 distinct alignments
# See http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml
for infastq in fastqFiles:
print infastq
index = os.getcwd() + '/docs/viralrepeat/retroViral'
outfile = modifyName(infastq,"mappedToRetroviral.sam")
unmapped = modifyName(infastq,"notMappedToRetroviral.fastq")
print "Input file:"
print infastq
print 'Genome index:'
print index
print "Output file (mapped):"
print outfile
print "Output file (unmapped)"
print unmapped
proc = subprocess.Popen([program,'-x',index,'-k',k,'-U',infastq,'--un',unmapped,'-S',outfile],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
out, err = proc.communicate()
result = out.decode()
error = err.decode()
print "Result : ",result
print "Error : ",error
mappedReads = mappedReads + [outfile]
unMappedReads = unMappedReads + [unmapped]
return (mappedReads,unMappedReads)
# Run Bowtie
print "Try"
readsForRetroviralmapping=glob.glob(outfilepath+"*_notMappedTorepeat.fastq")
if readsForRetroviralmapping == []:
readsForRetroviralmapping=glob.glob(outfilepath+"*_notMappedToRepeat.fastq") # Older file name convention
print readsForRetroviralmapping
mappedReads,unmappedReads=runBowtie(readsForRetroviralmapping)
In [17]:
print glob.glob(outfilepath+"*_notMappedTorepeat.fastq")
In [6]:
def runSamtools(samfiles):
# Useage: Samfile processing.
# Input: Sam files from Bowtie mapping.
# Output: Duplicate removed, sorted bedFiles.
program = 'samtools'
program2 = 'bamToBed'
outBedFiles=[]
for samfile in samfiles:
# Convert to bamfile
bamfile = samfile.replace('.sam', '.bam')
proc = subprocess.Popen( [program, 'view', '-bS', '-o', bamfile, samfile])
proc.communicate()
# Sort the bamfile and note that samtools sort adds the .bam handle
bamfile_sort = bamfile.replace('.bam', '_sorted')
proc2 = subprocess.Popen( [program, 'sort', bamfile, bamfile_sort])
proc2.communicate()
# Convert to bedFile
bedFile = bamfile_sort.replace('_sorted', '_withDupes.bed')
outfh = open(bedFile, 'w')
proc3 = subprocess.Popen( [program2,'-i', bamfile_sort+'.bam'],stdout=outfh)
proc3.communicate()
outBedFiles=outBedFiles+[bedFile]
return outBedFiles
# Run Samtools
print "Process mapped data"
mappedReads=glob.glob(outfilepath+"*mappedToRetroviral.sam")
mappedBedFiles=runSamtools(mappedReads)
In [7]:
def makeRepeatAnnotation():
# Repeat index sequence
repeatGenomeBuild=os.getcwd() + '/docs/viralrepeat/Hs_viralRepeat.fa'
repeat_genome=np.genfromtxt(repeatGenomeBuild,dtype='string')
repeat_genome_bases=repeat_genome[1]
repeat_genome_size=len(repeat_genome[1])
# Repeat index positions
repeatAnnotation=os.getcwd() + '/docs/viralrepeat/Hs_viralindex_positions.txt'
repeatAnnotDF=pd.DataFrame(pd.read_table(repeatAnnotation,header=None))
repeatAnnotDF.columns=['Name','Length','IndexStart','IndexEnd']
# Python list extraction is not end index inclusive; to extract sequence, use end + 1.
repeatAnnotDF['End_for_extraction']=repeatAnnotDF['IndexEnd']+1
repeatAnnotDF=repeatAnnotDF.set_index('Name',drop=False)
return (repeat_genome_bases,repeatAnnotDF)
# - Repeat annotation -
repeat_genome_bases,repeatAnnotDF=makeRepeatAnnotation()
In [8]:
# - Get mapped bedfiles -
def readBed(path):
bedFile = pd.read_table(path,dtype=str,header=None)
bedFile.columns=['Index','Start','Stop','Name','QS','Strand']
bedFile['Start']=bedFile['Start'].astype(int)
return bedFile
mappedBed=glob.glob(outfilepath+"*mappedToRetroviral_withDupes.bed")
bedR1=readBed(mappedBed[0])
bedR2=readBed(mappedBed[1])
In [203]:
# - Record gene position of RT stop -
recordHits=pd.DataFrame()
for ix in repeatAnnotDF.index:
end=repeatAnnotDF.loc[ix,'IndexEnd']
repName=repeatAnnotDF.loc[ix,'Name']
hits_r1=bedR1[(bedR1['Start']<int(repeatAnnotDF.loc[ix,'IndexEnd'])) & (bedR1['Start']>int(repeatAnnotDF.loc[ix,'IndexStart']))].shape[0]
hits_r2=bedR2[(bedR2['Start']<int(repeatAnnotDF.loc[ix,'IndexEnd'])) & (bedR2['Start']>int(repeatAnnotDF.loc[ix,'IndexStart']))].shape[0]
recordHits.loc[repName,'hits_r1']=hits_r1
recordHits.loc[repName,'hits_r2']=hits_r2
recordHits['sum']=recordHits['hits_r1']+recordHits['hits_r2']
recordHits.fillna(0,inplace=True)
recordHits.sort(['sum'],inplace=True,ascending=False)
In [204]:
# - Evaluate consistency between replicates -
plt.subplot(2,2,1)
plt.scatter(recordHits['hits_r1']/bedR1.shape[0],recordHits['hits_r2']/bedR2.shape[0])
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.xlabel('Normalized Hits / repeat RNA (Rep1)',fontsize=5)
plt.ylabel('Normalized Hits / repeat RNA (Rep2)',fontsize=5)
plt.subplot(2,2,2)
plt.scatter(np.log10(recordHits['hits_r1']),np.log10(recordHits['hits_r2']))
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.xlabel('Log10 (Raw hits / repeat RNA, Rep1)',fontsize=5)
plt.ylabel('Log10 (Raw hits / repeat RNA, Rep2)',fontsize=5)
Out[204]:
In [205]:
grZero=recordHits[recordHits['sum']>0]
print grZero.shape
grZero
Out[205]:
In [196]:
# - Evaluate -
### RANK (0 to N) of REPEAT RNA FROM ABOVE LIST
rank=0
####
repName=grZero.index[rank]
# Hits
hits_r1=bedR1[(bedR1['Start']<int(repeatAnnotDF.loc[repName,'IndexEnd'])) & (bedR1['Start']>int(repeatAnnotDF.loc[repName,'IndexStart']))]
hits_r2=bedR2[(bedR2['Start']<int(repeatAnnotDF.loc[repName,'IndexEnd'])) & (bedR2['Start']>int(repeatAnnotDF.loc[repName,'IndexStart']))]
# Histogram
binSize=1
bins=range(repeatAnnotDF.loc[repName,'IndexStart'],repeatAnnotDF.loc[repName,'IndexEnd']+2,binSize) # Make sure bins are end coordinate inclusive
histr1,bins=np.histogram(hits_r1,bins=bins)
histr2,bins=np.histogram(hits_r2,bins=bins)
width=0.7*(bins[1]-bins[0])
center=(bins[:-1] + bins[1:])/2
plt.bar(np.array(range(0,histr1.shape[0])),np.array(list(histr1)),width=1,align='center',color='blue',alpha=0.45)
plt.bar(np.array(range(0,histr1.shape[0])),np.array(list(histr2)),width=1,align='center',color='red',alpha=0.45)
plt.xlim(0,bins.shape[0])
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.title('Coverage histogram for %s'%repName,fontsize=10)
plt.xlabel('RNA position',fontsize=5)
plt.ylabel('# of reads per base',fontsize=5)
'''
# Save data
record=pd.DataFrame()
pos=range(0,bins[-1]-bins[0])
record['Base']=pos
record['R1_count']=histr1
record['R2_count']=histr2
pathToSave=outfilepath + '%s_coverage'%repName
record.to_csv(pathToSave)
'''
Out[196]:
In [ ]: