In [3]:
import sys
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import pandas as pd
import glob 
import math
import re
from collections import defaultdict
from matplotlib.font_manager import FontProperties
import subprocess
import scipy
from scipy.stats.stats import pearsonr 
%matplotlib inline
matplotlib.rcParams['savefig.dpi'] = 3 * matplotlib.rcParams['savefig.dpi']

In [11]:
# Get raw data
global outfilepath
sampleName='ddx3mut'
outfilepath=os.getcwd() + '/results/%s/'%sampleName
print outfilepath


/arrayAhome/lmartin/CLIP/results/ddx3mut/

In [15]:
def modifyName(filepath,newTag):
	# Useage: Modifies the filepath name. 
	# Input: File path of format <path>/<name>.fastq and a string to add to the name.
	# Output: Returns the modified path of type <old path>_<new modifier>.fastq
    head, tail = os.path.split(filepath)
    oldname = tail.split('.')[0]
    newName = head+"/"+oldname+"_"+newTag
    return newName

def runBowtie(fastqFiles):
    # Useage: Short read mapping to reference (hg19).
    # Input: Fastq files of replicates (not paired end).
    # Output: Path to samfile for each read.
    program = 'bowtie2'
    mappedReads=[]
    unMappedReads=[]
    print "Performing Bowtie..."
    # Parameters
    k='1'
    # In -k mode, Bowtie 2 searches for up to N distinct, valid alignments for each read.
    # N equals the integer specified with the -k parameter. 
    # That is, if -k 2 is specified, Bowtie 2 will search for at most 2 distinct alignments
    # See http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml
    for infastq in fastqFiles:
        print infastq

        index = os.getcwd() + '/docs/viralrepeat/retroViral'
        outfile = modifyName(infastq,"mappedToRetroviral.sam")
        unmapped = modifyName(infastq,"notMappedToRetroviral.fastq")
        
        print "Input file:"
        print infastq 
        print 'Genome index:'
        print index
        print "Output file (mapped):"
        print outfile
        print "Output file (unmapped)"
        print unmapped
        proc = subprocess.Popen([program,'-x',index,'-k',k,'-U',infastq,'--un',unmapped,'-S',outfile],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
        out, err = proc.communicate()
        result = out.decode()
        error = err.decode()
        print "Result : ",result 
        print "Error : ",error
        mappedReads = mappedReads + [outfile]
        unMappedReads = unMappedReads + [unmapped]
    return (mappedReads,unMappedReads)

# Run Bowtie
print "Try"  
readsForRetroviralmapping=glob.glob(outfilepath+"*_notMappedTorepeat.fastq")
if readsForRetroviralmapping == []:
    readsForRetroviralmapping=glob.glob(outfilepath+"*_notMappedToRepeat.fastq") # Older file name convention
print readsForRetroviralmapping
mappedReads,unmappedReads=runBowtie(readsForRetroviralmapping)


Try
[]
Performing Bowtie...

In [17]:
print glob.glob(outfilepath+"*_notMappedTorepeat.fastq")


[]

In [6]:
def runSamtools(samfiles):
    # Useage: Samfile processing.
    # Input: Sam files from Bowtie mapping.
    # Output: Duplicate removed, sorted bedFiles.
    program = 'samtools'
    program2 = 'bamToBed'
    outBedFiles=[]
    
    for samfile in samfiles:
            
        # Convert to bamfile
        bamfile = samfile.replace('.sam', '.bam')  
        proc = subprocess.Popen( [program, 'view', '-bS', '-o', bamfile, samfile])
        proc.communicate()
        
        # Sort the bamfile and note that samtools sort adds the .bam handle
        bamfile_sort = bamfile.replace('.bam', '_sorted') 
        proc2 = subprocess.Popen( [program, 'sort', bamfile, bamfile_sort])
        proc2.communicate()
        
        # Convert to bedFile
        bedFile = bamfile_sort.replace('_sorted', '_withDupes.bed')
        outfh = open(bedFile, 'w')
        proc3 = subprocess.Popen( [program2,'-i', bamfile_sort+'.bam'],stdout=outfh)
        proc3.communicate()
        
        outBedFiles=outBedFiles+[bedFile]
        
    return outBedFiles

# Run Samtools
print "Process mapped data"  
mappedReads=glob.glob(outfilepath+"*mappedToRetroviral.sam")
mappedBedFiles=runSamtools(mappedReads)


Process mapped data

In [7]:
def makeRepeatAnnotation():
    
    # Repeat index sequence 
    repeatGenomeBuild=os.getcwd() + '/docs/viralrepeat/Hs_viralRepeat.fa'
    repeat_genome=np.genfromtxt(repeatGenomeBuild,dtype='string')
    repeat_genome_bases=repeat_genome[1]
    repeat_genome_size=len(repeat_genome[1])
    # Repeat index positions
    repeatAnnotation=os.getcwd() + '/docs/viralrepeat/Hs_viralindex_positions.txt'
    repeatAnnotDF=pd.DataFrame(pd.read_table(repeatAnnotation,header=None))
    repeatAnnotDF.columns=['Name','Length','IndexStart','IndexEnd']
    # Python list extraction is not end index inclusive; to extract sequence, use end + 1.
    repeatAnnotDF['End_for_extraction']=repeatAnnotDF['IndexEnd']+1 
    repeatAnnotDF=repeatAnnotDF.set_index('Name',drop=False)
    return (repeat_genome_bases,repeatAnnotDF)

# - Repeat annotation - 
repeat_genome_bases,repeatAnnotDF=makeRepeatAnnotation()

In [8]:
# - Get mapped bedfiles - 
def readBed(path):
    bedFile = pd.read_table(path,dtype=str,header=None)
    bedFile.columns=['Index','Start','Stop','Name','QS','Strand']
    bedFile['Start']=bedFile['Start'].astype(int)
    return bedFile

mappedBed=glob.glob(outfilepath+"*mappedToRetroviral_withDupes.bed")
bedR1=readBed(mappedBed[0])
bedR2=readBed(mappedBed[1])


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-8-844d1c99b4db> in <module>()
      7 
      8 mappedBed=glob.glob(outfilepath+"*mappedToRetroviral_withDupes.bed")
----> 9 bedR1=readBed(mappedBed[0])
     10 bedR2=readBed(mappedBed[1])

IndexError: list index out of range

In [203]:
# - Record gene position of RT stop - 
recordHits=pd.DataFrame()
for ix in repeatAnnotDF.index:
    end=repeatAnnotDF.loc[ix,'IndexEnd']
    repName=repeatAnnotDF.loc[ix,'Name']
    hits_r1=bedR1[(bedR1['Start']<int(repeatAnnotDF.loc[ix,'IndexEnd'])) & (bedR1['Start']>int(repeatAnnotDF.loc[ix,'IndexStart']))].shape[0]
    hits_r2=bedR2[(bedR2['Start']<int(repeatAnnotDF.loc[ix,'IndexEnd'])) & (bedR2['Start']>int(repeatAnnotDF.loc[ix,'IndexStart']))].shape[0]
    recordHits.loc[repName,'hits_r1']=hits_r1
    recordHits.loc[repName,'hits_r2']=hits_r2
recordHits['sum']=recordHits['hits_r1']+recordHits['hits_r2']
recordHits.fillna(0,inplace=True)
recordHits.sort(['sum'],inplace=True,ascending=False)

In [204]:
# - Evaluate consistency between replicates - 
plt.subplot(2,2,1)
plt.scatter(recordHits['hits_r1']/bedR1.shape[0],recordHits['hits_r2']/bedR2.shape[0])
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.xlabel('Normalized Hits / repeat RNA (Rep1)',fontsize=5)
plt.ylabel('Normalized Hits / repeat RNA (Rep2)',fontsize=5)

plt.subplot(2,2,2)
plt.scatter(np.log10(recordHits['hits_r1']),np.log10(recordHits['hits_r2']))
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.xlabel('Log10 (Raw hits / repeat RNA, Rep1)',fontsize=5)
plt.ylabel('Log10 (Raw hits / repeat RNA, Rep2)',fontsize=5)


Out[204]:
<matplotlib.text.Text at 0x2aaacc10f750>

In [205]:
grZero=recordHits[recordHits['sum']>0]
print grZero.shape
grZero


(472, 3)
Out[205]:
hits_r1 hits_r2 sum
LTR5_Hs_ERV2_Primates 197151 386172 583323
LTR5_ERV2_Homo_sapiens 57040 116121 173161
LTR5B_ERV2_Homo_sapiens 9948 17292 27240
SVA_A_SINE_Homo_sapiens 8350 13221 21571
LTR5A_ERV2_Primates 7387 12042 19429
HERV49I_ERV1_Homo_sapiens 4731 6210 10941
HERVH_ERV1_Eutheria 2005 3971 5976
HERVKC4_ERV2_Eutheria 2551 3323 5874
L1HS_L1_Homo_sapiens 1288 2897 4185
HERVK_ERV2_Primates 773 3095 3868
L1_L1_Homo_sapiens 954 2085 3039
ALU_SINE1/7SL_Primates 1121 1373 2494
L1PREC1_L1_Homo_sapiens 308 700 1008
MER22_MER22_Homo_sapiens 380 475 855
LTR7Y_ERV3_Primates 157 419 576
LTR12C_ERV1_Homo_sapiens 186 337 523
LTR7A_ERV3_Homo_sapiens 143 375 518
L1PA4_L1_Homo_sapiens 143 357 500
L1PA3_L1_Homo_sapiens 130 340 470
LTR13_ERV2_Homo_sapiens 202 263 465
MER11A_ERV2_Homo_sapiens 187 243 430
L1PA5_L1_Homo_sapiens 110 295 405
LTR26_ERV1_Homo_sapiens 165 233 398
L1PA2_L1_Homo_sapiens 109 264 373
MER11C_ERV2_Homo_sapiens 115 226 341
L1PB2c_L1_Homo_sapiens 118 215 333
L1PA6_L1_Homo_sapiens 85 241 326
LTR12E_ERV1_Homo_sapiens 142 163 305
L1PREC2_L1_Homo_sapiens 94 197 291
L1PA7_5_L1_Homo_sapiens 105 160 265
LTR7B_ERV3_Homo_sapiens 76 182 258
THE1_I_LTR_Retrotransposon_Homo_sapiens 84 174 258
ALR_SAT_Homo_sapiens 64 158 222
HERV9_ERV1_Homo_sapiens 66 155 221
L1PA7_L1_Homo_sapiens 57 151 208
HERVL_ERV3_Homo_sapiens 56 142 198
MER9_ERV2_Homo_sapiens 54 140 194
L1PA8_L1_Homo_sapiens 48 125 173
ALR1_SAT_Homo_sapiens 26 139 165
SVA2_MSAT_Eutheria 99 64 163
HERV-K14I_ERV2_Homo_sapiens 47 87 134
MER11B_ERV2_Primates 48 81 129
ALRb_SAT_Primates 25 93 118
LTR17_ERV1_Homo_sapiens 39 77 116
HERV17_ERV1_Homo_sapiens 39 74 113
TIGGER1_Mariner/Tc1_Homo_sapiens 33 77 110
UHG_snRNA_Homo_sapiens 28 78 106
L1P_MA2_L1_Eutheria 31 68 99
HARLEQUIN_ERV1_Homo_sapiens 25 72 97
L1PA10_L1_Homo_sapiens 25 68 93
HERVK9I_ERV2_Homo_sapiens 23 67 90
LTR14C_ERV2_Homo_sapiens 41 44 85
LTR14B_ERV2_Homo_sapiens 31 53 84
LTR12D_ERV1_Homo_sapiens 29 51 80
L1PB1_L1_Homo_sapiens 25 52 77
THE1B_ERV3_Hominidae 25 47 72
LTR25-int_ERV1_Primates 28 36 64
HERVG25_Endogenous_Retrovirus_Homo_sapiens 21 42 63
THE1A_ERV3_Homo_sapiens 17 46 63
L1PBA_5_L1_Homo_sapiens 17 46 63
... ... ...

472 rows × 3 columns


In [196]:
# - Evaluate - 
### RANK (0 to N) of REPEAT RNA FROM ABOVE LIST 
rank=0
####

repName=grZero.index[rank]
# Hits
hits_r1=bedR1[(bedR1['Start']<int(repeatAnnotDF.loc[repName,'IndexEnd'])) & (bedR1['Start']>int(repeatAnnotDF.loc[repName,'IndexStart']))]
hits_r2=bedR2[(bedR2['Start']<int(repeatAnnotDF.loc[repName,'IndexEnd'])) & (bedR2['Start']>int(repeatAnnotDF.loc[repName,'IndexStart']))]
# Histogram
binSize=1
bins=range(repeatAnnotDF.loc[repName,'IndexStart'],repeatAnnotDF.loc[repName,'IndexEnd']+2,binSize) # Make sure bins are end coordinate inclusive
histr1,bins=np.histogram(hits_r1,bins=bins)
histr2,bins=np.histogram(hits_r2,bins=bins)
width=0.7*(bins[1]-bins[0])
center=(bins[:-1] + bins[1:])/2
plt.bar(np.array(range(0,histr1.shape[0])),np.array(list(histr1)),width=1,align='center',color='blue',alpha=0.45)
plt.bar(np.array(range(0,histr1.shape[0])),np.array(list(histr2)),width=1,align='center',color='red',alpha=0.45)
plt.xlim(0,bins.shape[0])
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.title('Coverage histogram for %s'%repName,fontsize=10)
plt.xlabel('RNA position',fontsize=5)
plt.ylabel('# of reads per base',fontsize=5)

'''
# Save data
record=pd.DataFrame()
pos=range(0,bins[-1]-bins[0])
record['Base']=pos
record['R1_count']=histr1
record['R2_count']=histr2
pathToSave=outfilepath + '%s_coverage'%repName
record.to_csv(pathToSave)
'''


Out[196]:
"\n# Save data\nrecord=pd.DataFrame()\npos=range(0,bins[-1]-bins[0])\nrecord['Base']=pos\nrecord['R1_count']=histr1\nrecord['R2_count']=histr2\npathToSave=outfilepath + '%s_coverage'%repName\nrecord.to_csv(pathToSave)\n"

In [ ]: