notebook.community

Edit and run



In [3]:

    
import sys
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import pandas as pd
import glob 
import math
import re
from collections import defaultdict
from matplotlib.font_manager import FontProperties
import subprocess
import scipy
from scipy.stats.stats import pearsonr 
%matplotlib inline
matplotlib.rcParams['savefig.dpi'] = 3 * matplotlib.rcParams['savefig.dpi']



In [11]:

    
# Get raw data
global outfilepath
sampleName='ddx3mut'
outfilepath=os.getcwd() + '/results/%s/'%sampleName
print outfilepath









    



/arrayAhome/lmartin/CLIP/results/ddx3mut/



In [15]:

    
def modifyName(filepath,newTag):
	# Useage: Modifies the filepath name. 
	# Input: File path of format <path>/<name>.fastq and a string to add to the name.
	# Output: Returns the modified path of type <old path>_<new modifier>.fastq
    head, tail = os.path.split(filepath)
    oldname = tail.split('.')[0]
    newName = head+"/"+oldname+"_"+newTag
    return newName

def runBowtie(fastqFiles):
    # Useage: Short read mapping to reference (hg19).
    # Input: Fastq files of replicates (not paired end).
    # Output: Path to samfile for each read.
    program = 'bowtie2'
    mappedReads=[]
    unMappedReads=[]
    print "Performing Bowtie..."
    # Parameters
    k='1'
    # In -k mode, Bowtie 2 searches for up to N distinct, valid alignments for each read.
    # N equals the integer specified with the -k parameter. 
    # That is, if -k 2 is specified, Bowtie 2 will search for at most 2 distinct alignments
    # See http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml
    for infastq in fastqFiles:
        print infastq

        index = os.getcwd() + '/docs/viralrepeat/retroViral'
        outfile = modifyName(infastq,"mappedToRetroviral.sam")
        unmapped = modifyName(infastq,"notMappedToRetroviral.fastq")
        
        print "Input file:"
        print infastq 
        print 'Genome index:'
        print index
        print "Output file (mapped):"
        print outfile
        print "Output file (unmapped)"
        print unmapped
        proc = subprocess.Popen([program,'-x',index,'-k',k,'-U',infastq,'--un',unmapped,'-S',outfile],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
        out, err = proc.communicate()
        result = out.decode()
        error = err.decode()
        print "Result : ",result 
        print "Error : ",error
        mappedReads = mappedReads + [outfile]
        unMappedReads = unMappedReads + [unmapped]
    return (mappedReads,unMappedReads)

# Run Bowtie
print "Try"  
readsForRetroviralmapping=glob.glob(outfilepath+"*_notMappedTorepeat.fastq")
if readsForRetroviralmapping == []:
    readsForRetroviralmapping=glob.glob(outfilepath+"*_notMappedToRepeat.fastq") # Older file name convention
print readsForRetroviralmapping
mappedReads,unmappedReads=runBowtie(readsForRetroviralmapping)









    



Try
[]
Performing Bowtie...



In [17]:

    
print glob.glob(outfilepath+"*_notMappedTorepeat.fastq")

[]



In [6]:

    
def runSamtools(samfiles):
    # Useage: Samfile processing.
    # Input: Sam files from Bowtie mapping.
    # Output: Duplicate removed, sorted bedFiles.
    program = 'samtools'
    program2 = 'bamToBed'
    outBedFiles=[]
    
    for samfile in samfiles:
            
        # Convert to bamfile
        bamfile = samfile.replace('.sam', '.bam')  
        proc = subprocess.Popen( [program, 'view', '-bS', '-o', bamfile, samfile])
        proc.communicate()
        
        # Sort the bamfile and note that samtools sort adds the .bam handle
        bamfile_sort = bamfile.replace('.bam', '_sorted') 
        proc2 = subprocess.Popen( [program, 'sort', bamfile, bamfile_sort])
        proc2.communicate()
        
        # Convert to bedFile
        bedFile = bamfile_sort.replace('_sorted', '_withDupes.bed')
        outfh = open(bedFile, 'w')
        proc3 = subprocess.Popen( [program2,'-i', bamfile_sort+'.bam'],stdout=outfh)
        proc3.communicate()
        
        outBedFiles=outBedFiles+[bedFile]
        
    return outBedFiles

# Run Samtools
print "Process mapped data"  
mappedReads=glob.glob(outfilepath+"*mappedToRetroviral.sam")
mappedBedFiles=runSamtools(mappedReads)









    



Process mapped data



In [7]:

    
def makeRepeatAnnotation():
    
    # Repeat index sequence 
    repeatGenomeBuild=os.getcwd() + '/docs/viralrepeat/Hs_viralRepeat.fa'
    repeat_genome=np.genfromtxt(repeatGenomeBuild,dtype='string')
    repeat_genome_bases=repeat_genome[1]
    repeat_genome_size=len(repeat_genome[1])
    # Repeat index positions
    repeatAnnotation=os.getcwd() + '/docs/viralrepeat/Hs_viralindex_positions.txt'
    repeatAnnotDF=pd.DataFrame(pd.read_table(repeatAnnotation,header=None))
    repeatAnnotDF.columns=['Name','Length','IndexStart','IndexEnd']
    # Python list extraction is not end index inclusive; to extract sequence, use end + 1.
    repeatAnnotDF['End_for_extraction']=repeatAnnotDF['IndexEnd']+1 
    repeatAnnotDF=repeatAnnotDF.set_index('Name',drop=False)
    return (repeat_genome_bases,repeatAnnotDF)

# - Repeat annotation - 
repeat_genome_bases,repeatAnnotDF=makeRepeatAnnotation()



In [8]:

    
# - Get mapped bedfiles - 
def readBed(path):
    bedFile = pd.read_table(path,dtype=str,header=None)
    bedFile.columns=['Index','Start','Stop','Name','QS','Strand']
    bedFile['Start']=bedFile['Start'].astype(int)
    return bedFile

mappedBed=glob.glob(outfilepath+"*mappedToRetroviral_withDupes.bed")
bedR1=readBed(mappedBed[0])
bedR2=readBed(mappedBed[1])









    



---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-8-844d1c99b4db> in <module>()
      7 
      8 mappedBed=glob.glob(outfilepath+"*mappedToRetroviral_withDupes.bed")
----> 9 bedR1=readBed(mappedBed[0])
     10 bedR2=readBed(mappedBed[1])

IndexError: list index out of range



In [203]:

    
# - Record gene position of RT stop - 
recordHits=pd.DataFrame()
for ix in repeatAnnotDF.index:
    end=repeatAnnotDF.loc[ix,'IndexEnd']
    repName=repeatAnnotDF.loc[ix,'Name']
    hits_r1=bedR1[(bedR1['Start']<int(repeatAnnotDF.loc[ix,'IndexEnd'])) & (bedR1['Start']>int(repeatAnnotDF.loc[ix,'IndexStart']))].shape[0]
    hits_r2=bedR2[(bedR2['Start']<int(repeatAnnotDF.loc[ix,'IndexEnd'])) & (bedR2['Start']>int(repeatAnnotDF.loc[ix,'IndexStart']))].shape[0]
    recordHits.loc[repName,'hits_r1']=hits_r1
    recordHits.loc[repName,'hits_r2']=hits_r2
recordHits['sum']=recordHits['hits_r1']+recordHits['hits_r2']
recordHits.fillna(0,inplace=True)
recordHits.sort(['sum'],inplace=True,ascending=False)



In [204]:

    
# - Evaluate consistency between replicates - 
plt.subplot(2,2,1)
plt.scatter(recordHits['hits_r1']/bedR1.shape[0],recordHits['hits_r2']/bedR2.shape[0])
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.xlabel('Normalized Hits / repeat RNA (Rep1)',fontsize=5)
plt.ylabel('Normalized Hits / repeat RNA (Rep2)',fontsize=5)

plt.subplot(2,2,2)
plt.scatter(np.log10(recordHits['hits_r1']),np.log10(recordHits['hits_r2']))
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.xlabel('Log10 (Raw hits / repeat RNA, Rep1)',fontsize=5)
plt.ylabel('Log10 (Raw hits / repeat RNA, Rep2)',fontsize=5)









    Out[204]:





<matplotlib.text.Text at 0x2aaacc10f750>



In [205]:

    
grZero=recordHits[recordHits['sum']>0]
print grZero.shape
grZero









    



(472, 3)






    Out[205]:






  
    
      
      hits_r1
      hits_r2
      sum
    
  
  
    
      LTR5_Hs_ERV2_Primates
       197151
       386172
       583323
    
    
      LTR5_ERV2_Homo_sapiens
        57040
       116121
       173161
    
    
      LTR5B_ERV2_Homo_sapiens
         9948
        17292
        27240
    
    
      SVA_A_SINE_Homo_sapiens
         8350
        13221
        21571
    
    
      LTR5A_ERV2_Primates
         7387
        12042
        19429
    
    
      HERV49I_ERV1_Homo_sapiens
         4731
         6210
        10941
    
    
      HERVH_ERV1_Eutheria
         2005
         3971
         5976
    
    
      HERVKC4_ERV2_Eutheria
         2551
         3323
         5874
    
    
      L1HS_L1_Homo_sapiens
         1288
         2897
         4185
    
    
      HERVK_ERV2_Primates
          773
         3095
         3868
    
    
      L1_L1_Homo_sapiens
          954
         2085
         3039
    
    
      ALU_SINE1/7SL_Primates
         1121
         1373
         2494
    
    
      L1PREC1_L1_Homo_sapiens
          308
          700
         1008
    
    
      MER22_MER22_Homo_sapiens
          380
          475
          855
    
    
      LTR7Y_ERV3_Primates
          157
          419
          576
    
    
      LTR12C_ERV1_Homo_sapiens
          186
          337
          523
    
    
      LTR7A_ERV3_Homo_sapiens
          143
          375
          518
    
    
      L1PA4_L1_Homo_sapiens
          143
          357
          500
    
    
      L1PA3_L1_Homo_sapiens
          130
          340
          470
    
    
      LTR13_ERV2_Homo_sapiens
          202
          263
          465
    
    
      MER11A_ERV2_Homo_sapiens
          187
          243
          430
    
    
      L1PA5_L1_Homo_sapiens
          110
          295
          405
    
    
      LTR26_ERV1_Homo_sapiens
          165
          233
          398
    
    
      L1PA2_L1_Homo_sapiens
          109
          264
          373
    
    
      MER11C_ERV2_Homo_sapiens
          115
          226
          341
    
    
      L1PB2c_L1_Homo_sapiens
          118
          215
          333
    
    
      L1PA6_L1_Homo_sapiens
           85
          241
          326
    
    
      LTR12E_ERV1_Homo_sapiens
          142
          163
          305
    
    
      L1PREC2_L1_Homo_sapiens
           94
          197
          291
    
    
      L1PA7_5_L1_Homo_sapiens
          105
          160
          265
    
    
      LTR7B_ERV3_Homo_sapiens
           76
          182
          258
    
    
      THE1_I_LTR_Retrotransposon_Homo_sapiens
           84
          174
          258
    
    
      ALR_SAT_Homo_sapiens
           64
          158
          222
    
    
      HERV9_ERV1_Homo_sapiens
           66
          155
          221
    
    
      L1PA7_L1_Homo_sapiens
           57
          151
          208
    
    
      HERVL_ERV3_Homo_sapiens
           56
          142
          198
    
    
      MER9_ERV2_Homo_sapiens
           54
          140
          194
    
    
      L1PA8_L1_Homo_sapiens
           48
          125
          173
    
    
      ALR1_SAT_Homo_sapiens
           26
          139
          165
    
    
      SVA2_MSAT_Eutheria
           99
           64
          163
    
    
      HERV-K14I_ERV2_Homo_sapiens
           47
           87
          134
    
    
      MER11B_ERV2_Primates
           48
           81
          129
    
    
      ALRb_SAT_Primates
           25
           93
          118
    
    
      LTR17_ERV1_Homo_sapiens
           39
           77
          116
    
    
      HERV17_ERV1_Homo_sapiens
           39
           74
          113
    
    
      TIGGER1_Mariner/Tc1_Homo_sapiens
           33
           77
          110
    
    
      UHG_snRNA_Homo_sapiens
           28
           78
          106
    
    
      L1P_MA2_L1_Eutheria
           31
           68
           99
    
    
      HARLEQUIN_ERV1_Homo_sapiens
           25
           72
           97
    
    
      L1PA10_L1_Homo_sapiens
           25
           68
           93
    
    
      HERVK9I_ERV2_Homo_sapiens
           23
           67
           90
    
    
      LTR14C_ERV2_Homo_sapiens
           41
           44
           85
    
    
      LTR14B_ERV2_Homo_sapiens
           31
           53
           84
    
    
      LTR12D_ERV1_Homo_sapiens
           29
           51
           80
    
    
      L1PB1_L1_Homo_sapiens
           25
           52
           77
    
    
      THE1B_ERV3_Hominidae
           25
           47
           72
    
    
      LTR25-int_ERV1_Primates
           28
           36
           64
    
    
      HERVG25_Endogenous_Retrovirus_Homo_sapiens
           21
           42
           63
    
    
      THE1A_ERV3_Homo_sapiens
           17
           46
           63
    
    
      L1PBA_5_L1_Homo_sapiens
           17
           46
           63
    
    
      
      ...
      ...
      ...
    
  

472 rows × 3 columns



In [196]:

    
# - Evaluate - 
### RANK (0 to N) of REPEAT RNA FROM ABOVE LIST 
rank=0
####

repName=grZero.index[rank]
# Hits
hits_r1=bedR1[(bedR1['Start']<int(repeatAnnotDF.loc[repName,'IndexEnd'])) & (bedR1['Start']>int(repeatAnnotDF.loc[repName,'IndexStart']))]
hits_r2=bedR2[(bedR2['Start']<int(repeatAnnotDF.loc[repName,'IndexEnd'])) & (bedR2['Start']>int(repeatAnnotDF.loc[repName,'IndexStart']))]
# Histogram
binSize=1
bins=range(repeatAnnotDF.loc[repName,'IndexStart'],repeatAnnotDF.loc[repName,'IndexEnd']+2,binSize) # Make sure bins are end coordinate inclusive
histr1,bins=np.histogram(hits_r1,bins=bins)
histr2,bins=np.histogram(hits_r2,bins=bins)
width=0.7*(bins[1]-bins[0])
center=(bins[:-1] + bins[1:])/2
plt.bar(np.array(range(0,histr1.shape[0])),np.array(list(histr1)),width=1,align='center',color='blue',alpha=0.45)
plt.bar(np.array(range(0,histr1.shape[0])),np.array(list(histr2)),width=1,align='center',color='red',alpha=0.45)
plt.xlim(0,bins.shape[0])
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.title('Coverage histogram for %s'%repName,fontsize=10)
plt.xlabel('RNA position',fontsize=5)
plt.ylabel('# of reads per base',fontsize=5)

'''
# Save data
record=pd.DataFrame()
pos=range(0,bins[-1]-bins[0])
record['Base']=pos
record['R1_count']=histr1
record['R2_count']=histr2
pathToSave=outfilepath + '%s_coverage'%repName
record.to_csv(pathToSave)
'''









    Out[196]:





"\n# Save data\nrecord=pd.DataFrame()\npos=range(0,bins[-1]-bins[0])\nrecord['Base']=pos\nrecord['R1_count']=histr1\nrecord['R2_count']=histr2\npathToSave=outfilepath + '%s_coverage'%repName\nrecord.to_csv(pathToSave)\n"



In [ ]:

	hits_r1	hits_r2	sum
LTR5_Hs_ERV2_Primates	197151	386172	583323
LTR5_ERV2_Homo_sapiens	57040	116121	173161
LTR5B_ERV2_Homo_sapiens	9948	17292	27240
SVA_A_SINE_Homo_sapiens	8350	13221	21571
LTR5A_ERV2_Primates	7387	12042	19429
HERV49I_ERV1_Homo_sapiens	4731	6210	10941
HERVH_ERV1_Eutheria	2005	3971	5976
HERVKC4_ERV2_Eutheria	2551	3323	5874
L1HS_L1_Homo_sapiens	1288	2897	4185
HERVK_ERV2_Primates	773	3095	3868
L1_L1_Homo_sapiens	954	2085	3039
ALU_SINE1/7SL_Primates	1121	1373	2494
L1PREC1_L1_Homo_sapiens	308	700	1008
MER22_MER22_Homo_sapiens	380	475	855
LTR7Y_ERV3_Primates	157	419	576
LTR12C_ERV1_Homo_sapiens	186	337	523
LTR7A_ERV3_Homo_sapiens	143	375	518
L1PA4_L1_Homo_sapiens	143	357	500
L1PA3_L1_Homo_sapiens	130	340	470
LTR13_ERV2_Homo_sapiens	202	263	465
MER11A_ERV2_Homo_sapiens	187	243	430
L1PA5_L1_Homo_sapiens	110	295	405
LTR26_ERV1_Homo_sapiens	165	233	398
L1PA2_L1_Homo_sapiens	109	264	373
MER11C_ERV2_Homo_sapiens	115	226	341
L1PB2c_L1_Homo_sapiens	118	215	333
L1PA6_L1_Homo_sapiens	85	241	326
LTR12E_ERV1_Homo_sapiens	142	163	305
L1PREC2_L1_Homo_sapiens	94	197	291
L1PA7_5_L1_Homo_sapiens	105	160	265
LTR7B_ERV3_Homo_sapiens	76	182	258
THE1_I_LTR_Retrotransposon_Homo_sapiens	84	174	258
ALR_SAT_Homo_sapiens	64	158	222
HERV9_ERV1_Homo_sapiens	66	155	221
L1PA7_L1_Homo_sapiens	57	151	208
HERVL_ERV3_Homo_sapiens	56	142	198
MER9_ERV2_Homo_sapiens	54	140	194
L1PA8_L1_Homo_sapiens	48	125	173
ALR1_SAT_Homo_sapiens	26	139	165
SVA2_MSAT_Eutheria	99	64	163
HERV-K14I_ERV2_Homo_sapiens	47	87	134
MER11B_ERV2_Primates	48	81	129
ALRb_SAT_Primates	25	93	118
LTR17_ERV1_Homo_sapiens	39	77	116
HERV17_ERV1_Homo_sapiens	39	74	113
TIGGER1_Mariner/Tc1_Homo_sapiens	33	77	110
UHG_snRNA_Homo_sapiens	28	78	106
L1P_MA2_L1_Eutheria	31	68	99
HARLEQUIN_ERV1_Homo_sapiens	25	72	97
L1PA10_L1_Homo_sapiens	25	68	93
HERVK9I_ERV2_Homo_sapiens	23	67	90
LTR14C_ERV2_Homo_sapiens	41	44	85
LTR14B_ERV2_Homo_sapiens	31	53	84
LTR12D_ERV1_Homo_sapiens	29	51	80
L1PB1_L1_Homo_sapiens	25	52	77
THE1B_ERV3_Hominidae	25	47	72
LTR25-int_ERV1_Primates	28	36	64
HERVG25_Endogenous_Retrovirus_Homo_sapiens	21	42	63
THE1A_ERV3_Homo_sapiens	17	46	63
L1PBA_5_L1_Homo_sapiens	17	46	63
	...	...	...