In [2]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import cmath
import math
import glob
import subprocess
%matplotlib inline
import matplotlib
import scipy
from scipy import stats
matplotlib.rcParams['savefig.dpi'] = 2 * matplotlib.rcParams['savefig.dpi']
In [7]:
def getGeneList(name,geneType):
geneListpath=os.getcwd() + '/results/'+name+'/rawdata/PlotData_ReadsPerGene_%s'%geneType
path=os.getcwd() + '/results/'+name+'/rawdata/PlotData_ReadsPerPipeFile'
data=pd.read_csv(path)
data.columns=['Index','FileName','Reads']
data['FileName']=pd.Series(['Raw (R1)','Raw (R2)','3p Trim (R1)','3p Trim (R2)','Filter (R1)','Filter (R2)','No dupes (R1)','No dupes (R2)','RepeatMapped (R1)','RepeatMaped (R2)','Hg19Mapped (R1)','Hg19Mapped (R2)','Blacklist (R1)','Blacklist (R2)','RepeatMask (R1)','RepeatMask (R2)','ClipperIn','ClipperOut'])
data=data.set_index('FileName')
valueForNorm=data.loc['ClipperOut','Reads']
geneListDF=pd.DataFrame(pd.read_table(geneListpath,header=None,delimiter=','))
geneListDF.columns=['GeneName','Count']
geneListDF.index=geneListDF['GeneName']
geneListDF['Normalized']=geneListDF['Count']/valueForNorm
return geneListDF
def mergeGeneLists(df1,df2):
allGenes=list(df1['GeneName'])+list(df2['GeneName'])
alUniqGenes=list(set(allGenes))
mergedDF=pd.DataFrame(index=alUniqGenes)
mergedDF['Counts_1']=df1['Count']
mergedDF['Counts_1_Norm']=df1['Normalized']
mergedDF['Counts_2']=df2['Count']
mergedDF['Counts_2_Norm']=df2['Normalized']
mergedDF.fillna(0,inplace=True)
mergedDF['Sum']=mergedDF['Counts_1']+mergedDF['Counts_2']
mergedDF.sort(['Sum'],ascending=True,inplace=True)
return mergedDF
def makeScatter(target,toCompare,geneType,colors,axisLimit):
target_geneList=getGeneList(target,geneType)
for i in range(0,len(toCompare)):
if i == len(toCompare)-1:
alpha=1
else:
alpha=0.25
background=toCompare[i]
background_geneList=getGeneList(background,geneType)
DF1=mergeGeneLists(target_geneList,background_geneList)
plt.scatter(np.array(DF1['Counts_1_Norm']),np.array(DF1['Counts_2_Norm']),label=toCompare[i],color=colors[i],alpha=0.25)
slope,intercept,r_value,p_value,std_err=scipy.stats.linregress(np.array(DF1['Counts_1_Norm']),np.array(DF1['Counts_2_Norm']))
print toCompare[i]
print "r-squared:", r_value**2
plt.xlim(-0.1*plt.xlim()[1],plt.xlim()[1])
plt.ylim(-0.1*plt.xlim()[1],plt.xlim()[1])
plt.title('Tag count per %s gene across datasets.'%geneType,fontsize=5)
plt.xlabel("Normalized tag count per gene for %s"%target,fontsize=5)
plt.ylabel("Normalized tag count per gene for other datasets.",fontsize=5)
plt.tick_params(axis='xticks',labelsize=5,fontsize=5)
plt.tick_params(axis='yticks',labelsize=5,fontsize=5)
plt.yticks(fontsize=5)
plt.xticks(rotation=90,fontsize=5)
plt.legend(loc='upper left', scatterpoints = 1,prop={'size':5})
plt.grid()
if axisLimit: # Plot limit
plt.xlim(-0.1*axisLimit,axisLimit)
plt.ylim(-0.1*axisLimit,axisLimit)
# Label DF for output.
DF1.columns=['Counts_%s'%target,'Norm_Counts_%s'%target,'Counts_%s'%background,'Norm_Counts_%s'%background,'Sum']
return DF1
In [10]:
fig1=plt.figure(1)
plt.subplot(2,2,1)
# *** INPUT : Dataset (x-axis) against which others will be compared.
target='DDX5e0'
# *** INPUT : Datasets (y-axis) for comparison.
toCompare=['ddx3wtA','ddx3wt','MMhur', 'ddx3mutA']
# *** INPUT : Colors. Must match length of toCompare.
colors=['blue','red','yellow','green']
# *** INPUT : Specify the gene type to compare.
geneType='proteinCoding'
# Specify the axis limit (in case one wants to zoom in).
axisLimit=None
DF1=makeScatter(target,toCompare,geneType,colors,axisLimit)
plt.xlim([-0.001,0.02])
plt.ylim([-0.001,0.02])
plt.tight_layout()
outfilepath=os.getcwd() + '/results/%s/'%target
#fig1.savefig(outfilepath+'S5_proteinMetaanalysis.png',format='png',bbox_inches='tight',dpi=200,pad_inches=0.5)
#fig1.savefig(outfilepath+'S5_proteinMetaanalysis.pdf',format='pdf',bbox_inches='tight',dpi=200,pad_inches=0.5)
In [255]:
from scipy.interpolate import interp1d
# Examine differentially expressed genes
DF1['Delta']=np.abs(DF1['Norm_Counts_hcvwt']-DF1['Norm_Counts_jfh1v2'])
DF1.sort('Delta',inplace=True,ascending=False)
data=np.array(DF1['Delta'])
DF1['Percentile']=[stats.percentileofscore(data, i, kind='strict') for i in data]
In [262]:
target='jfh1v2'
pathToSave=os.getcwd()+'/results/%s/rawdata/'%target+'proteinCoding_diff'
DF1.to_csv(pathToSave)
In [263]:
tmp=DF1[DF1['Counts_jfh1v2']==0]
pathToSave=os.getcwd()+'/results/%s/rawdata/'%target+'proteinCoding_diff_onlyWT'
tmp.to_csv(pathToSave)
In [264]:
tmp=DF1[DF1['Counts_hcvwt']==0]
pathToSave=os.getcwd()+'/results/%s/rawdata/'%target+'proteinCoding_diff_onlyJFH1'
tmp.to_csv(pathToSave)
In [114]:
# *** OPTION TO SAVE DATA : Sort the data by total hits.
# Note that this will apply to the last dataset in toCompare if it contain multiple.
DF1.sort('Sum',inplace=True,ascending=False)
pathToSave=os.getcwd() + '/results/%s/rawdata/'%target + '%s_metaanalysis'%geneType
DF1.to_csv(pathToSave)
In [15]:
def makeRepeatAnnotation(org):
# Repeat index sequence
if org=='hg19': repeatGenomeBuild=os.getcwd() + '/docs/hg19/repeat/repeatRNA.fa'
elif org=='mm9': repeatGenomeBuild=os.getcwd() + '/docs/mm9/repeat/Mm_repeatRNA.fa'
repeat_genome=np.genfromtxt(repeatGenomeBuild,dtype='string')
repeat_genome_bases=repeat_genome[1]
repeat_genome_size=len(repeat_genome[1])
# Repeat index positions
if org=='hg19':repeatAnnotation=os.getcwd() + '/docs/hg19/repeat/Hs_repeatIndex_positions.txt'
else:repeatAnnotation=os.getcwd() + '/docs/mm9/repeat/Mm_repeatIndex_positions.txt'
repeatAnnotDF=pd.DataFrame(pd.read_table(repeatAnnotation,header=None))
repeatAnnotDF.columns=['Name','Length','IndexStart','IndexEnd']
repeatAnnotDF['End_for_extraction']=repeatAnnotDF['IndexEnd']+1
return (repeat_genome_bases,repeatAnnotDF)
def getRepeatRNAData(repeatName,filePaths,binSize,flag):
# Create data frame for storage
storageDF=pd.DataFrame()
# Fill it with histogram data from samples
colNamesForExtraction=[]
for i in range(0,len(filePaths)):
# Get RT file
path=filePaths[i]
# Get repeat data
path_repReads=path+'/rawdata/PlotData_RepeatRNAreads_%s'%repeatName
hits_per_rep=pd.read_csv(path_repReads)
if hits_per_rep.shape[0] > 0:
RTpositions=hits_per_rep['RT_stop']
start=hits_per_rep.loc[0,'Repeat_Start']
end=hits_per_rep.loc[0,'Repeat_End']
# Histogram of RT stops across gene body
bins=range(start,end+2,1)
hist,bins=np.histogram(RTpositions,bins=bins)
width=0.7*(bins[1]-bins[0])
center=(bins[:-1] + bins[1:])/2
histPlot=np.array(hist,dtype=float)
# Read accounting
data=pd.read_csv(path+'/rawdata/PlotData_ReadsPerPipeFile')
data.columns=['Index','FileName','Reads']
data['FileName']=pd.Series(['Raw (R1)','Raw (R2)','3p Trim (R1)','3p Trim (R2)','Filter (R1)','Filter (R2)','No dupes (R1)','No dupes (R2)','RepeatMapped (R1)','RepeatMapped (R2)','Hg19Mapped (R1)','Hg19Mapped (R2)','Blacklist (R1)','Blacklist (R2)','RepeatMask (R1)','RepeatMask (R2)','ClipperIn','ClipperOut'])
data=data.set_index('FileName')
m1=np.mean([data.loc['RepeatMapped (R1)','Reads'],data.loc['RepeatMapped (R2)','Reads']])
m2=np.mean([data.loc['Hg19Mapped (R1)','Reads'],data.loc['Hg19Mapped (R2)','Reads']])
totalMapped=m1+m2
# Normalize data
histNorm=np.array(histPlot/float(totalMapped),dtype=float)
# Extract sequence and record counts
sampleName=path.split('/')[-2]
readsPerBase=np.array(list(hist))
colName=sampleName+'RT_stops'
storageDF[colName]=readsPerBase
colName=sampleName+'Total_mapped'
storageDF[colName]=totalMapped
readsPerBaseNorm=np.array(list(histNorm))
colNameNorm=sampleName+'RT_stops_norm'
storageDF[colNameNorm]=readsPerBaseNorm
else:
sampleName=path.split('/')[-2]
colNameNorm=sampleName+'RT_stops_norm'
storageDF[colNameNorm]=0
# List of names for normalized data to extract
colNamesForExtraction=colNamesForExtraction+[colNameNorm]
#if not flag:
# storageDF['BinCenter']=center
if not flag: # If no flag (= use sequence index), then record bin centers
storageDF['BinCenter']=center
else:
try:
sequence=repeat_genome_bases[start:end+1]
storageDF['Sequence']=pd.Series(list(sequence))
except:
print "Error setting sequence. Ensure bin size is = 1 so that data shape and sequence size are consistent."
# Dataframe, column names for normalized data, start index postion
return (storageDF,colNamesForExtraction,start)
In [18]:
# *** INPUT : SPECIFY repeat RNA ***
repeatName='U1'
# *** INPUT : SPECIFY DATASET AND ORGANISM (mm9 or hg19)***
target='DDX5e0'
org='hg19'
# *** INPUT : SPECIFY DATASETS TO COMPARE TO TARGET ***
toCompare=['ddx3mut','ddx3wt','ddx3wtA', 'ddx3mutA']
# *** INPUT : Choose bin size for repeat RNA histograms
binsize=5
# *** INPUT : Colors
colors=['blue','green','red','yellow']
# - Histogram for target dataset -
filePaths=[os.getcwd() + '/results/%s/'%sampleName for sampleName in [target]+toCompare]
repeat_genome_bases,repeatAnnotDF=makeRepeatAnnotation(org)
plt.subplot2grid((4,4),(0,0),colspan=2,rowspan=2)
if binsize != 1:
recordSequence=0 # Cannot record sequence in the output dataframe.
storageDF,colNamesForExtraction,startIndexPosition=getRepeatRNAData(repeatName,filePaths,binsize,recordSequence)
plt.bar(np.array(storageDF.index),storageDF[colNamesForExtraction[0]],width=1,align='center',color='blue',alpha=0.45)
plt.xlim(0,storageDF.index[-1])
if repeatName=='rDNA':
cutoffBase=13314 # Plot a limited windowsize for rDNA.
plt.xlim([0,cutoffBase])
plt.tick_params(axis='x',labelbottom='off')
plt.xlabel("Base position",fontsize=5)
plt.tick_params(axis='y',labelsize=5)
plt.ylabel("Counts per bin \n Bin size = %s"%binsize,fontsize=5)
plt.title('%s coverage for %s'%(repeatName,target),fontsize=5)
yMax=plt.ylim()[1]
plt.grid()
# - Scatterplot -
plt.subplot2grid((4,4),(0,2),colspan=2,rowspan=2)
for i in range(0,len(toCompare)):
if i == len(toCompare)-1:
alpha=1
else:
alpha=0.25
plt.scatter(np.array(storageDF[colNamesForExtraction[0]]),np.array(storageDF[colNamesForExtraction[i+1]]),color=colors[i],alpha=alpha,label=toCompare[i])
plt.xlabel("%s"%target,fontsize=5)
plt.ylabel("Counts per base",fontsize=5)
plt.title('Compare datasets',fontsize=5)
plt.tick_params(axis='xticks',labelsize=5)
plt.tick_params(axis='yticks',labelsize=5)
plt.xticks(rotation=45,fontsize=5)
plt.yticks(fontsize=5)
plt.grid()
plt.legend(loc='upper left', scatterpoints = 1,prop={'size':5})
maxVal=yMax
plt.xlim(-0.1*maxVal,maxVal)
plt.ylim(-0.1*maxVal,maxVal)
plt.tight_layout()
pathToSave=os.getcwd() + '/results/%s/rawdata/'%target + '%s_metaanalysis'%repeatName
storageDF.to_csv(pathToSave)
In [20]:
# - Heatmap -
plt.subplot2grid((4,4),(2,0),colspan=4,rowspan=1)
recordSequence=0
storageDF_rawData,colNamesForExtraction,startIndexPosition=getRepeatRNAData(repeatName,filePaths,binsize,recordSequence)
absoluteBases=np.array(storageDF_rawData['BinCenter'])-startIndexPosition
storageDF_rawData=storageDF_rawData.set_index(absoluteBases)
storageDF=storageDF_rawData[colNamesForExtraction]
# Settings
bottomLabels=1
# Display limited data for rDNA.
if repeatName=='rDNA':
cutoffBase=13314
indexer=np.array(absoluteBases)<cutoffBase
storageDF=storageDF.loc[indexer,:]
else: cutoffBase=0
# - Toplogy per position -
DF=storageDF.div(storageDF.sum(axis=1),axis=0)
DF.fillna(0,inplace=True) # Any bases positions with no reads get corrected
DF=DF.transpose()
clipStorage=np.array(DF)
sampleLabels=np.array(DF.index)
baseLabels=np.array(DF.columns)
p=plt.pcolormesh(clipStorage,cmap='Reds')
locs,labels=plt.xticks(range(baseLabels.size),baseLabels)
plt.setp(labels, rotation=45, fontsize=4)
plt.xlim([0,len(baseLabels)])
sampleLabels=np.array([samLabel.split('RT_stops_norm')[0] for samLabel in sampleLabels]) # Shorten the labels
locs,labels=plt.yticks(range(sampleLabels.size),sampleLabels)
plt.setp(labels, fontsize=4)
if not bottomLabels:
plt.tick_params(\
axis='x', # changes apply to the x-axis
which='off', # both major and minor ticks are affected
bottom='off', # ticks along the bottom edge are off
top='off', # ticks along the top edge are off
labelbottom='off') # labels along the bottom edge are off
plt.ylim(0,len(sampleLabels))
plt.title('%s tag count per base across datasets'%repeatName,fontsize=5)
# - Topology heatmap -
plt.subplot2grid((4,4),(3,0),colspan=4,rowspan=1)
if cutoffBase>0: # Remove all positions that are less than a cutoff
indexer=np.array(absoluteBases)<cutoffBase
storageDF=storageDF.loc[indexer,:]
DF=storageDF.div(storageDF.sum(axis=0),axis=1) # Normalize across each dataset
DF.fillna(0,inplace=True) # Any bases positions with no reads get corrected
DF=DF.transpose()
clipStorage=np.array(DF)
sampleLabels=np.array(DF.index)
baseLabels=np.array(DF.columns)
p=plt.pcolormesh(clipStorage,cmap='Reds')
locs,labels=plt.xticks(range(baseLabels.size),baseLabels)
plt.setp(labels, rotation=45, fontsize=4)
if not bottomLabels:
plt.tick_params(\
axis='x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom='off', # ticks along the bottom edge are off
top='off', # ticks along the top edge are off
labelbottom='off') # labels along the bottom edge are off
plt.xlabel('Base position',fontsize=5)
plt.xlim([0,len(baseLabels)])
sampleLabels=np.array([samLabel.split('RT_stops_norm')[0] for samLabel in sampleLabels]) # Shorten the labels
locs,labels=plt.yticks(range(sampleLabels.size),sampleLabels)
plt.setp(labels, fontsize=4)
plt.ylim(0,len(sampleLabels))
plt.title('%s binding topology per base within each dataset'%repeatName,fontsize=5)
plt.tight_layout()
In [ ]: