In [2]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import re
import cmath
import math
import glob
import subprocess
%matplotlib inline
import matplotlib
import scipy
from scipy import stats
matplotlib.rcParams['savefig.dpi'] = 2 * matplotlib.rcParams['savefig.dpi']

Gene metaanalysis -


In [7]:
def getGeneList(name,geneType):
    geneListpath=os.getcwd() + '/results/'+name+'/rawdata/PlotData_ReadsPerGene_%s'%geneType
    
    path=os.getcwd() + '/results/'+name+'/rawdata/PlotData_ReadsPerPipeFile'
    data=pd.read_csv(path)
    data.columns=['Index','FileName','Reads']
    data['FileName']=pd.Series(['Raw (R1)','Raw (R2)','3p Trim (R1)','3p Trim (R2)','Filter (R1)','Filter (R2)','No dupes (R1)','No dupes (R2)','RepeatMapped (R1)','RepeatMaped (R2)','Hg19Mapped (R1)','Hg19Mapped (R2)','Blacklist (R1)','Blacklist (R2)','RepeatMask (R1)','RepeatMask (R2)','ClipperIn','ClipperOut'])
    data=data.set_index('FileName')
    valueForNorm=data.loc['ClipperOut','Reads']
    
    geneListDF=pd.DataFrame(pd.read_table(geneListpath,header=None,delimiter=','))
    geneListDF.columns=['GeneName','Count']
    geneListDF.index=geneListDF['GeneName']
    geneListDF['Normalized']=geneListDF['Count']/valueForNorm
    return geneListDF
 
def mergeGeneLists(df1,df2):
    allGenes=list(df1['GeneName'])+list(df2['GeneName'])
    alUniqGenes=list(set(allGenes))
    mergedDF=pd.DataFrame(index=alUniqGenes)
    mergedDF['Counts_1']=df1['Count']
    mergedDF['Counts_1_Norm']=df1['Normalized']
    mergedDF['Counts_2']=df2['Count']
    mergedDF['Counts_2_Norm']=df2['Normalized']
    mergedDF.fillna(0,inplace=True)
    mergedDF['Sum']=mergedDF['Counts_1']+mergedDF['Counts_2']
    mergedDF.sort(['Sum'],ascending=True,inplace=True)
    return mergedDF

def makeScatter(target,toCompare,geneType,colors,axisLimit):
    target_geneList=getGeneList(target,geneType)
    for i in range(0,len(toCompare)):   
        if i == len(toCompare)-1:
            alpha=1
        else:
            alpha=0.25
        background=toCompare[i]
        background_geneList=getGeneList(background,geneType)
        DF1=mergeGeneLists(target_geneList,background_geneList)
        plt.scatter(np.array(DF1['Counts_1_Norm']),np.array(DF1['Counts_2_Norm']),label=toCompare[i],color=colors[i],alpha=0.25)
        slope,intercept,r_value,p_value,std_err=scipy.stats.linregress(np.array(DF1['Counts_1_Norm']),np.array(DF1['Counts_2_Norm']))
        print toCompare[i]
        print "r-squared:", r_value**2
    plt.xlim(-0.1*plt.xlim()[1],plt.xlim()[1])
    plt.ylim(-0.1*plt.xlim()[1],plt.xlim()[1])
    plt.title('Tag count per %s gene across datasets.'%geneType,fontsize=5)
    plt.xlabel("Normalized tag count per gene for %s"%target,fontsize=5)
    plt.ylabel("Normalized tag count per gene for other datasets.",fontsize=5)
    plt.tick_params(axis='xticks',labelsize=5,fontsize=5) 
    plt.tick_params(axis='yticks',labelsize=5,fontsize=5)
    plt.yticks(fontsize=5)
    plt.xticks(rotation=90,fontsize=5)
    plt.legend(loc='upper left', scatterpoints = 1,prop={'size':5})
    plt.grid()
    if axisLimit: # Plot limit
        plt.xlim(-0.1*axisLimit,axisLimit)
        plt.ylim(-0.1*axisLimit,axisLimit)    
    
    # Label DF for output.
    DF1.columns=['Counts_%s'%target,'Norm_Counts_%s'%target,'Counts_%s'%background,'Norm_Counts_%s'%background,'Sum']
    return DF1

In [10]:
fig1=plt.figure(1)

plt.subplot(2,2,1)
# *** INPUT : Dataset (x-axis) against which others will be compared.
target='DDX5e0' 
# *** INPUT : Datasets (y-axis) for comparison.
toCompare=['ddx3wtA','ddx3wt','MMhur', 'ddx3mutA'] 
# *** INPUT : Colors. Must match length of toCompare.
colors=['blue','red','yellow','green'] 
# *** INPUT : Specify the gene type to compare.
geneType='proteinCoding'
# Specify the axis limit (in case one wants to zoom in).
axisLimit=None 
DF1=makeScatter(target,toCompare,geneType,colors,axisLimit) 
plt.xlim([-0.001,0.02])
plt.ylim([-0.001,0.02])
plt.tight_layout()

outfilepath=os.getcwd() + '/results/%s/'%target
#fig1.savefig(outfilepath+'S5_proteinMetaanalysis.png',format='png',bbox_inches='tight',dpi=200,pad_inches=0.5)
#fig1.savefig(outfilepath+'S5_proteinMetaanalysis.pdf',format='pdf',bbox_inches='tight',dpi=200,pad_inches=0.5)


ddx3wtA
r-squared: 0.00385349226517
ddx3wt
r-squared: 0.00243741529771
MMhur
r-squared: 0.000951453063541
ddx3mutA
r-squared: 0.00186622808485

In [255]:
from scipy.interpolate import interp1d

# Examine differentially expressed genes
DF1['Delta']=np.abs(DF1['Norm_Counts_hcvwt']-DF1['Norm_Counts_jfh1v2'])
DF1.sort('Delta',inplace=True,ascending=False)
data=np.array(DF1['Delta'])
DF1['Percentile']=[stats.percentileofscore(data, i, kind='strict') for i in data]

In [262]:
target='jfh1v2'
pathToSave=os.getcwd()+'/results/%s/rawdata/'%target+'proteinCoding_diff' 
DF1.to_csv(pathToSave)

In [263]:
tmp=DF1[DF1['Counts_jfh1v2']==0]
pathToSave=os.getcwd()+'/results/%s/rawdata/'%target+'proteinCoding_diff_onlyWT' 
tmp.to_csv(pathToSave)

In [264]:
tmp=DF1[DF1['Counts_hcvwt']==0]
pathToSave=os.getcwd()+'/results/%s/rawdata/'%target+'proteinCoding_diff_onlyJFH1' 
tmp.to_csv(pathToSave)

In [114]:
# *** OPTION TO SAVE DATA : Sort the data by total hits. 
# Note that this will apply to the last dataset in toCompare if it contain multiple.
DF1.sort('Sum',inplace=True,ascending=False)
pathToSave=os.getcwd() + '/results/%s/rawdata/'%target + '%s_metaanalysis'%geneType 
DF1.to_csv(pathToSave)

Repeat RNA metaanalysis -


In [15]:
def makeRepeatAnnotation(org):
    # Repeat index sequence 
    if org=='hg19': repeatGenomeBuild=os.getcwd() + '/docs/hg19/repeat/repeatRNA.fa'
    elif org=='mm9': repeatGenomeBuild=os.getcwd() + '/docs/mm9/repeat/Mm_repeatRNA.fa'
    repeat_genome=np.genfromtxt(repeatGenomeBuild,dtype='string')
    repeat_genome_bases=repeat_genome[1]
    repeat_genome_size=len(repeat_genome[1])
    # Repeat index positions
    if org=='hg19':repeatAnnotation=os.getcwd() + '/docs/hg19/repeat/Hs_repeatIndex_positions.txt'
    else:repeatAnnotation=os.getcwd() + '/docs/mm9/repeat/Mm_repeatIndex_positions.txt'
    repeatAnnotDF=pd.DataFrame(pd.read_table(repeatAnnotation,header=None))
    repeatAnnotDF.columns=['Name','Length','IndexStart','IndexEnd']
    repeatAnnotDF['End_for_extraction']=repeatAnnotDF['IndexEnd']+1 
    return (repeat_genome_bases,repeatAnnotDF)

def getRepeatRNAData(repeatName,filePaths,binSize,flag):
        
    # Create data frame for storage 
    storageDF=pd.DataFrame()
    
    # Fill it with histogram data from samples
    colNamesForExtraction=[]
    for i in range(0,len(filePaths)):
        
        # Get RT file
        path=filePaths[i]
        
        # Get repeat data
        path_repReads=path+'/rawdata/PlotData_RepeatRNAreads_%s'%repeatName
        hits_per_rep=pd.read_csv(path_repReads)
        
        if hits_per_rep.shape[0] > 0:
            RTpositions=hits_per_rep['RT_stop']
            start=hits_per_rep.loc[0,'Repeat_Start']
            end=hits_per_rep.loc[0,'Repeat_End']
                 
            # Histogram of RT stops across gene body
            bins=range(start,end+2,1)
            hist,bins=np.histogram(RTpositions,bins=bins)
            width=0.7*(bins[1]-bins[0])
            center=(bins[:-1] + bins[1:])/2
            histPlot=np.array(hist,dtype=float)
                
            # Read accounting
            data=pd.read_csv(path+'/rawdata/PlotData_ReadsPerPipeFile')
            data.columns=['Index','FileName','Reads']
            data['FileName']=pd.Series(['Raw (R1)','Raw (R2)','3p Trim (R1)','3p Trim (R2)','Filter (R1)','Filter (R2)','No dupes (R1)','No dupes (R2)','RepeatMapped (R1)','RepeatMapped (R2)','Hg19Mapped (R1)','Hg19Mapped (R2)','Blacklist (R1)','Blacklist (R2)','RepeatMask (R1)','RepeatMask (R2)','ClipperIn','ClipperOut'])
            data=data.set_index('FileName')
            m1=np.mean([data.loc['RepeatMapped (R1)','Reads'],data.loc['RepeatMapped (R2)','Reads']])
            m2=np.mean([data.loc['Hg19Mapped (R1)','Reads'],data.loc['Hg19Mapped (R2)','Reads']])
            totalMapped=m1+m2
    
            # Normalize data
            histNorm=np.array(histPlot/float(totalMapped),dtype=float)
            
            # Extract sequence and record counts
            sampleName=path.split('/')[-2]
            readsPerBase=np.array(list(hist))
            colName=sampleName+'RT_stops'
            storageDF[colName]=readsPerBase
            colName=sampleName+'Total_mapped'
            storageDF[colName]=totalMapped
            readsPerBaseNorm=np.array(list(histNorm))
            colNameNorm=sampleName+'RT_stops_norm'
            storageDF[colNameNorm]=readsPerBaseNorm
        
        else:
            sampleName=path.split('/')[-2]
            colNameNorm=sampleName+'RT_stops_norm'
            storageDF[colNameNorm]=0
            
        # List of names for normalized data to extract
        colNamesForExtraction=colNamesForExtraction+[colNameNorm]
        
        #if not flag:
        #    storageDF['BinCenter']=center
    
    if not flag: # If no flag (= use sequence index), then record bin centers
        storageDF['BinCenter']=center
    else:
        try:
            sequence=repeat_genome_bases[start:end+1]
            storageDF['Sequence']=pd.Series(list(sequence))
        except:
            print "Error setting sequence. Ensure bin size is = 1 so that data shape and sequence size are consistent."
    
    # Dataframe, column names for normalized data, start index postion
    return (storageDF,colNamesForExtraction,start)

In [18]:
# *** INPUT : SPECIFY repeat RNA ***
repeatName='U1'
# *** INPUT : SPECIFY DATASET AND ORGANISM (mm9 or hg19)***
target='DDX5e0' 
org='hg19'
# *** INPUT : SPECIFY DATASETS TO COMPARE TO TARGET ***
toCompare=['ddx3mut','ddx3wt','ddx3wtA', 'ddx3mutA']  
# *** INPUT : Choose bin size for repeat RNA histograms
binsize=5
# *** INPUT : Colors
colors=['blue','green','red','yellow']

# - Histogram for target dataset -
filePaths=[os.getcwd() + '/results/%s/'%sampleName for sampleName in [target]+toCompare]
repeat_genome_bases,repeatAnnotDF=makeRepeatAnnotation(org)
plt.subplot2grid((4,4),(0,0),colspan=2,rowspan=2)
if binsize != 1:
    recordSequence=0 # Cannot record sequence in the output dataframe.
storageDF,colNamesForExtraction,startIndexPosition=getRepeatRNAData(repeatName,filePaths,binsize,recordSequence)
plt.bar(np.array(storageDF.index),storageDF[colNamesForExtraction[0]],width=1,align='center',color='blue',alpha=0.45)
plt.xlim(0,storageDF.index[-1])
if repeatName=='rDNA':
    cutoffBase=13314 # Plot a limited windowsize for rDNA.
    plt.xlim([0,cutoffBase])
plt.tick_params(axis='x',labelbottom='off') 
plt.xlabel("Base position",fontsize=5)
plt.tick_params(axis='y',labelsize=5)  
plt.ylabel("Counts per bin \n Bin size = %s"%binsize,fontsize=5)
plt.title('%s coverage for %s'%(repeatName,target),fontsize=5)
yMax=plt.ylim()[1]
plt.grid()

# - Scatterplot -
plt.subplot2grid((4,4),(0,2),colspan=2,rowspan=2)
for i in range(0,len(toCompare)):
    if i == len(toCompare)-1:
        alpha=1
    else:
        alpha=0.25
    plt.scatter(np.array(storageDF[colNamesForExtraction[0]]),np.array(storageDF[colNamesForExtraction[i+1]]),color=colors[i],alpha=alpha,label=toCompare[i])
plt.xlabel("%s"%target,fontsize=5)
plt.ylabel("Counts per base",fontsize=5)
plt.title('Compare datasets',fontsize=5)
plt.tick_params(axis='xticks',labelsize=5) 
plt.tick_params(axis='yticks',labelsize=5)
plt.xticks(rotation=45,fontsize=5)
plt.yticks(fontsize=5)
plt.grid()
plt.legend(loc='upper left', scatterpoints = 1,prop={'size':5})
maxVal=yMax
plt.xlim(-0.1*maxVal,maxVal)
plt.ylim(-0.1*maxVal,maxVal)

plt.tight_layout()

pathToSave=os.getcwd() + '/results/%s/rawdata/'%target + '%s_metaanalysis'%repeatName
storageDF.to_csv(pathToSave)



In [20]:
# - Heatmap - 
plt.subplot2grid((4,4),(2,0),colspan=4,rowspan=1)
recordSequence=0
storageDF_rawData,colNamesForExtraction,startIndexPosition=getRepeatRNAData(repeatName,filePaths,binsize,recordSequence)
absoluteBases=np.array(storageDF_rawData['BinCenter'])-startIndexPosition
storageDF_rawData=storageDF_rawData.set_index(absoluteBases)
storageDF=storageDF_rawData[colNamesForExtraction]

# Settings
bottomLabels=1

# Display limited data for rDNA.
if repeatName=='rDNA':
    cutoffBase=13314
    indexer=np.array(absoluteBases)<cutoffBase
    storageDF=storageDF.loc[indexer,:]
else: cutoffBase=0
   
# - Toplogy per position -
DF=storageDF.div(storageDF.sum(axis=1),axis=0)
DF.fillna(0,inplace=True) # Any bases positions with no reads get corrected
DF=DF.transpose()
clipStorage=np.array(DF)
sampleLabels=np.array(DF.index)
baseLabels=np.array(DF.columns)
p=plt.pcolormesh(clipStorage,cmap='Reds')
locs,labels=plt.xticks(range(baseLabels.size),baseLabels)
plt.setp(labels, rotation=45, fontsize=4)
plt.xlim([0,len(baseLabels)])
sampleLabels=np.array([samLabel.split('RT_stops_norm')[0] for samLabel in sampleLabels]) # Shorten the labels
locs,labels=plt.yticks(range(sampleLabels.size),sampleLabels)
plt.setp(labels, fontsize=4)
if not bottomLabels:
    plt.tick_params(\
    axis='x',          # changes apply to the x-axis
    which='off',       # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off') # labels along the bottom edge are off 
plt.ylim(0,len(sampleLabels))
plt.title('%s tag count per base across datasets'%repeatName,fontsize=5)  

# - Topology heatmap - 
plt.subplot2grid((4,4),(3,0),colspan=4,rowspan=1)
if cutoffBase>0: # Remove all positions that are less than a cutoff
    indexer=np.array(absoluteBases)<cutoffBase
    storageDF=storageDF.loc[indexer,:]
DF=storageDF.div(storageDF.sum(axis=0),axis=1) # Normalize across each dataset
DF.fillna(0,inplace=True) # Any bases positions with no reads get corrected
DF=DF.transpose()
clipStorage=np.array(DF)
sampleLabels=np.array(DF.index)
baseLabels=np.array(DF.columns)
p=plt.pcolormesh(clipStorage,cmap='Reds')
locs,labels=plt.xticks(range(baseLabels.size),baseLabels)
plt.setp(labels, rotation=45, fontsize=4)
if not bottomLabels:
    plt.tick_params(\
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off') # labels along the bottom edge are off 
plt.xlabel('Base position',fontsize=5)
plt.xlim([0,len(baseLabels)])
sampleLabels=np.array([samLabel.split('RT_stops_norm')[0] for samLabel in sampleLabels]) # Shorten the labels
locs,labels=plt.yticks(range(sampleLabels.size),sampleLabels)
plt.setp(labels, fontsize=4)
plt.ylim(0,len(sampleLabels))
plt.title('%s binding topology per base within each dataset'%repeatName,fontsize=5)  

plt.tight_layout()



In [ ]: