In [64]:
import pandas as pd
import sqlite3
import math
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import spearmanr
In [43]:
# Get real read counts
filename = "/shared/sudlab1/General/projects/utrons_project/Simulations/SimulationUtronsQuant/realCounts.txt"
realCounts = pd.read_csv(filename, sep="\t", header=None, index_col=0)
lengths = realCounts.ix[:,22]
txnames = realCounts.ix[:,21]
# Get RPK values
def getRpks(col, num):
col = pd.concat([col, lengths], axis=1)
col["%d"%num] = col[num] / (col[22]/1000)
return col["%d"%num]
# Get RPK totals
def getTotals(col):
total = sum(col)
return total
# Get sample tpms
def getTpms(col, scalingfactor, num):
col["tpm"] = (col / scalingfactor)
return col["tpm"]
# Run the above functions and return a dataframe
def runFunctions(num):
column = realCounts[num]
temp1 = getRpks(column, num)
total = (getTotals(temp1))
scalingfactor = total / 1000000
a = getTpms(temp1, scalingfactor, num)
return a
# cycle through functions with all 20 samples and get DF of real tpms
realTpms = txnames
for num in range(1,21,1):
sampleTpms = runFunctions(num)
realTpms = pd.concat([realTpms, sampleTpms], axis=1)
realTpms[0:5]
Out[43]:
In [40]:
# Get salmon Estimated Read Counts
# Connect to salmon db and annotations
cnx = sqlite3.connect('/shared/sudlab1/General/projects/utrons_project/Simulations/SimulationUtronsQuant/SimulationUtrons.salmon_extra20samples.db')
cnx.execute("ATTACH '/shared/sudlab1/General/annotations/hg38_noalt_ensembl85/csvdb' as annotations")
Out[40]:
In [41]:
# get salmon tpms using sql queries and concatenating the dataframes
def getSalmonTpms(sample, run, index):
query_text1 = '''
SELECT name, tpm AS tpm
FROM salmon_quant AS q
WHERE Name like "ENS%" and q.track="'''+sample+'''"
ORDER BY name
'''
sampletpms = pd.read_sql_query(query_text1, cnx)
sampletpms["%d"%index] = sampletpms["tpm"]
if run == 1:
return sampletpms[["Name", "%d"%index]]
else:
return sampletpms["%d"%index]
salmonTpms = getSalmonTpms("GC-WT-41", 1, 1)
for num in range(42,61,1):
sample = "GC-WT-%d" % num
indexnum = num - 40
tpms = getSalmonTpms(sample, 0, indexnum)
salmonTpms = pd.concat([salmonTpms, tpms], axis=1)
In [68]:
%pylab inline
def plotVals(dataframe):
realvals = dataframe.ix[:,1].tolist()
realvals = [np.log(a) for a in realvals]
salmonvals = dataframe.ix[:,3].tolist()
salmonvals = [np.log(a) for a in salmonvals]
pylab.plot(realvals,salmonvals, ',', color="blue", alpha=0.8)
correlation = spearmanr(salmonvals, realvals)
return correlation[0]
correlations =[]
for sample in range(1,21,1):
salmon = salmonTpms[["Name", str(sample)]]
real = realTpms[[21, str(sample)]]
merged = pd.merge(real, salmon, left_on=21, right_on="Name")
x = plotVals(merged)
correlations.append(x)
# Plot dashed lines
pylab.plot([-25,15], [0,0], ":k", color="black")
pylab.plot([0,0], [-25,15], ":k", color="black")
pylab.plot([-25,15], [-25,15], ":k", color="orange")
# Figure options
pylab.xlim(-10,5); pylab.ylim(-10,5)
pylab.xlabel("log(real TPMs)"); pylab.ylabel("log(salmon TPMs)")
pylab.savefig("./images/6_SalmonVsReal", dpi=500)
In [90]:
print sum(correlations) / len(correlations)