Comparing the salmon expression estimates against the real read counts output by polyester


In [64]:
import pandas as pd
import sqlite3
import math
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import spearmanr

In [43]:
# Get real read counts
filename = "/shared/sudlab1/General/projects/utrons_project/Simulations/SimulationUtronsQuant/realCounts.txt"
realCounts = pd.read_csv(filename, sep="\t", header=None, index_col=0)
lengths = realCounts[22]   # transcript lengths (column 22)
txnames = realCounts[21]   # transcript IDs (column 21)


# Get RPK (reads per kilobase) values for one sample:
# divide the raw counts by the transcript length in kilobases
def getRpks(col, num):
    col = pd.concat([col, lengths], axis=1)
    col["%d" % num] = col[num] / (col[22] / 1000.0)
    return col["%d" % num]


# Get RPK totals
def getTotals(col):
    total = sum(col)
    return total 


# Get sample TPMs: scale the RPK values by the per-million scaling factor
def getTpms(col, scalingfactor, num):
    return col / scalingfactor


# Run the above functions and return a dataframe
def runFunctions(num):
    column = realCounts[num]
    temp1 = getRpks(column, num)
    total = getTotals(temp1)
    scalingfactor = total / 1000000
    a = getTpms(temp1, scalingfactor, num)
    return a
    

# Cycle through all 20 samples and build a dataframe of real TPMs
realTpms = txnames
for num in range(1,21,1):
    sampleTpms = runFunctions(num)
    realTpms = pd.concat([realTpms, sampleTpms], axis=1)
    
realTpms[0:5]


Out[43]:
21 1 2 3 4 5 6 7 8 9 ... 11 12 13 14 15 16 17 18 19 20
0
0 ENST00000456328 5.067741 4.101791 5.067741 4.451921 4.899155 4.293158 5.067741 4.899155 4.899155 ... 4.731721 4.101791 5.067741 5.067741 4.101791 4.101791 4.899155 4.293158 4.451921 4.101791
1 ENST00000450305 4.690853 3.883469 4.690853 3.796983 5.163601 4.265955 4.690853 5.163601 5.163601 ... 4.850001 3.883469 4.690853 4.690853 3.883469 3.883469 5.163601 4.265955 3.796983 3.883469
2 ENST00000488147 3.687019 3.377186 3.687019 2.905569 4.915205 4.027307 3.687019 4.915205 4.915205 ... 4.394382 3.377186 3.687019 3.687019 3.377186 3.377186 4.915205 4.027307 2.905569 3.377186
3 ENST00000619216 3.319595 3.007785 3.319595 2.614045 4.536473 3.821522 3.319595 4.536473 4.536473 ... 4.033159 3.007785 3.319595 3.319595 3.007785 3.007785 4.536473 3.821522 2.614045 3.007785
4 ENST00000473358 4.586511 3.844869 4.586511 3.661621 5.199104 4.288477 4.586511 5.199104 5.199104 ... 4.848854 3.844869 4.586511 4.586511 3.844869 3.844869 5.199104 4.288477 3.661621 3.844869

5 rows × 21 columns
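
As a cross-check on the helper functions above, the same counts -> RPK -> TPM conversion can be written in a few vectorised pandas lines. This is a sketch, not part of the original run; it assumes the realCounts layout used above (columns 1-20 hold the counts, column 21 the transcript names, column 22 the lengths).

# Vectorised cross-check of the counts -> RPK -> TPM conversion
counts = realCounts.loc[:, 1:20]                 # raw read counts per sample
rpk = counts.div(lengths / 1000.0, axis=0)       # reads per kilobase
tpmCheck = rpk.div(rpk.sum() / 1e6, axis=1)      # scale each sample to sum to 1e6
tpmCheck.insert(0, "Name", txnames)
tpmCheck.head()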


In [40]:
# Get salmon Estimated Read Counts

# Connect to salmon db and annotations
cnx = sqlite3.connect('/shared/sudlab1/General/projects/utrons_project/Simulations/SimulationUtronsQuant/SimulationUtrons.salmon_extra20samples.db')
cnx.execute("ATTACH '/shared/sudlab1/General/annotations/hg38_noalt_ensembl85/csvdb' as annotations")


Out[40]:
<sqlite3.Cursor at 0x7fde9c85e7a0>
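
A quick optional check (not in the original notebook) that the attach worked and which tables are available, using standard SQLite introspection:

# List the attached databases and the tables in the main salmon database
print(pd.read_sql_query("PRAGMA database_list", cnx))
print(pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table'", cnx))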

In [41]:
# Get salmon TPMs with an SQL query per sample and concatenate the dataframes

def getSalmonTpms(sample, run, index):
    # run == 1 (first sample only) also returns the transcript Name column so
    # the names are kept once; later calls return just the TPM column, renamed
    # to the sample index so it lines up with the realTpms columns.
    query_text1 = '''
        SELECT name, tpm AS tpm
        FROM salmon_quant AS q
        WHERE Name like "ENS%" and q.track="'''+sample+'''"
        ORDER BY name
        '''
    
    sampletpms = pd.read_sql_query(query_text1, cnx)
    sampletpms["%d"%index] = sampletpms["tpm"]
    if run == 1:
        return sampletpms[["Name", "%d"%index]]
    else:
        return sampletpms["%d"%index]
    
    
# First sample (GC-WT-41) keeps the Name column; the remaining 19 samples
# (GC-WT-42 to GC-WT-60) are added as extra TPM columns.
salmonTpms = getSalmonTpms("GC-WT-41", 1, 1)
for num in range(42,61,1):
    sample = "GC-WT-%d" % num
    indexnum = num - 40
    tpms = getSalmonTpms(sample, 0, indexnum)
    salmonTpms = pd.concat([salmonTpms, tpms], axis=1)
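
Building the SQL by string concatenation works for these track names, but sqlite3/pandas also accept bound parameters, which avoids any quoting problems. A minimal sketch of the same query with a placeholder, assuming the same salmon_quant columns used above:

# Same query with the track name bound as a parameter rather than concatenated
def getSalmonTpmsParam(sample):
    query = '''
        SELECT name, tpm AS tpm
        FROM salmon_quant AS q
        WHERE Name like "ENS%" AND q.track = ?
        ORDER BY name
        '''
    return pd.read_sql_query(query, cnx, params=(sample,))

# e.g. getSalmonTpmsParam("GC-WT-41").head()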

In [68]:
%pylab inline

def plotVals(dataframe):
    # Column 1 = real TPMs, column 3 = salmon TPMs in the merged dataframe
    realvals = dataframe.iloc[:, 1].tolist()
    realvals = [np.log(a) for a in realvals]

    salmonvals = dataframe.iloc[:, 3].tolist()
    salmonvals = [np.log(a) for a in salmonvals]

    pylab.plot(realvals, salmonvals, ',', color="blue", alpha=0.8)

    # Spearman correlation between the real and salmon log TPMs
    correlation = spearmanr(salmonvals, realvals)
    return correlation[0]
 
# Plot all 20 samples on the same axes and collect the Spearman correlations
correlations = []
for sample in range(1, 21, 1):
    salmon = salmonTpms[["Name", str(sample)]]
    real = realTpms[[21, str(sample)]]
    merged = pd.merge(real, salmon, left_on=21, right_on="Name")
    x = plotVals(merged)
    correlations.append(x)

# Plot dashed reference lines (the axes and the y = x diagonal)
pylab.plot([-25, 15], [0, 0], ":", color="black")
pylab.plot([0, 0], [-25, 15], ":", color="black")
pylab.plot([-25, 15], [-25, 15], ":", color="orange")
    
    
# Figure options    
pylab.xlim(-10,5); pylab.ylim(-10,5)
pylab.xlabel("log(real TPMs)"); pylab.ylabel("log(salmon TPMs)")
pylab.savefig("./images/6_SalmonVsReal", dpi=500)


Populating the interactive namespace from numpy and matplotlib
/home/mba13ns/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:8: RuntimeWarning: divide by zero encountered in log
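
The divide-by-zero warning comes from taking np.log of transcripts with a TPM of zero; those points become -inf and fall outside the plotted range. One way to make this explicit (a sketch, not what was run above) is to drop zero-TPM transcripts before taking logs:

# Variant of plotVals that masks zero TPMs before taking logs
def plotValsNonZero(dataframe):
    real = dataframe.iloc[:, 1]
    salmon = dataframe.iloc[:, 3]
    keep = (real > 0) & (salmon > 0)
    pylab.plot(np.log(real[keep]), np.log(salmon[keep]), ',', color="blue", alpha=0.8)
    return spearmanr(np.log(salmon[keep]), np.log(real[keep]))[0]

Note that this also changes which transcripts enter the Spearman correlation, so the numbers will not match the run above exactly.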

In [90]:
# Mean Spearman correlation across the 20 samples
print sum(correlations) / len(correlations)


0.387244203809
A big source of the poor correlations seems to be the GC bias models?

Sample groups:
    1, 3, 5, 7, 13, 14
    2, 15, 20
    4, 19
    6, 18
    10
    11
    12, 16

Group correlations:
    41, 41, 42, 42, 41
    49, 47, 50
    52, 45
    21, 18, 20, 18
    40, 40
    43
    29
    41, 47

Correlations within a group tend to cluster together.
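
The per-sample correlations behind these groupings can be listed directly from the correlations list computed above (sample numbering follows the plotting loop):

# Print each sample's Spearman correlation so the groups are easy to spot
for sampleNum, corr in zip(range(1, 21), correlations):
    print("sample %2d: %.2f" % (sampleNum, corr))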