In [1]:
import pandas as pd
import sqlite3
cnx = sqlite3.connect('/shared/sudlab1/General/projects/utrons_project/Simulations/RealUtronsQuant/431bladderSimQuants.stringtie.db')# Connect to quant db
cnx.execute("ATTACH '/shared/sudlab1/General/projects/utrons_project/BladderCancerUtrons/431BladderUtrons.db' as utrons") # Attach real data db
cnx.execute("ATTACH '/shared/sudlab1/General/annotations/hg38_noalt_ensembl85/csvdb' as annotations") # Attach Annotations
Out[1]:
In [2]:
# Get FPKM totals for each sample
query_text1 = '''
SELECT sum(FPKM), q.track
FROM agg_agg_agg_stringtie_transcript_data AS q
GROUP BY q.track
ORDER BY q.track
'''
totals = pd.read_sql_query(query_text1, cnx)
totals = totals.groupby("track").sum()
In [4]:
def getTpms(sample, run):
query_text2 = """
SELECT t_name AS txId,ti.gene_name AS gene, FPKM
FROM agg_agg_agg_stringtie_transcript_data AS q
INNER JOIN utrons.novel_utrons_ids AS uid
ON uid.transcript_id=q.t_name
INNER JOIN annotations.transcript_info AS ti
ON ti.transcript_id = uid.match_transcript_id
WHERE uid.track='agg-agg-agg' AND q.track='"""+sample+"""' AND t_name like "MSTRG%"
GROUP BY uid.transcript_id
ORDER BY uid.transcript_id
"""
sampleTotal = totals.ix[sample][0]
sampleQuants = pd.read_sql(query_text2, cnx)
sampleQuants["TPM"] = (sampleQuants["FPKM"] / sampleTotal) * 1000000
if run == 1:
return sampleQuants[["txId","gene", "TPM"]]
if run ==2:
return sampleQuants["TPM"]
utronTpms = getTpms("GC-WT-1", 1)
for num in range(2,61,1):
print num
sample = "GC-WT-%d" %num
sampleTpms = getTpms(sample, 2)
utronTpms = pd.concat([utronTpms, sampleTpms], axis=1)
In [7]:
"""
Loop through a range of tpms and get the percent of utrons which quantify below the tpm threshold
"""
numGenes = len(utronTpms)
def getZeros(tpm):
allSamples = 0
notallSamples = 0
total = 0
for gene in range(0, numGenes,1):
geneTpms = utronTpms.ix[gene, 1:].tolist()
if len([a for a in geneTpms if a <= tpm]) == 60:
allSamples += 1
else:
notallSamples += 1
total += 1
percentage = float(allSamples) / total
return percentage
percentages = []
for num in [float(a)/10 for a in range(0,50, 2)]:
print num,
percentage = getZeros(num)
percentages.append(percentage)
In [18]:
"""
Plot values from above
"""
%pylab inline
tpms = [float(a)/10 for a in range(0,50,2)]
print percentages
pylab.plot(tpms, percentages, marker=".")
pylab.ylabel("percentage"); pylab.xlabel("TPM")
pylab.savefig("./images/8_RealUtronQuantInSims", dpi=300)
In [16]:
"""
Print genes with samples which quantify as > 5 in any of the 60 samples
"""
filename = open("/shared/sudlab1/General/projects/utrons_project/misc_files/Utrons_5tpmInSims.txt", 'w')
for gene in range(0, numGenes,1):
txName = utronTpms.ix[gene, 1]
geneTpms = utronTpms.ix[gene, 2:].tolist()
if len([a for a in geneTpms if a <= 5.0]) < 60:
print txName,
txName = txName + "\n"
filename.write(txName)
filename.close()
In [17]:
"""
Print genes with samples which quantify as > 2.5 in any of the 60 samples
"""
filename = open("/shared/sudlab1/General/projects/utrons_project/misc_files/Utrons_2.5tpmInSims.txt", 'w')
for gene in range(0, numGenes,1):
txName = utronTpms.ix[gene, 1]
geneTpms = utronTpms.ix[gene, 2:].tolist()
if len([a for a in geneTpms if a <= 2.5]) < 60:
print txName,
txName = txName + "\n"
filename.write(txName)
filename.close()