In [2]:
import sqlite3
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr
from scipy.stats.stats import spearmanr
import math
import random
# ATTACH UTRONS DATABASE
cnx = sqlite3.connect('/shared/sudlab1/General/projects/utrons_project/BladderCancerUtrons/431BladderUtrons.db')
cnx.execute("ATTACH '/shared/sudlab1/General/annotations/hg38_noalt_ensembl85/csvdb' as annotations")
sampleCount = 434
In [3]:
"""
GETTING PROPORTIONS
"""
"""
Function:
1 - get TPMs of the Tx Ids passed to the function
2 - get proportions of the tpms found in (1)
"""
#################################################
# 1 - Get UTRon Tpms
#################################################
# Get tpms for UTRons
def getTpms(sample, runNum, index, Dataframe):
# Load file into dataframe
sampleFile = '/shared/sudlab1/General/projects/utrons_project/BladderCancerUtrons/Quantifications/%s_agg-agg-agg.sf' % sample
sampleDf = pd.read_csv(sampleFile, sep='\t')
# Copy only Tx ids and TPM values from dataframe
sampleDf = sampleDf[[0,3]].copy()
# Merge with Utrons DF (get only utron tpms)
sampleDf = pd.merge(Dataframe, sampleDf, right_on="Name", left_on="Name")
# Make columns named with the sample number
indexname = "TPM_%d" % index
sampleDf[indexname] = sampleDf["TPM"]
# If passed with runNum == 1, return tpms AND tx IDs, else return just the tpms
if runNum ==1:
return sampleDf[["Name","match_id", indexname]]
if runNum ==2:
return sampleDf[[indexname]]
#################################################
# 2- Get TPM totals for each gene in each sample
#################################################
# Open each sample individually and get transcript tpms then groupby match_tx_id to get transcript_totals
def getGeneTotals(sampleNum, runNum, Dataframe):
# Make vars with sample names and indexes
sample="ZF-WT-%d" % sampleNum
tpmIndex = "TPM_%d" % sampleNum
propIndex = "PROP_%d" % sampleNum
# Load file into dataframe
sampleFile = '/shared/sudlab1/General/projects/utrons_project/BladderCancerUtrons/Quantifications/%s_agg-agg-agg.sf' % sample
sampleDf = pd.read_csv(sampleFile, sep='\t')
# Copy only Tx ids and TPM values from dataframe, Merge with transcripts DF and group by ID to get gene sums
sampleDf = sampleDf[["Name","TPM"]]
sampleDf = pd.merge(transcriptIds, sampleDf, right_on="Name", left_on="Name")
sampleDf = sampleDf.groupby("match_id").sum()
# Make a copy of the utrons dataframe tpms for the sample and merge utrons and sample dataframes
utronTpms = Dataframe[["Name","match_id",tpmIndex]].copy()
utronTpms = pd.merge(utronTpms, sampleDf, right_index=True, left_on="match_id")
# get proportion for each transcript
utronTpms[propIndex] = utronTpms[tpmIndex] / utronTpms["TPM"]
# Return dataframe
if runNum == 1:
return utronTpms[["Name", "match_id", propIndex]]
if runNum ==2:
return utronTpms[[propIndex]]
In [4]:
"""
SQL QUERIES TO GET TRANSCRIPT IDS TO BE USED LATER
"""
#########################################################
# Dataframe of all transcript IDs
query_text1 = '''
SELECT tc.match_gene_id AS match_id, tc.transcript_id AS Name
FROM transcript_class AS tc
WHERE tc.track="agg-agg-agg"
ORDER BY match_transcript_id
'''
transcriptIds = pd.read_sql_query(query_text1, cnx)
#########################################################
# Dataframe of novel UTRons IDs and matched transcript IDs
query_text1 = '''
SELECT uid.transcript_id AS Name, tc.match_gene_id AS match_id
FROM novel_utrons_ids AS uid
INNER JOIN transcript_class AS tc
ON tc.transcript_id = uid.transcript_id
INNER JOIN annotations.transcript_info AS ti
ON ti.transcript_id = tc.match_transcript_id
WHERE uid.track='agg-agg-agg'
GROUP BY uid.transcript_id
'''
novelUtronsDf = pd.read_sql_query(query_text1, cnx)
#########################################################
# Dataframe of all UTRon IDs and matched transcript IDs
query_text1 = '''
SELECT uid.transcript_id AS Name, tc.match_gene_id AS match_id
FROM all_utrons_ids AS uid
INNER JOIN transcript_class AS tc
ON tc.transcript_id = uid.transcript_id
INNER JOIN annotations.transcript_info AS ti
ON ti.transcript_id = tc.match_transcript_id
WHERE uid.track='agg-agg-agg'
GROUP BY uid.transcript_id
'''
allUtronsDf = pd.read_sql_query(query_text1, cnx)
In [5]:
"""
GETTING NOVEL UTRON TPMS AND PROPORTIONS USING THE ABOVE FUNCTIONS
"""
# Loop through samples and get tpms
novelUtronTpms = getTpms("ZF-WT-1", 1, 1, novelUtronsDf)
for num in range(2,sampleCount,1):
if num != 252 and num != 347:
sample = "ZF-WT-%d" % num
sampleTpms = getTpms(sample, 2, num, novelUtronsDf)
novelUtronTpms = pd.concat([novelUtronTpms, sampleTpms], axis=1)
# Loop through samples and get proportions
novelUtronProportions = getGeneTotals(1, 1, novelUtronTpms)
for num in range(2,sampleCount,1):
if num != 252 and num != 347:
sampleProportions = getGeneTotals(num, 2, novelUtronTpms)
novelUtronProportions = pd.concat([novelUtronProportions, sampleProportions], axis=1)
In [6]:
"""
GETTING ALL UTRON TPMS AND PROPORTIONS USING THE ABOVE FUNCTIONS
"""
# Loop through samples and get tpms
allUtronTpms = getTpms("ZF-WT-1", 1, 1, allUtronsDf)
for num in range(2,sampleCount,1):
if num != 252 and num != 347:
sample = "ZF-WT-%d" % num
sampleTpms = getTpms(sample, 2, num, allUtronsDf)
allUtronTpms = pd.concat([allUtronTpms, sampleTpms], axis=1)
# Loop through samples and get proportions
allUtronProportions = getGeneTotals(1, 1, allUtronTpms)
for num in range(2,sampleCount,1):
if num != 252 and num != 347:
sampleProportions = getGeneTotals(num, 2, allUtronTpms)
allUtronProportions = pd.concat([allUtronProportions, sampleProportions], axis=1)
In [7]:
"""
PICK A SET OF RANDOM TXS AND GET PROPORTIONS FOR THEM
"""
# Remove null values (cant find proportions for these genes)
transcriptIds = transcriptIds[transcriptIds["match_id"].notnull()]
# pick a set of random numbers and add the corresponding genes to a db - randomtxs
indexvals = transcriptIds.index.tolist()
numTxs = 33673
randlist = []
picked = []
# Pick random set of numbers and add to list
while len(randlist) < numTxs:
randomnum = random.randint(0,len(indexvals)-1)
if randomnum not in randlist:
randlist.append(randomnum)
# Pick the numbers generated from above
for num in randlist:
picked.append(indexvals[num])
# Select transcripts corresponding to random numbers
randomTxs = transcriptIds.filter(picked, axis=0)
# Loop through samples and get tpms
randomTpms = getTpms("ZF-WT-1", 1, 1, randomTxs)
for num in range(2,sampleCount,1):
if num != 252 and num != 347:
sample = "ZF-WT-%d" % num
sampleTpms = getTpms(sample, 2, num, randomTxs)
randomTpms = pd.concat([randomTpms, sampleTpms], axis=1)
# Loop through samples and get proportions
randomProportions = getGeneTotals(1, 1, randomTpms)
for num in range(2,sampleCount,1):
if num != 252 and num != 347:
sampleProportions = getGeneTotals(num, 2, randomTpms)
randomProportions = pd.concat([randomProportions, sampleProportions], axis=1)
In [81]:
"""
FUNCTION TO GET CORRELATIONS
- Takes 2 dataframes - proportions and tpms
- Return pearson correlation as a list
"""
# Set Filter (needs x tpm in atleast x samples to meet filter)
tpmThreshold = 5
sampleThreshold1 = 10
sampleThreshold2 = 100
def getCorrelations(proportionDf, tpmDf):
a = proportionDf.sort("Name")
b = tpmDf.sort("Name")
numTranscripts = len(a)
transcriptRange = range(0, numTranscripts-1, 1)
indexVals = a.index
Correlations = []
#for num in transcriptRange:
for num in indexVals:
proportions = a.ix[num,2:].tolist()
tpms = b.ix[num,2:].tolist()
stackedValues = np.column_stack([tpms, proportions])
stackedValues = stackedValues[stackedValues[:,0] > tpmThreshold]
if len(stackedValues) > sampleThreshold1 and len(stackedValues) < sampleThreshold2:
correlation = pearsonr(stackedValues[:,1], stackedValues[:,0])
Correlations.append(correlation[0])
return Correlations
In [85]:
novelUtronCorrelations = getCorrelations(novelUtronProportions, novelUtronTpms)
allUtronCorrelations = getCorrelations(allUtronProportions, allUtronTpms)
RandomCorrelations = getCorrelations(randomProportions, randomTpms)
In [89]:
%pylab inline
pylab.hist(novelUtronCorrelations, bins=30, range=(-1,1), edgecolor="black", normed=True, label="novel UTRons", color="red")
pylab.hist(allUtronCorrelations, bins=30, range=(-1,1), edgecolor="black", normed=True, label="known UTRons", color="blue", alpha=0.5,)
pylab.legend()
pylab.savefig("./images/10_FINALnovelVsKnown", dpi=300)
In [90]:
%pylab inline
pylab.hist(RandomCorrelations, bins=30, range=(-1,1), edgecolor="black", normed=True, label="Random Transcripts", color="red")
pylab.hist(allUtronCorrelations, bins=30, range=(-1,1), edgecolor="black", normed=True, label="known UTRons", color="blue", alpha=0.5,)
pylab.legend()
pylab.savefig("./images/10_FINALknownVsRand", dpi=300)
In [91]:
%pylab inline
pylab.hist(RandomCorrelations, bins=30, range=(-1,1), edgecolor="black", normed=True, label="Random Transcripts", color="red")
pylab.hist(novelUtronCorrelations, bins=30, range=(-1,1), edgecolor="black", normed=True, label="novel UTRons", color="blue", alpha=0.5,)
pylab.legend()
pylab.savefig("./images/10_FINALnovelVsRand", dpi=300)
In [186]:
"""
Plotting novel UTRons correlations based on UTRon length
"""
Length = 200
lengthsFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/SpliceSite/novelLengths.txt"
lengthsInfo = pd.read_csv(lengthsFile, "\t")
shortIds = lengthsInfo[lengthsInfo["Length"]<=Length]["transcript_id"]
shortUtronTpms = novelUtronTpms[novelUtronTpms["Name"].isin(shortIds)]
shortUtronProportions = novelUtronProportions[novelUtronProportions["Name"].isin(shortIds)]
longUtronTpms = novelUtronTpms[~novelUtronTpms["Name"].isin(shortIds)]
longUtronProportions = novelUtronProportions[~novelUtronProportions["Name"].isin(shortIds)]
shortUtronCorrelations = getCorrelations(shortUtronProportions, shortUtronTpms)
longUtronCorrelations = getCorrelations(longUtronProportions, longUtronTpms)
pylab.hist(shortUtronCorrelations, bins=20, range=(-1,1), edgecolor="black", normed=True, color="blue", label="<200 bp")
pylab.hist(longUtronCorrelations, bins=20, range=(-1,1), edgecolor="black", normed=True, color="red", alpha=0.8,label=">200 bp")
pylab.legend()
pylab.savefig("10_FINALlengthSplit", dpi=300)
In [41]:
"""
Plotting correlation of novel Utrons based on splice site type
"""
# File of novel utrons with non-canonical splice sites
badsplicefile = "/shared/sudlab1/General/projects/utrons_project/misc_files/SpliceSite/novelUtrons_unknown_100000bp.txt"
badspliceUtrons = pd.read_csv(badsplicefile, header=None, sep="\t", index_col=0)[1].tolist()
"""
Function returns the splice site status (1) = known, (0) = unknown
"""
def spliceStatus(row):
tx = row["Name"]
if tx in badspliceUtrons:
return 0
else:
return 1
novelTpms = novelUtronTpms
novelProps = novelUtronProportions
novelTpms["Splice"] = novelTpms.apply(spliceStatus, axis=1)
novelProps["Splice"] = novelProps.apply(spliceStatus, axis=1)
knownSpliceTpms = novelTpms[novelTpms["Splice"]==1]
unknownSpliceTpms = novelTpms[novelTpms["Splice"]==0]
knownSpliceProps = novelProps[novelProps["Splice"]==1]
unknownSpliceProps = novelProps[novelProps["Splice"]==0]
In [93]:
knownSpliceCorrelations = getCorrelations(knownSpliceProps, knownSpliceTpms)
unknownSpliceCorrelations = getCorrelations(unknownSpliceProps, unknownSpliceTpms)
pylab.hist(knownSpliceCorrelations, bins=30, range=(-1,1), edgecolor="black", normed=True, color="blue", label="known splice site")
pylab.hist(unknownSpliceCorrelations, bins=30, range=(-1,1), edgecolor="black", normed=True, color="red", alpha=0.8, label="unknown splice site")
pylab.legend()
pylab.savefig("./images/10_FINALknownSpliceVsUnknown_10-100_5tpm")
In [94]:
shortUnknownUtronTpms = unknownSpliceTpms[unknownSpliceTpms["Name"].isin(shortIds)]
shortUnknownUtronProps = unknownSpliceProps[unknownSpliceProps["Name"].isin(shortIds)]
longKnownUtronTpms = knownSpliceTpms[~knownSpliceTpms["Name"].isin(shortIds)]
longKnownUtronProps = knownSpliceProps[~knownSpliceProps["Name"].isin(shortIds)]
longKnownSpliceCorrelations = getCorrelations(longKnownUtronProps, longKnownUtronTpms)
shortUnknownSpliceCorrelations = getCorrelations(shortUnknownUtronProps, shortUnknownUtronTpms)
In [96]:
pylab.hist(longKnownSpliceCorrelations, bins=30, range=(-1,1), edgecolor="black", normed=True, color="blue", label=">200bp and Known Splice Site")
pylab.hist(shortUnknownSpliceCorrelations, bins=30, range=(-1,1), edgecolor="black", normed=True, color="red", alpha=0.8, label="<200bp and Unknown Splice Site")
pylab.legend(fontsize="small")
pylab.savefig("./images/10_filteredSpliceLength", dpi=300)
In [178]:
"""
Plotting based on suppressors vs oncogenes
"""
# File with cancer genes
TPM = 20
oncoFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/oncogenes_%dTPM.txt" % TPM
suppFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/suppresor_%dTPM.txt" % TPM
unknFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/unknown_%dTPM.txt" % TPM
oncoDf = pd.read_csv(oncoFile, sep="\t", header=None)
suppDf = pd.read_csv(suppFile, sep="\t", header=None)
unknDf = pd.read_csv(unknFile, sep="\t", header=None)
oncoTpms = allUtronTpms[allUtronTpms["Name"].isin(oncoDf[0])]
oncoProps = allUtronProportions[allUtronProportions["Name"].isin(oncoDf[0])]
suppTpms = allUtronTpms[allUtronTpms["Name"].isin(suppDf[0])]
suppProps = allUtronProportions[allUtronProportions["Name"].isin(suppDf[0])]
unknTpms = allUtronTpms[allUtronTpms["Name"].isin(unknDf[0])]
unknProps = allUtronProportions[allUtronProportions["Name"].isin(unknDf[0])]
oncoCorrelations = getCorrelations(oncoProps, oncoTpms)
unknCorrelations = getCorrelations(unknProps, unknTpms)
suppCorrelations = getCorrelations(suppProps,suppTpms)
In [179]:
pylab.hist(oncoCorrelations, bins=25, range=(-1,1), edgecolor="black", normed=True, color="red")
pylab.hist(suppCorrelations, bins=25, range=(-1,1), edgecolor="black", normed=True, color="blue", alpha=0.5,)
pylab.legend()