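Command used to count overlaps (with `-c`, bedtools intersect reports, for each feature in `-a`, the number of features in `-b` that overlap it):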
bedtools intersect -c -a novel_utrons.bed -b TS_predictions_hg38_liftover.bed
> overlap.txt
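The file microRNAOverlap.txt in misc_files holds the number of microRNA overlaps for these genes; the same count was produced for both novel and known utrons (the known-utron counts are read from allUtronsMicroRnaOverlap.txt below).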
In [41]:
import pandas as pd
import sqlite3
import pandas as pd
import numpy as np
# Genes listed in systematicUtronGenes.txt (tab-separated, no header row);
# column 0 of this frame is used further down to exclude genes by name.
systematicUtronsFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/systematicUtronGenes.txt"
systematicUtrons = pd.read_csv(systematicUtronsFile, sep="\t", header=None)

# Open the project database, then attach the annotation database under the
# schema name "annotations" so one query can join tables from both.
cnx = sqlite3.connect('/shared/sudlab1/General/projects/utrons_project/BladderCancerUtrons/431BladderUtrons.db')
cnx.execute("ATTACH '/shared/sudlab1/General/annotations/hg38_noalt_ensembl85/csvdb' as annotations")
In [33]:
def _readOverlapTable(path):
    """Read a header-less table whose fields are split on a single space or a tab."""
    # A regex separator requires the python parsing engine.
    return pd.read_csv(path, sep=" |\t", engine="python", header=None)


novelOverlapFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/microRNAOverlap.txt"
allOverlapFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/allUtronsMicroRnaOverlap.txt"

novelOverlap = _readOverlapTable(novelOverlapFile)
allOverlap = _readOverlapTable(allOverlapFile)
def getId(row, suffixLength=16):
    """Return the value in column 3 of ``row`` with a fixed-length suffix removed.

    Parameters
    ----------
    row : mapping or sequence
        Anything indexable with ``row[3]`` (e.g. a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``); ``row[3]`` must support ``len`` and
        slicing.
    suffixLength : int, optional
        Number of trailing characters to drop. Defaults to 16, the value this
        notebook originally hard-coded.
        NOTE(review): presumably the length of a suffix appended to the
        transcript id in the name field of the novel-utron bed file - confirm.

    Returns
    -------
    The leading part of ``row[3]``; empty if ``row[3]`` is no longer than
    ``suffixLength``.
    """
    name = row[3]
    # Computing the cut point explicitly (instead of name[:-suffixLength]) keeps
    # suffixLength=0 meaningful: name[:-0] would return an empty string.
    keep = max(len(name) - suffixLength, 0)
    return name[:keep]
# Novel table: the id is derived from column 3 by getId; only the id and
# column 7 are kept.
# Known table: column 3 is used as the id unchanged; only the id and column 12
# are kept.
# NOTE(review): columns 7 and 12 are compared against 0 further down, so they
# are presumably the overlap counts written by `bedtools intersect -c` - confirm.
novelOverlap = novelOverlap.assign(GeneId=novelOverlap.apply(getId, axis=1))[["GeneId", 7]]
allOverlap = allOverlap.assign(GeneId=allOverlap[3])[["GeneId", 12]]
In [43]:
# Transcript id / gene name pairs for novel utrons: rows of novel_utrons_ids on
# track 'agg-agg-agg' whose transcript_id starts with MSTRG, joined to the
# attached annotation table through match_transcript_id.
# The LIKE pattern is a single-quoted SQL string literal; the original
# double-quoted form is only accepted by SQLite as a legacy quirk (it is parsed
# as an identifier first) and is rejected by builds with that quirk disabled.
query_text1 = '''
SELECT uid.transcript_id AS Name, ti.gene_name AS Gene
FROM novel_utrons_ids AS uid
INNER JOIN annotations.transcript_info AS ti
ON ti.transcript_id = uid.match_transcript_id
WHERE uid.track='agg-agg-agg' AND uid.transcript_id like 'MSTRG%'
ORDER BY uid.transcript_id
'''
novelIds = pd.read_sql_query(query_text1, cnx)

# Drop every transcript whose gene appears in column 0 of systematicUtrons.
isSystematicNovel = novelIds["Gene"].isin(systematicUtrons[0])
novelIds = novelIds[~isSystematicNovel]
# Transcript id / gene name pairs for known utrons: rows of all_utrons_ids on
# track 'agg-agg-agg' whose transcript_id starts with ENS, joined to
# transcript_class on transcript_id and from there to the attached annotation
# table through match_transcript_id.
# The LIKE pattern is a single-quoted SQL string literal; the original
# double-quoted form is only accepted by SQLite as a legacy quirk (it is parsed
# as an identifier first) and is rejected by builds with that quirk disabled.
query_text1 = '''
SELECT uid.transcript_id AS Name, ti.gene_name AS Gene
FROM all_utrons_ids AS uid
INNER JOIN transcript_class AS tc
ON tc.transcript_id = uid.transcript_id
INNER JOIN annotations.transcript_info AS ti
ON ti.transcript_id = tc.match_transcript_id
WHERE uid.track='agg-agg-agg' AND uid.transcript_id like 'ENS%'
ORDER BY uid.transcript_id
'''
knownIds = pd.read_sql_query(query_text1, cnx)

# Drop every transcript whose gene appears in column 0 of systematicUtrons.
isSystematicKnown = knownIds["Gene"].isin(systematicUtrons[0])
knownIds = knownIds[~isSystematicKnown]
In [56]:
# Keep only overlap rows whose id survived the database queries above.
keepNovel = novelOverlap["GeneId"].isin(novelIds["Name"])
novelOverlap = novelOverlap[keepNovel]
keepKnown = allOverlap["GeneId"].isin(knownIds["Name"])
allOverlap = allOverlap[keepKnown]

# One line per table: rows with a positive count, then rows with a zero count.
# A single %-formatted argument prints identically on Python 2 and Python 3.
for overlapTable, countColumn in ((novelOverlap, 7), (allOverlap, 12)):
    counts = overlapTable[countColumn]
    print("%s %s" % (len(overlapTable[counts > 0]), len(overlapTable[counts == 0])))
In [166]:
# Tab-separated table with a header row; the "transcript_id" and "Length"
# columns are the ones read by getPercents.
novelLengthsFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/SpliceSite/novelLengths.txt"
lengthInfo = pd.read_csv(novelLengthsFile, sep="\t")
def getPercents(length, lengths=None, overlaps=None):
    """Proportion of rows with a positive count, split at a length cut-off.

    Rows of ``overlaps`` are split into a "short" group (``GeneId`` is among the
    ``transcript_id`` values of ``lengths`` with ``Length <= length``) and a
    "long" group (every other row, including ids absent from ``lengths``).
    Within each group the result is::

        rows with column 7 > 0 / (rows with column 7 > 0 + rows with column 7 == 0)

    Despite the name, the values are proportions in [0, 1], not percentages.

    Parameters
    ----------
    length : number
        Inclusive upper bound on ``Length`` for the short group.
    lengths : DataFrame, optional
        Table with ``transcript_id`` and ``Length`` columns. Defaults to the
        notebook-level ``lengthInfo``.
    overlaps : DataFrame, optional
        Table with ``GeneId`` and ``7`` columns. Defaults to the notebook-level
        ``novelOverlap``.

    Returns
    -------
    (float, float)
        ``(short_proportion, long_proportion)``. A group with no countable rows
        yields ``nan`` instead of raising ZeroDivisionError, so a sweep over
        cut-offs can still be plotted.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if lengths is None:
        lengths = lengthInfo
    if overlaps is None:
        overlaps = novelOverlap

    shortIds = lengths[lengths["Length"] <= length]["transcript_id"]
    isShort = overlaps["GeneId"].isin(shortIds)

    def _proportionWithOverlap(group):
        # Rows whose count is neither > 0 nor == 0 (e.g. NaN) are left out of
        # the denominator.
        withOverlap = len(group[group[7] > 0])
        withoutOverlap = len(group[group[7] == 0])
        total = withOverlap + withoutOverlap
        if total == 0:
            return float("nan")
        return withOverlap / float(total)

    return _proportionWithOverlap(overlaps[isShort]), _proportionWithOverlap(overlaps[~isShort])
%pylab inline
shortList = []
longList = []
lengthList = []
for num in range(25,2000,10):
a, b = getPercents(num)
shortList.append(a)
longList.append(b)
lengthList.append(num)
pylab.plot(lengthList, longList)
pylab.ylim(0,0.15)
pylab.xlabel("Length")
pylab.ylabel("Proportion")
pylab.savefig("./images/LengthVsMicroRNA", dpi=300)
In [108]:
# Show the "long" proportions computed for each cut-off.
print(longList)
In [131]:
# Value substituted into the three file names below.
# NOTE(review): presumably a TPM expression threshold - confirm.
tpm = 20


def _readIdList(path):
    """Read a tab-separated file with no header row."""
    return pd.read_csv(path, sep="\t", header=None)


# Column 0 of each frame is matched against GeneId downstream.
supDf = _readIdList("/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/suppresor_%dTPM.txt" % tpm)
oncDf = _readIdList("/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/oncogenes_%dTPM.txt" % tpm)
unkDf = _readIdList("/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/unknown_%dTPM.txt" % tpm)
In [134]:
def _selectIds(overlapTable, idFrame):
    """Rows of ``overlapTable`` whose GeneId appears in column 0 of ``idFrame``."""
    return overlapTable[overlapTable["GeneId"].isin(idFrame[0])]


def _fractionWithOverlap(overlapTable, countColumn):
    """Fraction of rows of ``overlapTable`` with ``countColumn`` > 0.

    Returns nan for an empty table instead of raising ZeroDivisionError.
    """
    total = len(overlapTable)
    if total == 0:
        return float("nan")
    return len(overlapTable[overlapTable[countColumn] > 0]) / float(total)


# Column 12 holds the count in the known table, column 7 in the novel table.
allOncOverlap = _selectIds(allOverlap, oncDf)
novelOncOverlap = _selectIds(novelOverlap, oncDf)
print("oncogenes")
print("known %s" % _fractionWithOverlap(allOncOverlap, 12))
print("novel %s" % _fractionWithOverlap(novelOncOverlap, 7))

allSupOverlap = _selectIds(allOverlap, supDf)
novelSupOverlap = _selectIds(novelOverlap, supDf)
print("\nSuppressors")
print("known %s" % _fractionWithOverlap(allSupOverlap, 12))
print("novel %s" % _fractionWithOverlap(novelSupOverlap, 7))

allUnkOverlap = _selectIds(allOverlap, unkDf)
novelUnkOverlap = _selectIds(novelOverlap, unkDf)
print("\nUnknown")
# Fix: these fractions were previously divided by the size of the suppressor
# tables (copy-paste slip); each is now divided by the size of its own table.
print("known %s" % _fractionWithOverlap(allUnkOverlap, 12))
print("novel %s" % _fractionWithOverlap(novelUnkOverlap, 7))
In [168]:
# Attach gene names to the oncogene overlap rows (inner join of transcript
# Name on GeneId), then list the distinct gene names alphabetically.
# NOTE(review): no filter on the overlap count is applied to the merged rows -
# confirm that matches the "removing microRNAs" headings.
oncGenes1 = knownIds.merge(allOncOverlap, left_on="Name", right_on="GeneId")
oncGenes2 = novelIds.merge(novelOncOverlap, left_on="Name", right_on="GeneId")

print("Known Oncogenes removing microRNAs")
print(sorted(oncGenes1["Gene"].unique()))
print("\n\nNovel Oncogenes removing microRNAs")
print(sorted(oncGenes2["Gene"].unique()))
In [169]:
# Attach gene names to the suppressor overlap rows (inner join of transcript
# Name on GeneId), then list the distinct gene names alphabetically.
# NOTE(review): no filter on the overlap count is applied to the merged rows -
# confirm that matches the "removing microRNAs" headings.
supGenes1 = knownIds.merge(allSupOverlap, left_on="Name", right_on="GeneId")
supGenes2 = novelIds.merge(novelSupOverlap, left_on="Name", right_on="GeneId")

print("Known suppressors removing microRNAs")
print(sorted(supGenes1["Gene"].unique()))
print("\n\nNovel suppressors removing microRNAs")
print(sorted(supGenes2["Gene"].unique()))
In [157]:
# Summary table: one row per merged frame giving its number of rows
# (transcripts) and its number of distinct gene names.
print("%s %s" % ("\t\tTranscripts", "\tGenes"))

summaryRows = (
    ("known Oncogenes \t %s \t %s", oncGenes1),
    ("novel Oncogenes \t %s \t %s", oncGenes2),
    ("known suppressors \t %s \t %s", supGenes1),
    ("novel suppressors \t %s \t %s", supGenes2),
)
for rowTemplate, mergedGenes in summaryRows:
    print(rowTemplate % (len(mergedGenes), len(mergedGenes["Gene"].unique())))
In [165]:
suppressorOutFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/suppressorMicroRNA.txt"
oncogeneOutFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/oncogeneMicroRNA.txt"

# Stack the known rows on top of the novel rows for each gene class.
supjoined = pd.concat([supGenes1, supGenes2])
oncjoined = pd.concat([oncGenes1, oncGenes2])

# Written tab-separated with neither a header row nor the index column.
supjoined.to_csv(suppressorOutFile, sep="\t", header=None, index=None)
oncjoined.to_csv(oncogeneOutFile, sep="\t", header=None, index=None)