Command used to compute the overlaps:
bedtools intersect -c -a novel_utrons.bed -b TS_predictions_hg38_liftover.bed
> overlap.txt
The file microRNAOverlap.txt in misc_files contains the number of microRNA overlaps for these genes (computed for both novel and known utrons).
In [41]:
    
import pandas as pd
import sqlite3
import pandas as pd
import numpy as np
# Open the utron results database and attach the annotation database
# (hg38_noalt_ensembl85) under the schema name "annotations", so that a single
# query can join tables from both.
cnx = sqlite3.connect(
    '/shared/sudlab1/General/projects/utrons_project/BladderCancerUtrons/431BladderUtrons.db'
)
cnx.execute(
    "ATTACH '/shared/sudlab1/General/annotations/hg38_noalt_ensembl85/csvdb' as annotations"
)

# `systematicUtrons` first holds the file path and is then rebound to the
# parsed table. The file is tab-separated with no header row, so its columns
# are labelled 0, 1, ...
systematicUtrons = "/shared/sudlab1/General/projects/utrons_project/misc_files/systematicUtronGenes.txt"
systematicUtrons = pd.read_csv(
    systematicUtrons,
    header=None,
    sep="\t",
)
    
In [33]:
    
# Overlap tables for the novel utrons and for all utrons. Neither file has a
# header row, so columns are addressed by position. Fields are split on either
# a single space or a tab; a regex separator requires the python parser engine.
novelOverlapFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/microRNAOverlap.txt"
allOverlapFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/allUtronsMicroRnaOverlap.txt"

novelOverlap = pd.read_csv(
    novelOverlapFile,
    header=None,
    engine="python",
    sep=" |\t",
)
allOverlap = pd.read_csv(
    allOverlapFile,
    header=None,
    engine="python",
    sep=" |\t",
)
def getId(row):
    """Return field 3 of `row` with its last 16 characters removed.

    NOTE(review): the fixed 16-character suffix is presumably extra text
    appended to the transcript id in the name field of the novel-utron
    overlap file -- confirm against the file contents.
    """
    suffixLength = 16
    return row[3][:-suffixLength]
# Novel utrons: derive the id from the name field (column 3) via getId, then
# keep only the id and column 7, which later cells treat as the overlap count.
novelOverlap["GeneId"] = novelOverlap.apply(getId, axis=1)
novelOverlap = novelOverlap[["GeneId", 7]]

# All utrons: column 3 is used as the id unchanged; column 12 is the column
# later cells treat as the overlap count.
allOverlap["GeneId"] = allOverlap[3]
allOverlap = allOverlap[["GeneId", 12]]
    
In [43]:
    
# Novel (MSTRG-prefixed) utron transcripts on track 'agg-agg-agg', each paired
# with the gene name of the annotated transcript it was matched to.
# The LIKE pattern is written with single quotes: SQLite treats a
# double-quoted token as an identifier first and only falls back to a string
# literal as a legacy behaviour that can be disabled at build time.
query_text1 = '''
    SELECT uid.transcript_id AS Name, ti.gene_name AS Gene
    FROM novel_utrons_ids AS uid
    INNER JOIN annotations.transcript_info AS ti
    ON ti.transcript_id = uid.match_transcript_id
    WHERE uid.track='agg-agg-agg' AND uid.transcript_id like 'MSTRG%'
    ORDER BY uid.transcript_id
    '''
novelIds = pd.read_sql_query(query_text1, cnx)

# Drop transcripts whose gene appears in column 0 of the systematic-utron list.
novelIds = novelIds[~novelIds["Gene"].isin(systematicUtrons[0])]
# Known (ENS-prefixed) utron transcripts on track 'agg-agg-agg'. The gene name
# is reached through transcript_class, whose match_transcript_id links each
# transcript to a row of the annotation's transcript_info table.
# The LIKE pattern is written with single quotes: SQLite treats a
# double-quoted token as an identifier first and only falls back to a string
# literal as a legacy behaviour that can be disabled at build time.
query_text1 = '''
    SELECT uid.transcript_id AS Name, ti.gene_name AS Gene
    FROM all_utrons_ids AS uid
    INNER JOIN transcript_class AS tc
    ON tc.transcript_id = uid.transcript_id
    INNER JOIN annotations.transcript_info AS ti
    ON ti.transcript_id = tc.match_transcript_id 
    WHERE uid.track='agg-agg-agg' AND uid.transcript_id like 'ENS%'
    ORDER BY uid.transcript_id
    '''
knownIds = pd.read_sql_query(query_text1, cnx)

# Drop transcripts whose gene appears in column 0 of the systematic-utron list.
knownIds = knownIds[~knownIds["Gene"].isin(systematicUtrons[0])]
    
In [56]:
    
# Restrict each overlap table to the transcripts present in the matching id
# table (novelIds / knownIds).
novelOverlap = novelOverlap[novelOverlap["GeneId"].isin(novelIds["Name"])]
allOverlap = allOverlap[allOverlap["GeneId"].isin(knownIds["Name"])]

# One line per table: "<rows with count > 0> <rows with count == 0>", novel
# first, then known. Single-argument print(...) gives the same output on
# Python 2 and also parses on Python 3.
for overlapTable, countColumn in ((novelOverlap, 7), (allOverlap, 12)):
    overlapCounts = overlapTable[countColumn]
    print("%d %d" % (len(overlapTable[overlapCounts > 0]),
                     len(overlapTable[overlapCounts == 0])))
    
    
In [166]:
    
# Tab-separated table with a header row; getPercents reads its
# "transcript_id" and "Length" columns.
lengthInfo = pd.read_csv(
    "/shared/sudlab1/General/projects/utrons_project/misc_files/SpliceSite/novelLengths.txt",
    sep="\t",
)
def getPercents(length):
    """Return the fraction of novel utrons with a microRNA overlap, split by length.

    Reads the module-level tables `lengthInfo` (columns "transcript_id" and
    "Length") and `novelOverlap` (columns "GeneId" and 7, where column 7 is
    the overlap count).

    Parameters
    ----------
    length : number
        Cut-off. Rows of `novelOverlap` whose id has ``Length <= length`` in
        `lengthInfo` form the "short" group; every other row, including ids
        that do not appear in `lengthInfo` at all, forms the "long" group.

    Returns
    -------
    tuple of float
        ``(shortFraction, longFraction)``. Each is the number of rows with
        count > 0 divided by the number of rows with count > 0 or count == 0.
        A group with no such rows yields NaN instead of raising
        ZeroDivisionError, so a sweep over many cut-offs cannot abort
        part-way through.
    """
    shortIds = lengthInfo[lengthInfo["Length"] <= length]["transcript_id"]
    isShort = novelOverlap["GeneId"].isin(shortIds)

    def fractionWithOverlap(group):
        # Only rows with count > 0 or count == 0 enter the denominator.
        overlapping = len(group[group[7] > 0])
        nonOverlapping = len(group[group[7] == 0])
        total = overlapping + nonOverlapping
        if total == 0:
            return float("nan")
        return overlapping / float(total)

    return (fractionWithOverlap(novelOverlap[isShort]),
            fractionWithOverlap(novelOverlap[~isShort]))
    
%pylab inline
shortList = []
longList = []
lengthList = []
for num in range(25,2000,10):
    a, b = getPercents(num)
    shortList.append(a)
    longList.append(b)
    lengthList.append(num)
pylab.plot(lengthList, longList)
pylab.ylim(0,0.15)
pylab.xlabel("Length")
pylab.ylabel("Proportion")
pylab.savefig("./images/LengthVsMicroRNA", dpi=300)
    
    
    
In [108]:
    
# Dump the long-group fractions. Single-argument print(...) gives the same
# output on Python 2 and also parses on Python 3.
print(longList)
    
    
In [131]:
    
# The cancer-gene list files are named by a TPM value, which is substituted
# into each file name below.
tpm = 20

# Three header-less, tab-separated lists; column 0 of each is what later cells
# match against the overlap tables' "GeneId" column.
supDf = pd.read_csv(
    "/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/suppresor_%dTPM.txt" % tpm,
    header=None,
    sep="\t",
)
oncDf = pd.read_csv(
    "/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/oncogenes_%dTPM.txt" % tpm,
    header=None,
    sep="\t",
)
unkDf = pd.read_csv(
    "/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/unknown_%dTPM.txt" % tpm,
    header=None,
    sep="\t",
)
    
In [134]:
    
def _overlapFraction(overlapTable, countColumn):
    """Return the fraction of rows in `overlapTable` whose `countColumn` is > 0.

    An empty table yields NaN instead of raising ZeroDivisionError.
    """
    total = len(overlapTable)
    if total == 0:
        return float("nan")
    return len(overlapTable[overlapTable[countColumn] > 0]) / float(total)


# Split both overlap tables by cancer-gene category. Column 0 of each category
# list is matched against "GeneId".
allOncOverlap = allOverlap[allOverlap["GeneId"].isin(oncDf[0])]
novelOncOverlap = novelOverlap[novelOverlap["GeneId"].isin(oncDf[0])]
allSupOverlap = allOverlap[allOverlap["GeneId"].isin(supDf[0])]
novelSupOverlap = novelOverlap[novelOverlap["GeneId"].isin(supDf[0])]
allUnkOverlap = allOverlap[allOverlap["GeneId"].isin(unkDf[0])]
novelUnkOverlap = novelOverlap[novelOverlap["GeneId"].isin(unkDf[0])]

# Fraction of each category with at least one overlap. The count column is 12
# for the known table and 7 for the novel table.
print("oncogenes")
print("known %s" % _overlapFraction(allOncOverlap, 12))
print("novel %s" % _overlapFraction(novelOncOverlap, 7))

print("\nSuppressors")
print("known %s" % _overlapFraction(allSupOverlap, 12))
print("novel %s" % _overlapFraction(novelSupOverlap, 7))

# The "Unknown" fractions are divided by the size of the unknown tables
# themselves; dividing by the suppressor table sizes would give a wrong ratio.
print("\nUnknown")
print("known %s" % _overlapFraction(allUnkOverlap, 12))
print("novel %s" % _overlapFraction(novelUnkOverlap, 7))
    
    
In [168]:
    
# Attach gene names to the oncogene overlap rows (known, then novel) by joining
# the id tables on transcript id, then list the distinct genes alphabetically.
# NOTE(review): allOncOverlap / novelOncOverlap are not filtered on count > 0,
# so these lists also contain transcripts with zero overlaps -- confirm that
# matches the "removing microRNAs" headings.
oncGenes1 = pd.merge(knownIds, allOncOverlap, left_on="Name", right_on="GeneId")
oncGenes2 = pd.merge(novelIds, novelOncOverlap, left_on="Name", right_on="GeneId")

# Single-argument print(...) gives the same output on Python 2 and also parses
# on Python 3.
print("Known Oncogenes removing microRNAs")
print(sorted(oncGenes1["Gene"].unique()))
print("\n\nNovel Oncogenes removing microRNAs")
print(sorted(oncGenes2["Gene"].unique()))
    
    
In [169]:
    
# Attach gene names to the suppressor overlap rows (known, then novel) by
# joining the id tables on transcript id, then list the distinct genes
# alphabetically.
# NOTE(review): allSupOverlap / novelSupOverlap are not filtered on count > 0,
# so these lists also contain transcripts with zero overlaps -- confirm that
# matches the "removing microRNAs" headings.
supGenes1 = pd.merge(knownIds, allSupOverlap, left_on="Name", right_on="GeneId")
supGenes2 = pd.merge(novelIds, novelSupOverlap, left_on="Name", right_on="GeneId")

# Single-argument print(...) gives the same output on Python 2 and also parses
# on Python 3.
print("Known suppressors removing microRNAs")
print(sorted(supGenes1["Gene"].unique()))
print("\n\nNovel suppressors removing microRNAs")
print(sorted(supGenes2["Gene"].unique()))
    
    
In [157]:
    
# Summary table: merged rows (transcripts) and distinct gene names per group.
# Single-argument print(...) gives the same output on Python 2 and also parses
# on Python 3.
print("\t\tTranscripts \tGenes")
for rowLabel, geneTable in (
    ("known Oncogenes", oncGenes1),
    ("novel Oncogenes", oncGenes2),
    ("known suppressors", supGenes1),
    ("novel suppressors", supGenes2),
):
    print("%s \t %s \t %s" % (rowLabel, len(geneTable), len(geneTable["Gene"].unique())))
    
    
In [165]:
    
# Stack the known and novel tables for each category and write them out as
# tab-separated text, passing header=None and index=None to to_csv.
supjoined = pd.concat([supGenes1, supGenes2])
oncjoined = pd.concat([oncGenes1, oncGenes2])

supjoined.to_csv(
    "/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/suppressorMicroRNA.txt",
    sep="\t",
    header=None,
    index=None,
)
oncjoined.to_csv(
    "/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/oncogeneMicroRNA.txt",
    sep="\t",
    header=None,
    index=None,
)