Finding the overlap between novel UTRons and microRNA sites predicted from TargetScan

Statement to find overlap:

bedtools intersect -c -a novel_utrons.bed -b TS_predictions_hg38_liftover.bed

> overlap.txt

File in misc_files microRNAoverlap.txt has the number of overlaps with microRNAs for these genes (done for both novel and known)


In [41]:
import pandas as pd
import sqlite3
import pandas as pd
import numpy as np

cnx = sqlite3.connect('/shared/sudlab1/General/projects/utrons_project/BladderCancerUtrons/431BladderUtrons.db')
cnx.execute("ATTACH '/shared/sudlab1/General/annotations/hg38_noalt_ensembl85/csvdb' as annotations")

systematicUtrons = "/shared/sudlab1/General/projects/utrons_project/misc_files/systematicUtronGenes.txt"
systematicUtrons = pd.read_csv(systematicUtrons, sep="\t", header=None)

In [33]:
novelOverlapFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/microRNAOverlap.txt"
allOverlapFile = "/shared/sudlab1/General/projects/utrons_project/misc_files/allUtronsMicroRnaOverlap.txt"

novelOverlap = pd.read_csv(novelOverlapFile, sep=" |\t", engine="python", header=None)
allOverlap = pd.read_csv(allOverlapFile, sep=" |\t", engine="python", header=None)

def getId(row):
    Id = row[3][:-16]
    return Id

novelOverlap["GeneId"] = novelOverlap.apply(getId, axis=1)
allOverlap["GeneId"] = allOverlap[3]

novelOverlap = novelOverlap[["GeneId", 7]]
allOverlap = allOverlap[["GeneId", 12]]

In [43]:
query_text1 = '''
    SELECT uid.transcript_id AS Name, ti.gene_name AS Gene
    FROM novel_utrons_ids AS uid
    INNER JOIN annotations.transcript_info AS ti
    ON ti.transcript_id = uid.match_transcript_id
    WHERE uid.track='agg-agg-agg' AND uid.transcript_id like "MSTRG%"
    ORDER BY uid.transcript_id
    '''
novelIds = pd.read_sql_query(query_text1, cnx)
novelIds = novelIds[~novelIds["Gene"].isin(systematicUtrons[0])]


query_text1 = '''
    SELECT uid.transcript_id AS Name, ti.gene_name AS Gene
    FROM all_utrons_ids AS uid
    INNER JOIN transcript_class AS tc
    ON tc.transcript_id = uid.transcript_id
    INNER JOIN annotations.transcript_info AS ti
    ON ti.transcript_id = tc.match_transcript_id 
    WHERE uid.track='agg-agg-agg' AND uid.transcript_id like "ENS%"
    ORDER BY uid.transcript_id
    '''
knownIds = pd.read_sql_query(query_text1, cnx)
knownIds = knownIds[~knownIds["Gene"].isin(systematicUtrons[0])]

In [56]:
novelOverlap = novelOverlap[novelOverlap["GeneId"].isin(novelIds["Name"])]
allOverlap = allOverlap[allOverlap["GeneId"].isin(knownIds["Name"])]


print len(novelOverlap[novelOverlap[7]>0]), len(novelOverlap[novelOverlap[7]==0])
print len(allOverlap[allOverlap[12]>0]), len(allOverlap[allOverlap[12]==0])


424 3068
1199 26449

In [166]:
lengthInfo = pd.read_csv("/shared/sudlab1/General/projects/utrons_project/misc_files/SpliceSite/novelLengths.txt", sep="\t")

def getPercents(length):
    shortIds = lengthInfo[lengthInfo["Length"]<=length]["transcript_id"]

    shortNovel = novelOverlap[novelOverlap["GeneId"].isin(shortIds)]
    longNovel = novelOverlap[~novelOverlap["GeneId"].isin(shortIds)]

    shortWith = len(shortNovel[shortNovel[7]>0])
    shortNone = len(shortNovel[shortNovel[7]==0])
    longWith = len(longNovel[longNovel[7]>0])
    longNone = len(longNovel[longNovel[7]==0])
    
    percent1 = shortWith / float(shortWith + shortNone)
    percent2 = longWith / float(longWith + longNone)
    
    #print percent1 * 100, (1 - percent1) * 100
    #print percent2 * 100, (1 - percent2) * 100
    
    return percent1, percent2
    

%pylab inline

shortList = []
longList = []
lengthList = []
for num in range(25,2000,10):
    a, b = getPercents(num)
    shortList.append(a)
    longList.append(b)
    lengthList.append(num)

pylab.plot(lengthList, longList)
pylab.ylim(0,0.15)
pylab.xlabel("Length")
pylab.ylabel("Proportion")
pylab.savefig("./images/LengthVsMicroRNA", dpi=300)


Populating the interactive namespace from numpy and matplotlib

In [108]:
print longList


[0.12158808933002481, 0.12386156648451731, 0.13311421528348397, 0.13479052823315119, 0.13636363636363635, 0.1312240663900415, 0.13743815283122596, 0.13908450704225353, 0.14081885856079404, 0.14018087855297157, 0.1415282392026578, 0.1412894375857339, 0.14285714285714285, 0.14296134208606856, 0.14157303370786517, 0.1426403641881639, 0.14340786430223593, 0.1410756040530008, 0.1422924901185771, 0.14170692431561996, 0.14193025141930252, 0.14133986928104575, 0.14262159934047816, 0.14250207125103562, 0.14309623430962343, 0.14370748299319727, 0.14432989690721648, 0.14507772020725387, 0.14298169136878813, 0.1436077057793345, 0.14386584289496912, 0.14658273381294964, 0.14660633484162897, 0.1468721668177697, 0.1479779411764706, 0.14838709677419354, 0.1467038068709378, 0.14684014869888476, 0.1455223880597015, 0.1455223880597015, 0.14553990610328638, 0.14366729678638943, 0.14312796208530806, 0.14435946462715105, 0.1424446583253128, 0.13627450980392156, 0.13681102362204725, 0.13663366336633664, 0.13690476190476192, 0.13755020080321284, 0.13420787083753785, 0.13149847094801223, 0.12833675564681724, 0.12980269989615784, 0.12839248434237996, 0.12866108786610878, 0.12906610703043023, 0.12842105263157894, 0.1285563751317176, 0.12882787750791974, 0.12937433722163308, 0.1276595744680851, 0.12740899357601712, 0.12701829924650163, 0.12701829924650163, 0.1271551724137931, 0.12742980561555076, 0.12689804772234273, 0.12663755458515283, 0.12806236080178174, 0.1282051282051282, 0.12598425196850394, 0.12641083521444696, 0.12655367231638417, 0.12669683257918551, 0.12300683371298406, 0.11507479861910241, 0.11600928074245939, 0.11529411764705882, 0.1154299175500589, 0.11556603773584906, 0.11575178997613365, 0.11589008363201912, 0.11473429951690821, 0.11515151515151516, 0.1120584652862363, 0.11233211233211234, 0.11274509803921569, 0.11288343558282209, 0.11288343558282209, 0.11372064276885044, 0.11400247831474597, 0.1141439205955335, 0.115, 0.115, 0.11586901763224182, 0.11586901763224182, 0.11504424778761062, 0.11518987341772152, 0.11518987341772152, 0.11518987341772152, 0.11518987341772152, 0.11533586818757921, 0.11421319796954314, 0.11464968152866242, 0.11508951406649616, 0.11553273427471117, 0.11673151750972763, 0.11780104712041885, 0.11795543905635648, 0.11330698287220026, 0.11345646437994723, 0.11273209549071618, 0.11303191489361702, 0.11378848728246319, 0.10326086956521739, 0.10326086956521739, 0.10326086956521739, 0.10231923601637108, 0.10027472527472528, 0.09793103448275862, 0.09681881051175657, 0.09695290858725762, 0.09596662030598054, 0.09596662030598054, 0.09596662030598054, 0.09563994374120956, 0.09563994374120956, 0.09590973201692525, 0.09590973201692525, 0.0950354609929078, 0.09557774607703282, 0.09557774607703282, 0.09571428571428571, 0.09585121602288985, 0.09339080459770115, 0.09221902017291066, 0.09248554913294797, 0.09130434782608696, 0.09143686502177069, 0.09170305676855896, 0.09197080291970802, 0.08944281524926687, 0.08957415565345081, 0.08970588235294118, 0.08970588235294118, 0.08836524300441827, 0.08915304606240713, 0.08915304606240713, 0.08779761904761904, 0.08779761904761904, 0.08832335329341318, 0.08858858858858859, 0.0887218045112782, 0.0887218045112782, 0.0887218045112782, 0.0889894419306184, 0.0891238670694864, 0.0891238670694864, 0.08925869894099848, 0.08966565349544073, 0.08841463414634146, 0.08841463414634146, 0.08841463414634146, 0.08868501529051988, 0.08868501529051988, 0.08895705521472393, 0.0890937019969278, 0.08923076923076922, 0.08923076923076922, 0.08936825885978428, 0.08950617283950617, 0.08950617283950617, 0.08950617283950617, 0.08950617283950617, 0.09006211180124224, 0.09034267912772585, 0.09034267912772585, 0.0892018779342723, 0.08948194662480377, 0.0880503144654088, 0.08832807570977919, 0.08846761453396525, 0.08860759493670886, 0.08860759493670886, 0.08860759493670886, 0.08874801901743265, 0.08888888888888889, 0.08888888888888889, 0.0875796178343949, 0.0875796178343949, 0.08771929824561403, 0.08814102564102565, 0.08667736757624397, 0.08520900321543408, 0.08520900321543408, 0.08562197092084006, 0.08441558441558442]

In [131]:
tpm = 20
supDf = pd.read_csv("/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/suppresor_%dTPM.txt" % tpm,sep="\t", header=None)
oncDf = pd.read_csv("/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/oncogenes_%dTPM.txt" % tpm,sep="\t", header=None)
unkDf = pd.read_csv("/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/unknown_%dTPM.txt" % tpm,sep="\t", header=None)

In [134]:
allOncOverlap = allOverlap[allOverlap["GeneId"].isin(oncDf[0])]
novelOncOverlap = novelOverlap[novelOverlap["GeneId"].isin(oncDf[0])]

print "oncogenes"
print "known", len(allOncOverlap[allOncOverlap[12]>0])/ float(len(allOncOverlap))
print "novel", len(novelOncOverlap[novelOncOverlap[7]>0])/float(len(novelOncOverlap))


allSupOverlap = allOverlap[allOverlap["GeneId"].isin(supDf[0])]
novelSupOverlap = novelOverlap[novelOverlap["GeneId"].isin(supDf[0])]

print "\nSuppressors"
print "known", len(allSupOverlap[allSupOverlap[12]>0])/ float(len(allSupOverlap))
print "novel", len(novelSupOverlap[novelSupOverlap[7]>0])/float(len(novelSupOverlap))


allUnkOverlap = allOverlap[allOverlap["GeneId"].isin(unkDf[0])]
novelUnkOverlap = novelOverlap[novelOverlap["GeneId"].isin(unkDf[0])]

print "\nUnknown"
print "known", len(allUnkOverlap[allUnkOverlap[12]>0])/ float(len(allSupOverlap))
print "novel", len(novelUnkOverlap[novelUnkOverlap[7]>0])/float(len(novelSupOverlap))


oncogenes
known 0.0953757225434
novel 0.311111111111

Suppressors
known 0.0454545454545
novel 0.125

Unknown
known 0.0833333333333
novel 0.0

In [168]:
oncGenes1 = pd.merge(knownIds, allOncOverlap, left_on="Name", right_on="GeneId")
print "Known Oncogenes removing microRNAs"
print sorted(oncGenes1["Gene"].unique())


oncGenes2 = pd.merge(novelIds, novelOncOverlap, left_on="Name", right_on="GeneId")
print "\n\nNovel Oncogenes removing microRNAs"
print sorted(oncGenes2["Gene"].unique())


Known Oncogenes removing microRNAs
[u'ACSL3', u'ADAM10', u'AKT1', u'AKT2', u'ALDH2', u'ASPSCR1', u'ATF1', u'ATIC', u'BCL11A', u'BCLAF1', u'BCR', u'BRAF', u'BRD4', u'CALR', u'CAMTA1', u'CANT1', u'CARM1', u'CARS', u'CCNB1IP1', u'CD74', u'CDK4', u'CHCHD7', u'CIITA', u'CLTC', u'CLTCL1', u'CMC4', u'COX6C', u'CREB3L2', u'CTNNB1', u'CTTN', u'DDX6', u'DEK', u'EIF4A2', u'ELN', u'ERBB2', u'ETV5', u'FGFR1', u'FIP1L1', u'GNAS', u'GOT2', u'HERPUD1', u'HNRNPA2B1', u'HOXA9', u'HRAS', u'HSP90AA1', u'IDH2', u'IL7R', u'KEAP1', u'KIF5B', u'KLK2', u'KRAS', u'KTN1', u'LASP1', u'LMO2', u'LZTR1', u'MDM2', u'MED17', u'METTL14', u'MLF1', u'MMP2', u'MUC1', u'MYH11', u'NACA', u'NDRG1', u'NIN', u'NRAS', u'NUMA1', u'NUP214', u'PAX8', u'PBX1', u'PCSK7', u'PDE4DIP', u'PDGFB', u'PICALM', u'PLCG1', u'PML', u'PPARG', u'PPP2R1A', u'PTPN11', u'RAF1', u'RHEB', u'RHOA', u'RPN1', u'SEPT5', u'SEPT9', u'SETDB1', u'SF3B1', u'SFPQ', u'SMC1A', u'SMO', u'SRSF2', u'SRSF3', u'SS18', u'SS18L1', u'SSX1', u'SSX2', u'SSX4', u'STAT3', u'TAF15', u'TCEA1', u'TCL1A', u'TFDP1', u'TFG', u'TFPT', u'TFRC', u'TPM3', u'TPM4', u'TRAF7', u'TRIP11', u'U2AF1', u'WHSC1', u'WWTR1', u'XPO1', u'YWHAE', u'ZNF814']


Novel Oncogenes removing microRNAs
[u'AKT2', u'CCND1', u'COL1A1', u'COX6C', u'H3F3B', u'HSP90AA1', u'JUN', u'LASP1', u'MAFB', u'MYCL', u'NDRG1', u'NEDD4L', u'NONO', u'PIM1', u'RAF1', u'RHOA', u'SDC4', u'SET', u'SRSF3', u'TPM3', u'TRAF7', u'YWHAE']

In [169]:
supGenes1 = pd.merge(knownIds, allSupOverlap, left_on="Name", right_on="GeneId")
print "Known suppressors removing microRNAs"
print sorted(supGenes1["Gene"].unique())


supGenes2 = pd.merge(novelIds, novelSupOverlap, left_on="Name", right_on="GeneId")
print "\n\nNovel suppressors removing microRNAs"
print sorted(supGenes2["Gene"].unique())


Known suppressors removing microRNAs
[u'ASXL1', u'ATRX', u'B2M', u'BAP1', u'BMPR1A', u'CASP8', u'CAST', u'CBLC', u'CCAR1', u'CDH1', u'CDK12', u'CDKN1B', u'CDKN2A', u'CIC', u'CYLD', u'DDB2', u'EPHB6', u'ERCC3', u'EXT1', u'FANCA', u'FANCG', u'FAS', u'FBXO11', u'FUBP1', u'HLA-B', u'KDM5C', u'MAP2K4', u'MLH1', u'MSH6', u'MUTYH', u'NF1', u'NF2', u'PALB2', u'PIK3R1', u'RB1', u'RECQL4', u'SBDS', u'SDHC', u'SDHD', u'SMAD4', u'STK11', u'TBL1XR1', u'TOM1', u'TP53BP1', u'TSC2', u'XPC']


Novel suppressors removing microRNAs
[u'B2M', u'CDH1', u'CDKN1A', u'EXT1', u'PIK3R1', u'SDHC', u'ZFP36L2']

In [157]:
print "\t\tTranscripts", "\tGenes"
print "known Oncogenes \t %s \t %s" % (len(oncGenes1), len(oncGenes1["Gene"].unique()))
print "novel Oncogenes \t %s \t %s" % (len(oncGenes2), len(oncGenes2["Gene"].unique()))
print "known suppressors \t %s \t %s" % ( len(supGenes1), len(supGenes1["Gene"].unique()))
print "novel suppressors \t %s \t %s"% ( len(supGenes2), len(supGenes2["Gene"].unique()))


		Transcripts 	Genes
known Oncogenes 	 692 	 115
novel Oncogenes 	 59 	 22
known suppressors 	 264 	 46
novel suppressors 	 34 	 7

In [165]:
supjoined = pd.concat([supGenes1, supGenes2])
supjoined.to_csv("/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/suppressorMicroRNA.txt", sep="\t", header=None, index=None)

oncjoined = pd.concat([oncGenes1, oncGenes2])
oncjoined.to_csv("/shared/sudlab1/General/projects/utrons_project/misc_files/cancerUtrons/oncogeneMicroRNA.txt", sep="\t", header=None, index=None)