In [1]:
import sqlite3
import pandas as pd
import numpy as np
import subprocess
cnx = sqlite3.connect('/shared/sudlab1/General/projects/utrons_project/BladderCancerUtrons/431BladderUtrons.db')
cnx.execute("ATTACH '/shared/sudlab1/General/annotations/hg38_noalt_ensembl85/csvdb' as annotations")
Out[1]:
In [2]:
# Get novel gene ids into a database
query_text1 = """
SELECT uid.transcript_id AS name, ti.gene_name AS gene
FROM novel_utrons_ids AS uid
INNER JOIN annotations.transcript_info AS ti
ON ti.transcript_id = uid.match_transcript_id
WHERE uid.transcript_id like "MSTRG%" AND uid.track="agg-agg-agg"
ORDER BY uid.transcript_id
"""
novelUtronIds = pd.read_sql_query(query_text1, cnx)
#Output Ids to text file
novelFilename = "/shared/sudlab1/General/projects/utrons_project/misc_files/novelUtronIds.unfiltered.txt"
novelFile = open(novelFilename, 'w')
for gene in novelUtronIds.ix[:,0].tolist():
string = gene + "\n"
novelFile.write(string)
novelFile.close()
# Get known Utron ids into a database
query_text1 = """
SELECT uid.transcript_id AS name, ti.gene_name AS gene
FROM all_utrons_ids AS uid
INNER JOIN transcript_class AS tc
ON tc.transcript_id = uid.transcript_id
INNER JOIN annotations.transcript_info AS ti
ON ti.transcript_id = tc.match_transcript_id
WHERE uid.transcript_id like "ENS%" AND tc.track="agg-agg-agg"
GROUP BY uid.transcript_id
ORDER BY uid.transcript_id
"""
allUtronIds = pd.read_sql_query(query_text1, cnx)
# Output Ids to a text file
allFilename = "/shared/sudlab1/General/projects/utrons_project/misc_files/knownUtronIds.txt"
allFile = open(allFilename, 'w')
for gene in allUtronIds.ix[:,0].tolist():
string = gene + "\n"
allFile.write(string)
allFile.close()
In [3]:
# get novel ids into a list
a = novelUtronIds["name"].tolist()
# Open utrons bed file
novelBed = "/shared/sudlab1/General/projects/utrons_project/misc_files/UtronAnnotationFiles/agg-agg-agg.novel_utrons.bed"
# Open temp file to write filtered bed file to
novelBed2 = "./tempfiles/novelutronsfiltered.bed"
novelBedFiltered = open(novelBed2, 'w')
# Open novel bed file and if txnames in the list of novel ids write the line to a file
for line in open(novelBed).readlines():
line = line.rstrip()
f = line.split()
txname = f[3][:-16]
if txname in a:
string = ""
for col in f:
string += col + "\t"
string += "\n"
novelBedFiltered.write(string)
novelBedFiltered.close()
# Open the file just written to and get the lengths into a list novelLengths
novelLengths = []
for line in open(novelBed2).readlines():
f =line.split()
point1 = int(f[1])
point2 = int(f[2])
length = point2 - point1
novelLengths.append(length)
In [6]:
# get all utron names into a list
b = allUtronIds["name"].tolist()
# Filename of all utrons bed file
allBed = "/shared/sudlab1/General/projects/utrons_project/misc_files/UtronAnnotationFiles/agg-agg-agg.all_utrons.bed"
# Filename of temp file to write filtered data to
allBed2 = "./tempfiles/allutronsfiltered.bed"
allBedFiltered = open(allBed2, 'w')
# Loop through bed fiel and if the utron id is in the list of known utrons, write it to tempfile
txnames = []
for line in open(allBed).readlines():
line = line.rstrip()
f = line.split()
txname = f[3]
txnames.append(txname)
if txname in b:
string = ""
for col in f:
string += col + "\t"
string += "\n"
allBedFiltered.write(string)
allBedFiltered.close()
# Get lengths from tempfile
allLengths = []
for line in open(allBed2).readlines():
f =line.split()
point1 = int(f[1])
point2 = int(f[2])
length = point2 - point1
allLengths.append(length)
In [7]:
# Get lengths of reference introns
# Open file with geneset_all intronic regions
genesetIntrons = "/shared/sudlab1/General/projects/utrons_project/misc_files/geneset_all.introns.gtf"
# Read file into dataframe
a = pd.read_csv(genesetIntrons, sep="\t", header=None)
#Get lengths as col4 - col3
a["length"] = a[4] - a[3]
a = a["length"].copy()
In [10]:
%pylab inline
# Plot vals
pylab.hist(allLengths, range=(0,5000), bins=25, color="red", normed=True, label="Annotated UTRons",edgecolor = "black" )
pylab.hist(novelLengths, range=(0,5000), bins=25, color="blue", normed=True, alpha=0.7, label="Novel UTRons", edgecolor = "black")
# Figure options
pylab.legend(fontsize="small")
pylab.xlabel("Length"); pylab.ylabel("Normalised Distribution")
pylab.savefig("./images/1_KnownVsNovelLengths_5000.png", dpi=300)
In [22]:
# Plot vals
pylab.hist(a.tolist(), bins=50, range=(0,5000), normed=True, color="red", label="Reference Introns",edgecolor = "black" )
pylab.hist(allLengths, bins=50, range=(0,5000), normed=True, color="blue", alpha=0.7, label="Annotated UTRons", edgecolor = "black")
# Figure options
pylab.legend(fontsize="small")
pylab.xlabel("Length"); pylab.ylabel("Normalised Distribution")
pylab.savefig("./images/1_KnownVsReference_5000.png", dpi=300)
In [24]:
# Plot vals
pylab.hist(a.tolist(), bins=50, range=(0,5000), normed=True, color="red", label="Reference Introns", edgecolor = "black")
pylab.hist(novelLengths, bins=50, range=(0,5000), normed=True, color="blue", alpha=0.7, label="novel UTRons", edgecolor = "black")
# Figure Options
pylab.legend(fontsize="small")
pylab.xlabel("Length"); pylab.ylabel("Normalised Distribution")
pylab.savefig("./images/1_NovelVsReference.png", dpi=300)
In [25]:
# Plot vals
pylab.hist(allLengths, range=(0,400), bins=50, color="red", normed=True, label="Annotated UTRons", edgecolor = "black")
pylab.hist(novelLengths, range=(0,400), bins=50, color="blue", normed=True, alpha=0.7, label="Novel UTRons", edgecolor = "black")
# Figure Options
pylab.legend(fontsize="small")
pylab.xlabel("Length"); pylab.ylabel("Normalised Distribution")
pylab.savefig("./images/1_NovelVsKnown_400", dpi=300)
In [27]:
# Plot vals
pylab.hist(a.tolist(), range=(0,400), bins=50, color="red", normed=True, label="Reference Introns",edgecolor = "black")
pylab.hist(novelLengths, range=(0,400), bins=50, color="blue", normed=True, alpha=0.7, label="Novel UTRons",edgecolor = "black")
# Figure Options
pylab.legend(fontsize="small")
pylab.xlabel("Length"); pylab.ylabel("Normalised Distribution")
pylab.savefig("./images/1_NovelVsReference_400", dpi=300)
In [28]:
# Plot Vals
pylab.hist(a.tolist(), range=(0,400), bins=50, color="red", normed=True, label="Reference Introns", edgecolor = "black")
pylab.hist(allLengths, range=(0,400), bins=50, color="blue", normed=True, alpha=0.7, label="All UTRons", edgecolor = "black")
# Figure Options
pylab.legend(fontsize="small")
pylab.xlabel("Length"); pylab.ylabel("Normalised Distribution")
pylab.savefig("./images/1_AllVsReference_400", dpi=300)
In [12]:
# Plot Vals
pylab.hist(allLengths, range=(250,1000), bins=30, color="red", normed=True, label="Annotated UTRons",edgecolor = "black")
pylab.hist(novelLengths, range=(250,1000), bins=30, color="blue", normed=True, alpha=0.7, label="Novel UTRons",edgecolor = "black")
# Figure Options
pylab.legend(fontsize="small")
pylab.xlabel("Length"); pylab.ylabel("Normalised Distribution")
pylab.savefig("./images/1_NovelVsKnown_250-1000", dpi=300)
In [35]:
length = 75
print "proportion novel utrons > %sbp" % length
print float(len([x for x in novelLengths if x > length])) / len(novelLengths)
print "\nproportion all utrons > %sbp" % length
print float(len([x for x in allLengths if x > length])) / len(allLengths)
print "\nproportion reference introns > %sbp" % length
print float(len([x for x in a.tolist() if x > length])) / len(a)
In [72]:
# Getting novel Utron tx names into csv files (for later use)
# Function to get txname for each tx
def getName(row):
return row[3][:-16]
# Read bed file info into DF
a = pd.read_csv("/shared/sudlab1/General/projects/utrons_project/misc_files/UtronAnnotationFiles/agg-agg-agg.novel_utrons.bed", sep="\t", header=None)
a["length"] = a[2] - a[1]
a["txId"] = a.apply(getName, axis=1)
# Get lengths and tx ids into DF copy
novelUtrons = a[["length", "txId"]].copy()
novelUtrons = novelUtrons[novelUtrons["txId"].isin(novelUtronIds["name"].tolist())]
#####################################################################################################
# Filter by < 100 and > 100 bp
length = 100
short = novelUtrons[novelUtrons["length"]<=length]
big = novelUtrons[novelUtrons["length"]>length]
# Write txnames to csv file
shortoutfile = "/shared/sudlab1/General/projects/utrons_project/misc_files/LengthIds/novelUtronssmaller%d" %length
bigoutfile = "/shared/sudlab1/General/projects/utrons_project/misc_files/LengthIds/novelUtronsbigger%d" %length
short.to_csv(shortoutfile, header=None, index=None, sep="\t")
big.to_csv(bigoutfile, header=None, index=None, sep="\t")
#####################################################################################################
# Filter by < 70 and > 70 bp
length = 70
short = novelUtrons[novelUtrons["length"]<=length]
big = novelUtrons[novelUtrons["length"]>length]
# Write txnames to csv file
shortoutfile = "/shared/sudlab1/General/projects/utrons_project/misc_files/LengthIds/novelUtronssmaller%d" %length
bigoutfile = "/shared/sudlab1/General/projects/utrons_project/misc_files/LengthIds/novelUtronsbigger%d" %length
short.to_csv(shortoutfile, header=None, index=None, sep="\t")
big.to_csv(bigoutfile, header=None, index=None, sep="\t")
#####################################################################################################
# Filter by < 200 and > 200 bp
length = 200
short = novelUtrons[novelUtrons["length"]<=length]
big = novelUtrons[novelUtrons["length"]>length]
# Write txnames to csv file
shortoutfile = "/shared/sudlab1/General/projects/utrons_project/misc_files/LengthIds/novelUtronssmaller%d" %length
bigoutfile = "/shared/sudlab1/General/projects/utrons_project/misc_files/LengthIds/novelUtronsbigger%d" %length
short.to_csv(shortoutfile, header=None, index=None, sep="\t")
big.to_csv(bigoutfile, header=None, index=None, sep="\t")