In [1]:
%run al_funcs.ipynb
In [2]:
handle = open("../../Genomic Data/Escherichia coli str. K-12 substr. MG1655, complete genome.fasta", 'rb')
ecoligenome = [SeqIO.read(handle, "fasta", alphabet=IUPACAmbiguousDNA())]
In [3]:
cutslist = [item for item in al_digesttarget(ecoligenome)]
In [4]:
cutslistdict = {}
for item in cutslist:
cutslistdict[str(item.seq)] = item.description
In [204]:
# Load in the BLAST results
result_handle = open("HiSeqOut/alltgts_ADL1.blast")
ecoli_hits_blast = NCBIXML.parse(result_handle) # use NCBIXML.parse(result_handle) for multiple queries here
In [202]:
item = next(ecoli_hits_blast)
In [203]:
print item.alignments[0].hsps[0].sbjct_start
print item.alignments[0].hsps[0].sbjct_end
In [ ]:
a_goodguides = []
index = 0
for item in range(10000000):
try:
item = next(ecoli_hits_blast)
except:
break
index += 1
try:
if item.alignments[0].hsps[0].score > 19:
try:
a_goodguides.append((cutslistdict[item.alignments[0].hsps[0].query[0:20]], \
item.alignments[0].hsps[0].query[0:20], \
item.alignments[0].hsps[0].sbjct_start, \
item.alignments[0].hsps[0].sbjct_end))
except: None
except: None
if str(index)[-5:] == "00000": #only print every 100000
sys.stdout.flush() # These two lines produce the line counter as the loop runs
sys.stdout.write('\r' + str(index) + " "),
In [8]:
import cPickle as pickle
In [207]:
pickle.dump(a_goodguides, open( "HiSeqOut/Part 5 - functional mapping/ADL1_goodguides.p", "wb" ))
In [205]:
#pickle.dump(a_goodguides, open( "HiSeqOut/Part 5 - functional mapping/ADL2_goodguides.p", "wb" ))
In [210]:
a_goodguides = pickle.load( open( "HiSeqOut/Part 5 - functional mapping/ADL2_goodguides.p", "rb" ) )
In [211]:
a_goodguides.extend(pickle.load( open( "HiSeqOut/Part 5 - functional mapping/ADL1_goodguides.p", "rb" ) ))
In [185]:
#a_goodguides.extend(pickle.load( open( "HiSeqOut/a_goodguides_ADL1_more_stringent.p", "rb" ) ))
In [212]:
len(a_goodguides)
Out[212]:
In [213]:
a_goodguides = set(a_goodguides)
In [214]:
len(a_goodguides)
Out[214]:
In [215]:
gene_annotations = []
import csv
with open('../../Genomic Data/AllEcoligenesEcogene2014117.txt','rb') as tsvin:
tsvin = csv.reader(tsvin, delimiter='\t')
for row in tsvin:
gene_annotations.append(row)
In [216]:
gene_annotations[2]
Out[216]:
In [217]:
gene_annotations_dict = {}
for item in gene_annotations:
gene_annotations_dict[item[0]] = item[1:]
In [218]:
print len(gene_annotations) # Number of genes in EcoGene datafile
In [219]:
#Uncomment if you want to regenerate match of guides to genes
genes_in_library = []
guides_with_gene_info = []
for item in a_goodguides:
for j in gene_annotations[1:]:
if item[2] > int(j[7]) and item[2] < int(j[8]):
guides_with_gene_info.append([item, j[-1]])
genes_in_library.append(j[0])
else:
None
#print("Not in a gene")
In [220]:
len(guides_with_gene_info)
Out[220]:
In [ ]:
pickle.dump(genes_in_library, open( "HiSeqOut/Part 5 - functional mapping/bothreps_genes_in_library.p", "wb" ))
pickle.dump(guides_with_gene_info, open( "HiSeqOut/Part 5 - functional mapping/bothreps_guides_with_gene_info.p", "wb" ))
In [ ]:
genes_in_library = pickle.load( open( "HiSeqOut/Part 5 - functional mapping/ADL2_genes_in_library.p", "rb" ) )
guides_with_gene_info = pickle.load( open( "HiSeqOut/Part 5 - functional mapping/ADL2_guides_with_gene_info.p", "rb" ) )
In [221]:
len(genes_in_library) #Number of reads that are in genes...
Out[221]:
In [222]:
len(set(genes_in_library)) # Number of EcoGene genes represented in EATING library
Out[222]:
In [ ]:
gene_annotations_dict
In [223]:
# Add lengths of genes to genes dict
for item in gene_annotations_dict:
try:
gene_annotations_dict[item].append(int(gene_annotations_dict[item][7]) - int(gene_annotations_dict[item][6]))
except:
print("not there")
del gene_annotations_dict["EG"]
In [224]:
genes_in_library_with_lengths = []
for item in set(genes_in_library):
genes_in_library_with_lengths.append([item, gene_annotations_dict[item][-1]])
In [231]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
path = '/Library/Fonts/Microsoft/Arial.ttf'
prop = fm.FontProperties(fname=path)
mpl.rcParams['font.family'] = prop.get_name()
In [244]:
% pylab inline
#bins = [0, 100, 200, 400, 800, 1600, 3200, 6400]
bins = xrange(0, 5000, 100)
figure(figsize=(10, 4))
hist([item[-1][-1] for item in gene_annotations_dict.iteritems()], bins, color = "white", label="All E. coli genes") #Extracting the last item(gene lengths) from the dict entries
hist([item[1] for item in genes_in_library_with_lengths], bins, color="gray", label="Genes targeted by guide library")
tick_params(axis=u'both', labelsize=18)
legend()
legend(loc=1,prop={'size':16})
savefig('HiSeqOut/Part 5 - functional mapping/Guides Targeted vs length_bothreps.pdf', format="pdf")
#gca().set_xscale("log")
In [75]:
max([item[-1][-1] for item in gene_annotations_dict.iteritems()])
Out[75]:
In [76]:
go_terms = []
with open('../../Genomic Data/gene_association.ecocyc','rb') as tsvin:
tsvin = csv.reader(tsvin, delimiter='\t')
for row in tsvin:
if len(row) >2:
go_terms.append(row)
In [77]:
go_terms_dict = {}
for item in go_terms:
try:
go_terms_dict[item[1]].append(item[4])
except:
go_terms_dict[item[1]] = [item[4]]
go_terms_dict[item[1]] = list(set(go_terms_dict[item[1]]))
In [78]:
go_definitions = []
with open('../../Genomic Data/go-basic.obo.txt','rb') as tsvin:
tsvin = tsvin.read()
splitfile = tsvin.split("\n\n")
In [79]:
subsplit = []
for item in splitfile:
if item.split("\n")[0] == "[Term]":
subsplit.append(item.split("\n")[1:])
In [80]:
go_definitions_dict = {}
for item in subsplit:
go_definitions_dict[item[0].split(" ")[1]] = item[1:]
In [81]:
guides_with_gene_info_and_GO = []
for item in set(genes_in_library):
RCSBID = gene_annotations_dict[item][-2]
try:
guides_with_gene_info_and_GO.append((item, go_terms_dict[RCSBID]))
except:
None
#print("No GO terms for this RCSB ID")
In [ ]:
gene_annotations_dict
In [83]:
all_gos_in_lib = []
for j in [item[1] for item in guides_with_gene_info_and_GO]:
all_gos_in_lib.extend(j)
In [84]:
len(all_gos_in_lib)
Out[84]:
In [85]:
from collections import Counter
In [86]:
c = Counter(all_gos_in_lib)
freqs = []
for item in c:
freqs.append((c[item], item))
In [87]:
freqs = reversed(sorted(freqs))
In [88]:
freqslist = []
i = 0
while i <100:
freqslist.append(next(freqs))
i += 1
In [89]:
freqslist_with_gonames = []
for item in freqslist:
if go_definitions_dict[item[1]][1] == "namespace: biological_process":
go_name = go_definitions_dict[item[1]][0].split(" ",1)[1] # Split on first occurrence
freqslist_with_gonames.append([item[0], item[1], go_name])
In [226]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import numpy
In [227]:
# This sets my pyplot font to Arial
# http://stackoverflow.com/questions/16574898/how-to-load-ttf-file-in-matplotlib-using-mpl-rcparams?lq=1
path = '/Library/Fonts/Microsoft/Arial.ttf'
prop = fm.FontProperties(fname=path)
mpl.rcParams['font.family'] = prop.get_name()
In [100]:
%pylab inline
In [102]:
freqslist_with_gonames
Out[102]:
In [106]:
plt.figure(figsize=(8, 6))
N = len(freqslist_with_gonames[:10])
ind = numpy.arange(N) # the x locations for the groups
width = 0.8
mpl.pyplot.barh(ind, [item[0] for item in freqslist_with_gonames[:10]], width, hold=None)
plt.yticks(ind+width/2., [item[2] for item in freqslist_with_gonames], rotation=0 )
#plt.show()
Out[106]:
In [ ]:
In [107]:
all_ecoli_with_gene_info_and_GO = []
for item in gene_annotations_dict.iterkeys():
RCSBID = gene_annotations_dict[item][-2]
try:
all_ecoli_with_gene_info_and_GO.append((item, go_terms_dict[RCSBID]))
except:
None
#print("No GO terms for this RCSB ID")
In [108]:
all_ecoli_gos = []
for j in [item[1] for item in all_ecoli_with_gene_info_and_GO]:
all_ecoli_gos.extend(j)
In [109]:
go_count_all_ecoli = Counter(all_ecoli_gos)
all_ecoli_freqs = []
for item in go_count_all_ecoli:
all_ecoli_freqs.append((go_count_all_ecoli[item], item))
all_ecoli_freqs = list(reversed(sorted(all_ecoli_freqs)))
In [110]:
all_ecoli_freqslist = []
i = 0
for item in all_ecoli_freqs:
all_ecoli_freqslist.append(item)
In [111]:
all_ecoli_freqslist_with_gonames = []
for item in all_ecoli_freqslist:
if go_definitions_dict[item[1]][1] == "namespace: biological_process":
go_name = go_definitions_dict[item[1]][0].split(" ",1)[1] # Split on first occurrence
all_ecoli_freqslist_with_gonames.append([item[0], item[1], go_name])
In [112]:
all_ecoli_freqslist_with_gonames[0:10]
Out[112]:
In [113]:
all_ecoli_freqslist_with_gonames_dict = {}
for item in all_ecoli_freqslist_with_gonames:
all_ecoli_freqslist_with_gonames_dict[item[2]] = item[0]
In [114]:
collated_go_frequences = []
for item in freqslist_with_gonames:
print item[-1]
collated_go_frequences.append([item[-1], item[0], all_ecoli_freqslist_with_gonames_dict[item[-1]]])
In [118]:
list(enumerate(collated_go_frequences))
Out[118]:
In [123]:
collated_go_frequences_all = collated_go_frequences[:]
In [156]:
collated_go_frequences_all = pickle.load( open("./HiSeqOut/Part 5 - functional mapping/collated_go_frequencies_all.p", "r"))
In [173]:
collated_go_frequences = [item for index, item in enumerate(collated_go_frequences_all) if index in [0, 2, 4, 12, 13, 19, 29, 34]]
In [174]:
#collated_go_frequences[4][0] = "cellular amino\nacid biosynthetic process"
In [176]:
collated_go_frequences
Out[176]:
In [153]:
#pickle.dump(collated_go_frequences_all, open("./HiSeqOut/Part 5 - functional mapping/collated_go_frequencies_all.p", "wb"))
In [184]:
import matplotlib.patches as mpatches
figure(figsize=(8.5, 3.5))
N = len(collated_go_frequences)
ind = np.arange(N) # the x locations for the groups
width = 0.8
all_coli_plot = matplotlib.pyplot.barh(ind, [item[2] for item in collated_go_frequences], width, hold=None, label="All E. coli genes", color="white")
guides_plot = matplotlib.pyplot.barh(ind, [item[1] for item in collated_go_frequences], width, hold=None, color="gray", alpha=1, label="Genes targeted\n by guide library")
plt.yticks(ind+width/2., [item[0] for item in collated_go_frequences], rotation=0 )
plt.legend()
tick_params(axis=u'both', labelsize=16)
legend(loc=1,prop={'size':16})
savefig('HiSeqOut/Part 5 - functional mapping/GoTerms - All Ecoli vs targeted by guides_selectedGOs.pdf', bbox_inches='tight')
In [ ]: