In [1]:
%run al_funcs.ipynb

In [2]:
handle = open("../../Genomic Data/Escherichia coli str. K-12 substr. MG1655, complete genome.fasta", 'rb')
ecoligenome = [SeqIO.read(handle, "fasta", alphabet=IUPACAmbiguousDNA())]

In [3]:
cutslist = [item for item in al_digesttarget(ecoligenome)]

In [4]:
cutslistdict = {}
for item in cutslist:
    cutslistdict[str(item.seq)] = item.description

In [204]:
# Load in the BLAST results
result_handle = open("HiSeqOut/alltgts_ADL1.blast")
ecoli_hits_blast = NCBIXML.parse(result_handle) # use NCBIXML.parse(result_handle) for multiple queries here

In [202]:
item = next(ecoli_hits_blast)

In [203]:
print item.alignments[0].hsps[0].sbjct_start 
print item.alignments[0].hsps[0].sbjct_end


3495618
3495637

In [ ]:
a_goodguides = []
index = 0

for item in range(10000000):
    try:
        item = next(ecoli_hits_blast)
    except:
        break
    index += 1
    try:
        if item.alignments[0].hsps[0].score > 19:
            try: 
                a_goodguides.append((cutslistdict[item.alignments[0].hsps[0].query[0:20]], \
                                     item.alignments[0].hsps[0].query[0:20], \
                                     item.alignments[0].hsps[0].sbjct_start, \
                                     item.alignments[0].hsps[0].sbjct_end))
            except: None
    except: None
    if str(index)[-5:] == "00000": #only print every 100000
        sys.stdout.flush() # These two lines produce the line counter as the loop runs
        sys.stdout.write('\r' + str(index) + " "),


300000 

In [8]:
import cPickle as pickle

In [207]:
pickle.dump(a_goodguides, open( "HiSeqOut/Part 5 - functional mapping/ADL1_goodguides.p", "wb" ))

In [205]:
#pickle.dump(a_goodguides, open( "HiSeqOut/Part 5 - functional mapping/ADL2_goodguides.p", "wb" ))

In [210]:
a_goodguides = pickle.load( open( "HiSeqOut/Part 5 - functional mapping/ADL2_goodguides.p", "rb" ) )

In [211]:
a_goodguides.extend(pickle.load( open( "HiSeqOut/Part 5 - functional mapping/ADL1_goodguides.p", "rb" ) ))

In [185]:
#a_goodguides.extend(pickle.load( open( "HiSeqOut/a_goodguides_ADL1_more_stringent.p", "rb" ) ))

In [212]:
len(a_goodguides)


Out[212]:
1842529

In [213]:
a_goodguides = set(a_goodguides)

In [214]:
len(a_goodguides)


Out[214]:
51376

In [215]:
gene_annotations = []

import csv

with open('../../Genomic Data/AllEcoligenesEcogene2014117.txt','rb') as tsvin:
    tsvin = csv.reader(tsvin, delimiter='\t')
    for row in tsvin:
        gene_annotations.append(row)

In [216]:
gene_annotations[2]


Out[216]:
['EG10002',
 'ECK0753',
 'modB',
 'chlJ',
 'aa',
 '229',
 'Clockwise',
 '795862',
 '796551',
 'molybdate ABC transporter permease; chlorate resistance protein',
 'Transport; Transport of small molecules: Anions',
 'Molybdate ABC transporter permease; chlorate resistance',
 'Null',
 '; 1786980',
 'P0AF01']

In [217]:
gene_annotations_dict = {}
for item in gene_annotations:
    gene_annotations_dict[item[0]] = item[1:]

In [218]:
print len(gene_annotations) # Number of genes in EcoGene datafile


4503

In [219]:
#Uncomment if you want to regenerate match of guides to genes
genes_in_library = []
guides_with_gene_info = []
for item in a_goodguides:
    for j in gene_annotations[1:]:   
        if item[2] > int(j[7]) and item[2] < int(j[8]):
            guides_with_gene_info.append([item, j[-1]])
            genes_in_library.append(j[0])
        else:
            None
            #print("Not in a gene")

In [220]:
len(guides_with_gene_info)


Out[220]:
48169

In [ ]:
pickle.dump(genes_in_library, open( "HiSeqOut/Part 5 - functional mapping/bothreps_genes_in_library.p", "wb" ))
pickle.dump(guides_with_gene_info, open( "HiSeqOut/Part 5 - functional mapping/bothreps_guides_with_gene_info.p", "wb" ))

In [ ]:
genes_in_library = pickle.load( open( "HiSeqOut/Part 5 - functional mapping/ADL2_genes_in_library.p", "rb" ) )
guides_with_gene_info = pickle.load( open( "HiSeqOut/Part 5 - functional mapping/ADL2_guides_with_gene_info.p", "rb" ) )

In [221]:
len(genes_in_library) #Number of reads that are in genes...


Out[221]:
48169

In [222]:
len(set(genes_in_library)) # Number of EcoGene genes represented in EATING library


Out[222]:
3984

In [ ]:
gene_annotations_dict

In [223]:
# Add lengths of genes to genes dict
for item in gene_annotations_dict:
    try:
        gene_annotations_dict[item].append(int(gene_annotations_dict[item][7]) - int(gene_annotations_dict[item][6]))
    except:
        print("not there")
del gene_annotations_dict["EG"]


not there

In [224]:
genes_in_library_with_lengths = []
for item in set(genes_in_library):
    genes_in_library_with_lengths.append([item, gene_annotations_dict[item][-1]])

In [231]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

path = '/Library/Fonts/Microsoft/Arial.ttf'
prop = fm.FontProperties(fname=path)
mpl.rcParams['font.family'] = prop.get_name()

In [244]:
% pylab inline
#bins = [0, 100, 200, 400, 800, 1600, 3200, 6400]
bins = xrange(0, 5000, 100)
figure(figsize=(10, 4))
hist([item[-1][-1] for item in gene_annotations_dict.iteritems()], bins, color = "white", label="All E. coli genes")  #Extracting the last item(gene lengths) from the dict entries
hist([item[1] for item in genes_in_library_with_lengths], bins, color="gray", label="Genes targeted by guide library")
tick_params(axis=u'both', labelsize=18)
legend()
legend(loc=1,prop={'size':16})
savefig('HiSeqOut/Part 5 - functional mapping/Guides Targeted vs length_bothreps.pdf', format="pdf")
#gca().set_xscale("log")


Populating the interactive namespace from numpy and matplotlib

In [75]:
max([item[-1][-1] for item in gene_annotations_dict.iteritems()])


Out[75]:
8621

In [76]:
go_terms = []
with open('../../Genomic Data/gene_association.ecocyc','rb') as tsvin:
    tsvin = csv.reader(tsvin, delimiter='\t')
    for row in tsvin:
        if len(row) >2:
            go_terms.append(row)

In [77]:
go_terms_dict = {}
for item in go_terms:
    try:
        go_terms_dict[item[1]].append(item[4])
    except:
        go_terms_dict[item[1]] = [item[4]]
    go_terms_dict[item[1]] = list(set(go_terms_dict[item[1]]))

In [78]:
go_definitions = []
with open('../../Genomic Data/go-basic.obo.txt','rb') as tsvin:
    tsvin = tsvin.read()
    splitfile = tsvin.split("\n\n")

In [79]:
subsplit = []
for item in splitfile:
    if item.split("\n")[0] == "[Term]":
        subsplit.append(item.split("\n")[1:])

In [80]:
go_definitions_dict = {}
for item in subsplit:
    go_definitions_dict[item[0].split(" ")[1]] = item[1:]

In [81]:
guides_with_gene_info_and_GO = []

for item in set(genes_in_library):
    RCSBID = gene_annotations_dict[item][-2]
    try:
        guides_with_gene_info_and_GO.append((item, go_terms_dict[RCSBID]))
    except:
        None
        #print("No GO terms for this RCSB ID")

In [ ]:
gene_annotations_dict

In [83]:
all_gos_in_lib = []
for j in [item[1] for item in guides_with_gene_info_and_GO]:
    all_gos_in_lib.extend(j)

In [84]:
len(all_gos_in_lib)


Out[84]:
27184

In [85]:
from collections import Counter

In [86]:
c = Counter(all_gos_in_lib)
freqs = []
for item in c:
   freqs.append((c[item], item))

In [87]:
freqs = reversed(sorted(freqs))

In [88]:
freqslist = []
i = 0
while i <100:
    freqslist.append(next(freqs))
    i += 1

In [89]:
freqslist_with_gonames = []
for item in freqslist:
    if go_definitions_dict[item[1]][1] == "namespace: biological_process":
        go_name = go_definitions_dict[item[1]][0].split(" ",1)[1] # Split on first occurrence
        freqslist_with_gonames.append([item[0], item[1], go_name])

In [226]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import numpy

In [227]:
# This sets my pyplot font to Arial
# http://stackoverflow.com/questions/16574898/how-to-load-ttf-file-in-matplotlib-using-mpl-rcparams?lq=1
path = '/Library/Fonts/Microsoft/Arial.ttf'
prop = fm.FontProperties(fname=path)
mpl.rcParams['font.family'] = prop.get_name()

In [100]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['sample', 'shuffle', 'randint', 'random', 'triangular', 'uniform', 'seed']
`%matplotlib` prevents importing * from pylab and numpy

In [102]:
freqslist_with_gonames


Out[102]:
[[644, 'GO:0006810', 'transport'],
 [429, 'GO:0055114', 'oxidation-reduction process'],
 [349, 'GO:0008152', 'metabolic process'],
 [318, 'GO:0006355', 'regulation of transcription, DNA-templated'],
 [308, 'GO:0006351', 'transcription, DNA-templated'],
 [227, 'GO:0006974', 'cellular response to DNA damage stimulus'],
 [161, 'GO:0016310', 'phosphorylation'],
 [149, 'GO:0055085', 'transmembrane transport'],
 [141, 'GO:0005975', 'carbohydrate metabolic process'],
 [108, 'GO:0006811', 'ion transport'],
 [99, 'GO:0008652', 'cellular amino acid biosynthetic process'],
 [96, 'GO:0008643', 'carbohydrate transport'],
 [90, 'GO:0006412', 'translation'],
 [82, 'GO:0006508', 'proteolysis'],
 [80, 'GO:0046677', 'response to antibiotic'],
 [75, 'GO:0009103', 'lipopolysaccharide biosynthetic process'],
 [75, 'GO:0000160', 'phosphorelay signal transduction system'],
 [72, 'GO:0006629', 'lipid metabolic process'],
 [71, 'GO:0006865', 'amino acid transport'],
 [69, 'GO:0006281', 'DNA repair'],
 [63, 'GO:0015031', 'protein transport'],
 [63, 'GO:0006310', 'DNA recombination'],
 [62, 'GO:0045892', 'negative regulation of transcription, DNA-templated'],
 [58, 'GO:0051301', 'cell division'],
 [58, 'GO:0032259', 'methylation'],
 [56, 'GO:0009058', 'biosynthetic process'],
 [52, 'GO:0009408', 'response to heat'],
 [51,
  'GO:0009401',
  'phosphoenolpyruvate-dependent sugar phosphotransferase system'],
 [48, 'GO:0008360', 'regulation of cell shape'],
 [48, 'GO:0006260', 'DNA replication'],
 [47, 'GO:0008033', 'tRNA processing'],
 [46, 'GO:0009252', 'peptidoglycan biosynthetic process'],
 [44, 'GO:0006979', 'response to oxidative stress'],
 [43, 'GO:0045893', 'positive regulation of transcription, DNA-templated'],
 [42, 'GO:0007155', 'cell adhesion'],
 [40, 'GO:0007165', 'signal transduction'],
 [37, 'GO:0006364', 'rRNA processing'],
 [35, 'GO:0009061', 'anaerobic respiration']]

In [106]:
plt.figure(figsize=(8, 6))
N = len(freqslist_with_gonames[:10])
ind = numpy.arange(N)    # the x locations for the groups
width = 0.8
mpl.pyplot.barh(ind, [item[0] for item in freqslist_with_gonames[:10]], width, hold=None)
plt.yticks(ind+width/2., [item[2] for item in freqslist_with_gonames], rotation=0 )
#plt.show()


Out[106]:
([<matplotlib.axis.YTick at 0x136863ad0>,
  <matplotlib.axis.YTick at 0x118986a90>,
  <matplotlib.axis.YTick at 0x136f81f90>,
  <matplotlib.axis.YTick at 0x136f9b890>,
  <matplotlib.axis.YTick at 0x136f9bfd0>,
  <matplotlib.axis.YTick at 0x136fb5750>,
  <matplotlib.axis.YTick at 0x136fb5e90>,
  <matplotlib.axis.YTick at 0x136f86610>,
  <matplotlib.axis.YTick at 0x136f86d50>,
  <matplotlib.axis.YTick at 0x136f9a4d0>],
 <a list of 10 Text yticklabel objects>)

In [ ]:


In [107]:
all_ecoli_with_gene_info_and_GO = []

for item in gene_annotations_dict.iterkeys():
    RCSBID = gene_annotations_dict[item][-2]
    try:
        all_ecoli_with_gene_info_and_GO.append((item, go_terms_dict[RCSBID]))
    except:
        None
        #print("No GO terms for this RCSB ID")

In [108]:
all_ecoli_gos = []
for j in [item[1] for item in all_ecoli_with_gene_info_and_GO]:
    all_ecoli_gos.extend(j)

In [109]:
go_count_all_ecoli = Counter(all_ecoli_gos)
all_ecoli_freqs = []
for item in go_count_all_ecoli:
   all_ecoli_freqs.append((go_count_all_ecoli[item], item))
all_ecoli_freqs = list(reversed(sorted(all_ecoli_freqs)))

In [110]:
all_ecoli_freqslist = []
i = 0
for item in all_ecoli_freqs:
    all_ecoli_freqslist.append(item)

In [111]:
all_ecoli_freqslist_with_gonames = []
for item in all_ecoli_freqslist:
    if go_definitions_dict[item[1]][1] == "namespace: biological_process":
        go_name = go_definitions_dict[item[1]][0].split(" ",1)[1] # Split on first occurrence
        all_ecoli_freqslist_with_gonames.append([item[0], item[1], go_name])

In [112]:
all_ecoli_freqslist_with_gonames[0:10]


Out[112]:
[[662, 'GO:0006810', 'transport'],
 [439, 'GO:0055114', 'oxidation-reduction process'],
 [356, 'GO:0006355', 'regulation of transcription, DNA-templated'],
 [353, 'GO:0008152', 'metabolic process'],
 [345, 'GO:0006351', 'transcription, DNA-templated'],
 [240, 'GO:0006974', 'cellular response to DNA damage stimulus'],
 [163, 'GO:0016310', 'phosphorylation'],
 [149, 'GO:0055085', 'transmembrane transport'],
 [142, 'GO:0005975', 'carbohydrate metabolic process'],
 [112, 'GO:0006811', 'ion transport']]

In [113]:
all_ecoli_freqslist_with_gonames_dict = {}
for item in all_ecoli_freqslist_with_gonames:
    all_ecoli_freqslist_with_gonames_dict[item[2]] = item[0]

In [114]:
collated_go_frequences = []
for item in freqslist_with_gonames:
    print item[-1]
    collated_go_frequences.append([item[-1], item[0], all_ecoli_freqslist_with_gonames_dict[item[-1]]])


transport
oxidation-reduction process
metabolic process
regulation of transcription, DNA-templated
transcription, DNA-templated
cellular response to DNA damage stimulus
phosphorylation
transmembrane transport
carbohydrate metabolic process
ion transport
cellular amino acid biosynthetic process
carbohydrate transport
translation
proteolysis
response to antibiotic
lipopolysaccharide biosynthetic process
phosphorelay signal transduction system
lipid metabolic process
amino acid transport
DNA repair
protein transport
DNA recombination
negative regulation of transcription, DNA-templated
cell division
methylation
biosynthetic process
response to heat
phosphoenolpyruvate-dependent sugar phosphotransferase system
regulation of cell shape
DNA replication
tRNA processing
peptidoglycan biosynthetic process
response to oxidative stress
positive regulation of transcription, DNA-templated
cell adhesion
signal transduction
rRNA processing
anaerobic respiration

In [118]:
list(enumerate(collated_go_frequences))


Out[118]:
[(0, ['transport', 644, 662]),
 (1, ['oxidation-reduction process', 429, 439]),
 (2, ['metabolic process', 349, 353]),
 (3, ['regulation of transcription, DNA-templated', 318, 356]),
 (4, ['transcription, DNA-templated', 308, 345]),
 (5, ['cellular response to DNA damage stimulus', 227, 240]),
 (6, ['phosphorylation', 161, 163]),
 (7, ['transmembrane transport', 149, 149]),
 (8, ['carbohydrate metabolic process', 141, 142]),
 (9, ['ion transport', 108, 112]),
 (10, ['cellular amino acid biosynthetic process', 99, 106]),
 (11, ['carbohydrate transport', 96, 100]),
 (12, ['translation', 90, 104]),
 (13, ['proteolysis', 82, 83]),
 (14, ['response to antibiotic', 80, 88]),
 (15, ['lipopolysaccharide biosynthetic process', 75, 76]),
 (16, ['phosphorelay signal transduction system', 75, 76]),
 (17, ['lipid metabolic process', 72, 72]),
 (18, ['amino acid transport', 71, 72]),
 (19, ['DNA repair', 69, 73]),
 (20, ['protein transport', 63, 68]),
 (21, ['DNA recombination', 63, 96]),
 (22, ['negative regulation of transcription, DNA-templated', 62, 68]),
 (23, ['cell division', 58, 63]),
 (24, ['methylation', 58, 59]),
 (25, ['biosynthetic process', 56, 56]),
 (26, ['response to heat', 52, 53]),
 (27,
  ['phosphoenolpyruvate-dependent sugar phosphotransferase system', 51, 55]),
 (28, ['regulation of cell shape', 48, 48]),
 (29, ['DNA replication', 48, 48]),
 (30, ['tRNA processing', 47, 48]),
 (31, ['peptidoglycan biosynthetic process', 46, 47]),
 (32, ['response to oxidative stress', 44, 45]),
 (33, ['positive regulation of transcription, DNA-templated', 43, 45]),
 (34, ['cell adhesion', 42, 43]),
 (35, ['signal transduction', 40, 40]),
 (36, ['rRNA processing', 37, 37]),
 (37, ['anaerobic respiration', 35, 35])]

In [123]:
collated_go_frequences_all = collated_go_frequences[:]

In [156]:
collated_go_frequences_all = pickle.load( open("./HiSeqOut/Part 5 - functional mapping/collated_go_frequencies_all.p", "r"))

In [173]:
collated_go_frequences = [item for index, item in enumerate(collated_go_frequences_all) if index in [0, 2, 4, 12, 13, 19, 29, 34]]

In [174]:
#collated_go_frequences[4][0] = "cellular amino\nacid biosynthetic process"

In [176]:
collated_go_frequences


Out[176]:
[['transport', 644, 662],
 ['metabolic process', 349, 353],
 ['transcription, DNA-templated', 308, 345],
 ['translation', 90, 104],
 ['proteolysis', 82, 83],
 ['DNA repair', 69, 73],
 ['DNA replication', 48, 48],
 ['cell adhesion', 42, 43]]

In [153]:
#pickle.dump(collated_go_frequences_all, open("./HiSeqOut/Part 5 - functional mapping/collated_go_frequencies_all.p", "wb"))

In [184]:
import matplotlib.patches as mpatches 
figure(figsize=(8.5, 3.5))
N = len(collated_go_frequences)
ind = np.arange(N)    # the x locations for the groups
width = 0.8
all_coli_plot = matplotlib.pyplot.barh(ind, [item[2] for item in collated_go_frequences], width, hold=None, label="All E. coli genes", color="white")
guides_plot = matplotlib.pyplot.barh(ind, [item[1] for item in collated_go_frequences], width, hold=None, color="gray", alpha=1, label="Genes targeted\n by guide library")
plt.yticks(ind+width/2., [item[0] for item in collated_go_frequences], rotation=0 )
plt.legend()
tick_params(axis=u'both', labelsize=16)
legend(loc=1,prop={'size':16})
savefig('HiSeqOut/Part 5 - functional mapping/GoTerms - All Ecoli vs targeted by guides_selectedGOs.pdf', bbox_inches='tight')



In [ ]: