In [1]:
import tensorflow as tf
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from collections import Counter
# Pretty much translating my Clojure directly into Python here...
# k-mer size to use
k = 9
#
# NOTE / TODO
#
# We could reduce the problem space by using the reverse complement and adding a bit to indicate reversed or not...
#
# Also -- build a recurrent network to predict the sequence that comes after a given k-mer?
# Look at word2vec, dna2vec, bag of words, skip-gram.
#
# Problem space: number of distinct k-mer codes, 5 symbols (N, A, C, T, G) per position
space = 5 ** k
def partition(n, step, coll):
    """Yield successive length-``n`` slices of ``coll``, advancing ``step`` items each time.

    Only complete windows are produced: iteration ends at the first start
    offset whose window would run past the end of ``coll``.
    """
    total = len(coll)
    for start in range(0, total, step):
        stop = start + n
        if stop > total:
            # A partial trailing window is never yielded.
            return
        yield coll[start:stop]
def get_kmers(k):
    """Return a function that splits a sequence into its overlapping k-mers.

    The returned callable takes a sequence and hands back the generator
    produced by ``partition(k, 1, sequence)``.
    """
    def kmers_of(sequence):
        return partition(k, 1, sequence)

    return kmers_of
# Base-5 digit assigned to each nucleotide; 0 doubles as the "unknown" code.
_NT_CODES = {"N": 0, "A": 1, "C": 2, "T": 3, "G": 4}


def convert_nt(c):
    """Map a nucleotide character to its base-5 digit.

    Args:
        c: A single nucleotide character. Case is ignored, so lowercase
            bases such as ``"a"`` encode the same as ``"A"``.

    Returns:
        int: 1, 2, 3 or 4 for A, C, T, G respectively; 0 for ``"N"`` and
        for any symbol that is not in the table.
    """
    # Normalise case so lowercase bases are not mistaken for N (code 0).
    # Non-string inputs are looked up unchanged and fall back to 0.
    key = c.upper() if hasattr(c, "upper") else c
    return _NT_CODES.get(key, 0)
def convert_kmer_to_int(kmer):
    """Encode a k-mer as an integer by reading its nucleotide codes as base-5 digits.

    Each element of ``kmer`` is passed through ``convert_nt``; the resulting
    digits are concatenated (first element most significant) and parsed
    with ``int(..., 5)``.
    """
    digits = [str(convert_nt(base)) for base in kmer]
    return int("".join(digits), 5)
# Sequences should be 1000bp in length; counts for other lengths are multiplied
# by 1000 / length to bring them closer to the expected scale.
# This is not the best behavior, but it is what exists...
def convert_to_sparse_matrix_previous(sequence):
    """Build a 1 x ``space`` SciPy CSR row of length-scaled k-mer counts.

    Args:
        sequence: Nucleotide string (any sliceable sequence of bases).

    Returns:
        scipy.sparse.csr_matrix: shape ``(1, space)``, float64. Column ``j``
        holds ``count(j) * 1000 / len(sequence)``, where ``count(j)`` is the
        number of k-mers of ``sequence`` that ``convert_kmer_to_int`` maps
        to ``j``. An empty sequence yields an all-zero row.
    """
    c = Counter(map(convert_kmer_to_int, get_kmers(k)(sequence)))
    length = len(sequence)
    # If length isn't 1000, multiply by some number to bring scores closer to expected.
    # 1000.0 keeps this a true division on every Python version; the guard
    # avoids dividing by zero when there is nothing to scale.
    lmul = 1000.0 / length if length else 1.0
    # Because of how sparse this matrix is, I don't think this will actually help much...
    # Build the row from the observed k-mers only, instead of materialising
    # a dense list with one entry per possible k-mer code.
    keys = sorted(c)
    cols = np.array(keys, dtype=np.int64)
    rows = np.zeros(len(keys), dtype=np.int64)
    data = np.array([c[key] for key in keys], dtype=np.float64) * lmul
    return csr_matrix((data, (rows, cols)), shape=(1, space))
# Sequences should be 1000bp in length; counts for other lengths are multiplied
# by 1000 / length to bring them closer to the expected scale.
# This is not the best behavior, but it is what exists...
def convert_to_sparse_matrix(sequence):
    """Build a 1 x ``space`` ``tf.SparseTensor`` of length-scaled k-mer counts.

    Args:
        sequence: Nucleotide string (any sliceable sequence of bases).

    Returns:
        tf.SparseTensor: dense shape ``(1, space)`` with float64 values.
        Entry ``[0, j]`` holds ``count(j) * 1000 / len(sequence)``, where
        ``count(j)`` is the number of k-mers of ``sequence`` that
        ``convert_kmer_to_int`` maps to ``j``. An empty sequence yields a
        tensor with no stored values.
    """
    c = Counter(map(convert_kmer_to_int, get_kmers(k)(sequence)))
    length = len(sequence)
    # If length isn't 1000, multiply by some number to bring scores closer to expected.
    # 1000.0 keeps this a true division on every Python version; the guard
    # avoids dividing by zero when there is nothing to scale.
    lmul = 1000.0 / length if length else 1.0
    # Because of how sparse this matrix is, I don't think this will actually help much...
    # Build the tensor from the observed k-mers only, instead of going through
    # a dense list with one entry per possible k-mer code.
    keys = sorted(c)
    # One [row, col] pair per stored value; every value lives in row 0 and the
    # columns are ascending. reshape keeps the array 2-D when there are no k-mers.
    indices = np.array([[0, key] for key in keys], dtype=np.int64).reshape(-1, 2)
    values = np.array([c[key] for key in keys], dtype=np.float64) * lmul
    return tf.SparseTensor(indices, values, (1, space))
# Set TensorFlow's logging threshold to INFO.
tf.logging.set_verbosity(tf.logging.INFO)
In [83]:
# Display z. NOTE(review): z is assigned by the convert_to_sparse_matrix(seq_main) cell further down, so this only works after that cell has run.
z
Out[83]:
In [97]:
# Sum every stored value of the sparse k-mer tensor by feeding it through a sparse placeholder.
z = convert_to_sparse_matrix(seq_main)
# The placeholder dtype must match the tensor being fed (`tf.int` is not a TensorFlow dtype).
x = tf.sparse_placeholder(z.dtype)
y = tf.sparse_reduce_sum(x)
# l1 = tf.contrib.layers.fully_connected(x, 62, tf.nn.relu)
# correct_pred = tf.argmax(l1, 1)
# accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float64))
with tf.Session() as sess:
    # A graph tensor cannot be used as a feed value: evaluate z to a
    # SparseTensorValue first, then feed that into the placeholder.
    z_value = sess.run(z)
    print(sess.run(y, feed_dict={x: z_value}))
In [14]:
# Take the first k-mer of seq_main. NOTE(review): seq_main is assigned in a cell further down, so that cell must run first.
a = next(get_kmers(k)(seq_main))
In [15]:
# Show the integer code of the k-mer stored in `a`.
convert_kmer_to_int(a)
Out[15]:
In [54]:
for i in sorted(c.keys()):
In [51]:
# Build a 1-D sparse tensor of raw k-mer counts for seq_main.
c = Counter(map(convert_kmer_to_int, get_kmers(k)(seq_main)))
keys = tuple(sorted(c.keys()))
vals = tuple(c.get(x) for x in keys)
# tf.SparseTensor needs 2-D indices (one [position] row per stored value) and a
# 1-D dense_shape, so wrap each key and the size in a list.
tf.SparseTensor(indices=[[key] for key in keys], values=vals, dense_shape=[space])
In [49]:
# Display `i`, the loop variable left bound by one of the `for i in ...` cells.
i
Out[49]:
In [5]:
seq_main = "CTCTGGAAAGGAGATTCGGCAGTGCGAAGGCGAGGCCCGCAAGGACGACCAGCCAGATGAGAATATTCTTGAACGGCGACAACCTGGGCATATTTCGAAATGCTCCATCCGCCGGTCTCTGCCGGTCGCGGCCCTTGCCCTAACCACGGTGATTTTGGCCTCGCGTCGCTCCAAAATCAAACCGTGATCAATGCCGATCGAAGGCGCGGTATGCGGGCGAAAACCGCGCACGCTTCTTCAAGTACCCGCGCCGGGGTTACTCCTTGACGGGCTCGCCCTTCACACGAACCTCGGAGACGCCGCTGCGCACGACGCGGACCTTGATGCCCTCGGCGATCTCCACTTCGAGCTCCGTATCGTCGACGACCTTGGTCACCTTGCCGACAATGCCGCCGCCGGTGACGACCTGGTCGCCGCGCCGGATGTTCTTCAGGAGCTCTTCGCGGCGCTTCATCTGCGCACGCTGCGGCCGGATGATCAGGAAATACATCACCACGAAGATCAGCAGGAATGGCAGGATGGACATCAGAATGTCGGCGCCGCCGCCCCAGGGGCCGCCGTCTGCGCGAAAGCTTCGGTAATAAACATCGATCACTCCTTGAGTTCAAATTGCGCGCTTGCCCCCGCGGCAAACCTGCCGGAATATAGGCAAGCCGTCCCGTAACACAAATCGTCGGTACACTTCCCCGTTTCTCCTGCCTCTGGCACAAATTCCGCAGCAGGAGAACCCCCTGGTTGCAGGCTGCCGGTCTTTTCCAGCGCAAACCGCCGTGCTACCGAGAAAAACGCCGCCGGCGGCAGCTTCAACGGATTCGACCGGAGGATGAACGTCGCGGCGATTCAAGGATTTGGGTGGGCTGACGCACGCCCGTTATCGCGCGGGTCGGCCGCCCAGTTCGAAATTCAGCCTGCCGGAGATACATGAAATGCCCGAAAGCAAGATCGACGTCCTGCTCAACGAAATACAGAAGCTTTCGGCCGCGATGGAGCGCATCGCCGGACCGGCATATGCCGTCAACAATTGGCATGAGGCGGAGTGTTTCGTCTGGGCACCGGCCACGCGCCACCTGCAGCCCGTCCCGAGGCCGAATCGCATCGACCTCGCGCTCATCGCCGGCGTCGACCATGTCCGCGACATTCTCTTCGACAACACGCTCCGCTTCGCCGAAGGCTATCCGGCGAACAACGTGCTCCTGTGGGGCGCCCGCGGCATGGGCAAATCGTCACTGGTCAAGGCGGTCCACGCAAAGGTCGCCCACGACACCGGCAGCGCAATCAAGCTTGTCGAAGTACACCGGGAGGATATCGCCACCCTGCCCGTGCTGATGGAAATCCTGAAGGCGGCGCCGATGCCCGTGATCGTCTTCTGCGATGATCTCTCCTTCGATCACGACGATACCTCCTACAAGTCGCTGAAGGCGGTTCTCGACGGCGGCGTCGAGGGGCGCCCGGCAAACGTTCTGCTCTATGCGACGTCCAACCGCAGACACCTGCTTCCCCGCAACATGATGGAAAATGAACAATCCACCGCCATTAACCCCTCGGAGGCCGTCGAGGAAAAAGTGTCGCTATCCGACCGCTTCGGGCTATGGCTGGGTTTCTACAAGTGCAGCCAGGACGACTATCTGGCGATGGTCGACGGGTATGCGCAGTACTTCAAATTGCCTCTCGAGCCCGAAGCGCTGCATGCCGAGGCTCTTGAATGGGCGACGACGCGAGGATCGAGGTCCGGCCGCGTCG"
seq_psyma = "GAACACCGGTACGGCGCCGAGCGCATCGACCTTCGACAGCCTGCTCGACAAGGGACAGGCCTCAGCCACCGATATTTGGTCACGTGCCTCCTGGCCGGTCGACATCGTCACCGGCGTCGGCGGCATGATGGTGATCGGCGCGAGCTTCATCGTCGCCGCGATCGGCTATATCGTCTCGCTTTACGCGCGGCTGGCGCTTGCCATCGTGCTCGCGATTGGACCAATTTTCGTGGCGCTCGCCATGTTTCAGGCGACGCGGCGCTTCACGGAGGCATGGATCGGCCAGCTTGCGAACTTTGTGATCCTCCAGGTCCTCGTCGTCGCCGTCGGCTCTCTACTGATCACCTGCATCGACACCACCTTCGCGGCGATCGACGGATATAGCGATGTGCTGATGCGGCCGATCGCACTCTGCGCCATCTGCCTCGCGGCTCTCTATGTCTTCTATCAACTCCCGAACATCGCCTCGGCGCTTGCCGCCGGCGGCGCGTCGTTGACCTACGGCTACGGCGCCGCACGCGACGCCCACGAAAGCACGCTCGCCTGGGCGGCTTCCCATACCGTCCGTGCGGCCGGACGTGGTGTCCGTGCCGTTGGCCGAACCTTCACCTCAAAAGGCTCCGGATCATGACGCTTTTCGCACGAACAAGAGAAAGGCTTTCCAGGATTAATCAGAACGTTCCGCTGCTTTGCGTTGCGGCGATCTTAAGCGGTTGCGCATCGATGACCTATCCGCTCCCGAAATGTGACGGCTATTCGCGCCGGCCCCTCAATCGATCGATGTGGCAGTGGGAAGACAATAGCAACTTCAAGCTGAAACAGTCCGATGCGCGACCGGCGGCCTCTCAGTCCGTCGCCACCGCTTATGCCGGCGAGGGCAGGGAATTTCCCGCCTTCGCACATCTCGACATCGACGCATCCTATCGTCCTTGCGAGGGTTGACTCGATGGTCTCGGCGGACGAACTCAAGACATACTTCGAAAAGGCGCGACGCTTCGATCAGGACCGCGTGATCCAGGT"
seq_psymb = "cgcCGCGGCTGCGGTTCAGCGCCAGCTCCAGATTGTCCCAGACCGTATGGTTCTCGAAGACGGTCGGCTTCTGGAACTTGCGGCCGATGCCGAGCTCGGCGATTGCCGCTTCGTCTTTCTTGGTGAGGTCGATGTCGCCCTTGAAGAAGACCTCGCCCTCGTCCGGCCGCGTCTTGCCGGTGATGATGTCCATCATCGTCGTCTTGCCGGCGCCATTGGGGCCGATGATCGCGCGCAGTTCCCCCGGCTCTACGACGAAGGAGAGCGAGTTTAGCGCCTTGAAGCCATCGAAGGAGACGGAGACCCCATCGAGATAGAGCAGGTTCCTGGGTTTCTTTCCGGTCATGGCGATCACTCCGCGGCCACCGTTTCGGCGTCCGCAAGGCTCGCCGCTTTTTCGCTCTCGCTTTCCTTCCGGGCCGCCGCGTGGGATGTGCGCCGGCTTGCGAGATAGCTCTGCGCCGTGCCGACCACGCCCTTCGGCAGGAAAAGCGTGACGAGGACGAAGAGCCCGCCGAGCGCAAAGAGCCAGAATTCGGGGAAGGCGGCGGTGAATATGCTTTTTCCGCCGTTGACGAGGATCGCGCCGACGATCGGTCCGATCAGCGTGCCGCGCCCGCCGACAGCCGTCCATATGACCACCTCGATCGAATTGGCGGGGGCGAACTCGCCCGGATTGATGATGCCGACTTGCGGCACGTAGAGCGCGCCGGCGACGCCCGCCATCATTGCCGAGACCGTGAAGGCGAAGAGCTTCATGTGCTCGACGCGATAGCCGAGAAAGCGTGTGCGGCTTTCCGCGTCGCGCAGCGCCACCAGCACCTTGCCGAATTTCGAGCGGACGATGCCCGAGGTGACGACGAGCGAAACGGCAAGCGCCAGCGCGGAGGCTGCAAAGAGTGCCGCACGCGTTCCGTCGGCCTGGATGTTGAAGCCGAGGATGTCCTTGAAATCGGTGAGCCCGTTATTGCCGCCGAAGCCCATGTCGTTGCGGAAGAAGGCGAGCAGCAGCGCATAGGTCATCGCCTGGGTGATGATCGAGAGATAGACCCCGTTGACCCGCGAGCGGAAGGCGAACCAGCCGAAGACGAAGGCAAGCAGGCCCGGCACCAGCACCACCATCAGCGCTGCGAACCAGAACATGTCGAAGCCGTACCAGAACCAGGGCAGCTCCTTCCAGTTGAGAAAGACCATGAAGTCCGGCAGCAGCGGATTGCCGTAGGAGCCGCGTGCGCCGATCTGGCGCATCAGATACATGCCCATGGCATAGCCGCCGAGCGCGAAGAAGGCCGCATGCCCCAGCGAGAGGATGCCGCAGAA"
In [41]:
# Display the full seq_main string.
seq_main
Out[41]:
In [42]:
# Display the first character of seq_main.
seq_main[0]
Out[42]:
In [51]:
# Print every overlapping 10-mer of seq_main, one per line.
# The loop variable stays named `i` because another cell displays it.
for i in partition(10, 1, seq_main):
    print(i)
In [55]:
# Python has no Clojure-style `\A` character literal; pass the base as a one-character string.
convert_nt("A")
In [56]:
# Python has no Clojure-style `\A` character literal; a one-character string is the equivalent.
"A"
In [135]:
Out[135]:
In [ ]: