# In [ ]:
import tensorflow as tf
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from collections import Counter
# This is close to a direct translation of my Clojure code into Python.
# k-mer size to use
k = 9
#
# NOTE:
#
# We can reduce the problem space by also taking the reverse complement and
# adding a bit to indicate whether the k-mer was reversed.
#
# Also: build a recurrent network to predict the sequences that come after a
# given k-mer? Look at word2vec, dna2vec, bag of words, skip-gram.
#
# Problem space: the number of distinct k-mer codes. convert_nt assigns each
# position one of five base-5 digits (N, A, C, T, G), hence 5 ** k.
space = 5 ** k
def partition(n, step, coll):
    """Yield successive length-``n`` slices of ``coll``, ``step`` items apart.

    Iteration stops at the first window that would extend past the end of
    ``coll``, so no short trailing slice is ever produced.
    """
    total = len(coll)
    for start in range(0, total, step):
        stop = start + n
        if stop > total:
            # The window would overrun the collection; nothing more to yield.
            return
        yield coll[start:stop]
def get_kmers(k):
    """Return a function that splits a sequence into overlapping ``k``-mers.

    The returned function takes a sequence and returns the generator produced
    by ``partition(k, 1, sequence)``.
    """
    def kmers_of(sequence):
        # A step of 1 makes consecutive windows overlap by k - 1 items.
        return partition(k, 1, sequence)

    return kmers_of
# Base-5 digit for each nucleotide symbol. Built once at import time instead
# of on every call, since convert_nt runs once per character of every k-mer.
_NT_CODES = {"N": 0, "A": 1, "C": 2, "T": 3, "G": 4}


def convert_nt(c):
    """Return the base-5 digit for the nucleotide character ``c``.

    The lookup is case-sensitive. Any value that is not one of the uppercase
    symbols ``N``, ``A``, ``C``, ``T`` or ``G`` maps to 0, the same digit
    as ``N``.
    """
    return _NT_CODES.get(c, 0)
def convert_kmer_to_int(kmer):
    """Encode a k-mer as an integer by reading its nucleotides as base-5 digits.

    Each character is mapped through ``convert_nt``; the first character
    becomes the most significant digit.
    """
    digits = [str(convert_nt(base)) for base in kmer]
    return int("".join(digits), 5)
# Sequences should be 1000bp in length; counts for other lengths are
# multiplied by 1000 / length. This is not the best behavior, but it is what
# exists...
def convert_to_sparse_matrix_previous(sequence):
    """Return the scaled k-mer counts of ``sequence`` as a 1 x ``space`` CSR matrix.

    Column ``j`` holds the number of times the k-mer whose
    ``convert_kmer_to_int`` code is ``j`` occurs in ``sequence``, multiplied
    by ``1000 / len(sequence)``. Values are float64.

    A sequence with no k-mers (including the empty sequence) yields an
    all-zero matrix.
    """
    counts = Counter(map(convert_kmer_to_int, get_kmers(k)(sequence)))
    length = len(sequence)
    # If length isn't 1000, multiply by some number to bring scores closer to
    # expected. The float literal keeps this a true division on Python 2 as
    # well, and the guard avoids dividing by zero for an empty sequence.
    # Because of how sparse this matrix is, I don't think this will actually
    # help much...
    lmul = 1000.0 / length if length else 0.0
    # Build the row straight from the observed k-mers instead of materialising
    # a dense list with one entry for each of the 5 ** k possible codes.
    codes = sorted(counts)
    data = np.array([counts[code] for code in codes], dtype=np.float64) * lmul
    cols = np.array(codes, dtype=np.int64)
    rows = np.zeros(len(codes), dtype=np.int64)
    return csr_matrix((data, (rows, cols)), shape=(1, space))
# Sequences should be 1000bp in length; counts for other lengths are
# multiplied by 1000 / length. This is not the best behavior, but it is what
# exists...
def convert_to_sparse_matrix(sequence):
    """Return the scaled k-mer counts of ``sequence`` as a ``tf.SparseTensor``.

    The tensor has dense shape ``(1, space)``. Entry ``[0, j]`` holds the
    number of times the k-mer whose ``convert_kmer_to_int`` code is ``j``
    occurs in ``sequence``, multiplied by ``1000 / len(sequence)``. Values are
    float64 and indices are listed in increasing column order.

    A sequence with no k-mers (including the empty sequence) yields a tensor
    with no stored entries.
    """
    counts = Counter(map(convert_kmer_to_int, get_kmers(k)(sequence)))
    length = len(sequence)
    # If length isn't 1000, multiply by some number to bring scores closer to
    # expected. The float literal keeps this a true division on Python 2 as
    # well, and the guard avoids dividing by zero for an empty sequence.
    # Because of how sparse this matrix is, I don't think this will actually
    # help much...
    lmul = 1000.0 / length if length else 0.0
    # Work only with the k-mers that actually occur rather than a dense list
    # with one entry for each of the 5 ** k possible codes.
    codes = sorted(counts)
    values = np.array([counts[code] for code in codes], dtype=np.float64) * lmul
    cols = np.array(codes, dtype=np.int64)
    # One (row, col) pair per stored value; every entry lives in row 0.
    # np.column_stack replaces np.mat, which NumPy 2.0 removed.
    indices = np.column_stack((np.zeros_like(cols), cols))
    return tf.SparseTensor(indices, values, (1, space))
# Set TensorFlow's logging threshold to INFO.
# NOTE(review): tf.logging is the TensorFlow 1.x API path; confirm the
# installed TensorFlow version still provides it.
tf.logging.set_verbosity(tf.logging.INFO)