In [ ]:
import tensorflow as tf
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from collections import Counter

# Pretty much translating my clojure directly into python here...

# k-mer size to use: the length of each sliding window taken from a sequence
# (passed to get_kmers / partition below).
k = 9

#
# NOTE!!!!!!!!!!!!!!!!
#
# We can reduce problem space if we get the reverse complement, and add a bit to indicate reversed or not...
#
# Also -- Build a recurrent network to predict sequences that come after a given kmer?
# Look at word2vec, dna2vec, bag of words, skip-gram
#


# Problem space: the number of distinct k-mers over the 5-symbol alphabet
# (N, A, C, T, G). Every k-mer is encoded as an integer in range(0, space),
# so this is also the width of the k-mer count vectors built below.
space = 5 ** k

def partition(n, step, coll):
    """Yield successive length-``n`` slices of ``coll``, advancing ``step`` items each time.

    Only full windows are produced: iteration stops at the first start
    offset whose window would run past the end of ``coll``.
    """
    total = len(coll)
    for start in range(0, total, step):
        stop = start + n
        if stop <= total:
            yield coll[start:stop]
        else:
            # No room left for a complete window; end the generator.
            return
        
def get_kmers(k):
    """Return a function mapping a sequence to its overlapping k-mers.

    The returned function slides a window of length ``k`` along the
    sequence one position at a time (via ``partition``).
    """
    def kmers_of(sequence):
        return partition(k, 1, sequence)

    return kmers_of

# Nucleotide -> base-5 digit. Anything not listed is treated like "N" (0).
_NT_CODES = {"N": 0, "A": 1, "C": 2, "T": 3, "G": 4}


def convert_nt(c):
    """Return the base-5 digit for nucleotide ``c`` (0 for unknown symbols)."""
    try:
        return _NT_CODES[c]
    except KeyError:
        return 0

def convert_kmer_to_int(kmer):
    """Encode ``kmer`` as an integer by reading its nucleotide codes as base-5 digits.

    The first nucleotide is the most significant digit. An empty k-mer
    raises ValueError (there are no digits to parse).
    """
    digits = [str(convert_nt(base)) for base in kmer]
    return int("".join(digits), 5)

# Sequences should be 1000bp in length; counts for other lengths are scaled by 1000 / length.
# This is not the best behavior, but it is what exists...
def convert_to_sparse_matrix_previous(sequence):
    """Return the k-mer count vector of ``sequence`` as a 1 x ``space`` CSR matrix.

    Each overlapping k-mer (window size is the module-level ``k``) is
    encoded as an integer column index and counted. Counts are scaled by
    ``1000 / len(sequence)`` so sequences that are not 1000bp long give
    values closer to those of a 1000bp sequence.

    An empty sequence, or one shorter than ``k``, contains no k-mers and
    yields an all-zero row.
    """
    counts = Counter(map(convert_kmer_to_int, get_kmers(k)(sequence)))
    length = len(sequence)
    # Float literal keeps this a true division even under Python 2; the
    # guard avoids ZeroDivisionError for an empty sequence (no counts to scale).
    scale = 1000.0 / length if length else 0.0

    # Build the sparse row straight from the non-zero counts instead of
    # materialising a dense list with one entry per possible k-mer.
    cols = sorted(counts)
    data = [counts[col] * scale for col in cols]
    rows = [0] * len(cols)
    return csr_matrix((data, (rows, cols)), shape=(1, space), dtype=float)
    
# Sequences should be 1000bp in length; counts for other lengths are scaled by 1000 / length.
# This is not the best behavior, but it is what exists...
def convert_to_sparse_matrix(sequence):
    """Return the k-mer count vector of ``sequence`` as a 1 x ``space`` tf.SparseTensor.

    Each overlapping k-mer (window size is the module-level ``k``) is
    encoded as an integer column index and counted. Counts are scaled by
    ``1000 / len(sequence)`` so sequences that are not 1000bp long give
    values closer to those of a 1000bp sequence.

    An empty sequence, or one shorter than ``k``, contains no k-mers and
    yields a tensor with no stored values.
    """
    counts = Counter(map(convert_kmer_to_int, get_kmers(k)(sequence)))
    length = len(sequence)
    # Float literal keeps this a true division even under Python 2; the
    # guard avoids ZeroDivisionError for an empty sequence (no counts to scale).
    scale = 1000.0 / length if length else 0.0

    # Build the sparse representation straight from the non-zero counts
    # instead of materialising a dense list with one entry per possible
    # k-mer. Columns are sorted so the indices are in row-major order.
    cols = sorted(counts)
    col_idx = np.array(cols, dtype=np.int64)
    # One [row, col] pair per stored value; the single row is row 0.
    # column_stack keeps the (0, 2) shape when there are no k-mers.
    indices = np.column_stack((np.zeros_like(col_idx), col_idx))
    values = np.array([counts[col] for col in cols], dtype=np.float64) * scale
    return tf.SparseTensor(indices, values, (1, space))
    
# Show TensorFlow log messages at INFO level and above.
# NOTE(review): `tf.logging` is the TF 1.x API (tf.compat.v1.logging in TF 2.x) -- confirm the targeted TensorFlow version.
tf.logging.set_verbosity(tf.logging.INFO)