In [3]:
import tensorflow as tf
import numpy as np
import random
from six.moves import xrange  # pylint: disable=redefined-builtin
from Bio import SeqIO
import os
import concurrent.futures
import functools
import math
import pickle
import sys

k = 9

# Size of the encoding space: 5 symbols (N, A, C, T, G) per position.
space = 5 ** k

def partition(n, step, coll):
    # Yield successive windows of length n, advancing by step; partial
    # windows at the end are dropped.
    for i in range(0, len(coll), step):
        if (i + n > len(coll)):
            break  # PEP 479: return/break, never raise StopIteration in a generator
        yield coll[i:i+n]

def get_kmers(k):
    # Non-overlapping k-mers (step == k); frame offsets are applied by the caller.
    return lambda sequence: partition(k, k, sequence)

def convert_nt(c):
    # Map each nucleotide to a base-5 digit; anything unexpected becomes N (0).
    return {"N": 0, "A": 1, "C": 2, "T": 3, "G": 4}.get(c, 0)

def convert_nt_complement(c):
    # Complement mapping (A<->T, C<->G) in the same base-5 digit scheme.
    return {"N": 0, "A": 3, "C": 4, "T": 1, "G": 2}.get(c, 0)

def convert_kmer_to_int(kmer):
    # Encode a k-mer as a base-5 integer.
    return int(''.join(str(x) for x in map(convert_nt, kmer)), 5)

def convert_kmer_to_int_complement(kmer):
    # Encode the reverse complement of a k-mer as a base-5 integer.
    return int(''.join(str(x) for x in reversed(list(map(convert_nt_complement, kmer)))), 5)

def convert_base5(n):
    return {"0": "N", "1": "A", "2": "C", "3": "T", "4": "G"}.get(n, "N")

def convert_to_kmer(kmer):
    # Decode a base-5 integer back to a k-mer string. Note: base_repr drops
    # leading zeros, so k-mers that start with N decode shorter than k.
    return ''.join(map(convert_base5, np.base_repr(kmer, 5)))
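
In [ ]:
# Illustrative sanity check (added, not from the original run): k-mer
# extraction and the base-5 round trip. Any names here are for demonstration.
example = "ACTGACTGA"
print(list(get_kmers(k)(example)))   # ['ACTGACTGA'] -- one full window at k = 9
code = convert_kmer_to_int(example)
print(code, convert_to_kmer(code))   # round-trips back to 'ACTGACTGA'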

In [4]:
def kmer_processor(seq, offset):
    # All k-mers in one reading frame, encoded as base-5 integers.
    return list(map(convert_kmer_to_int, get_kmers(k)(seq[offset:])))

def get_kmers_from_seq(sequence):
    # Collect k-mers from all k reading frames of the sequence and of the
    # reversed sequence. (Note: reversed, not reverse-complemented;
    # convert_kmer_to_int_complement above is unused here.)
    kmers_from_seq = list()

    kp = functools.partial(kmer_processor, sequence)
    for i in map(kp, range(0, k)):
        kmers_from_seq.append(i)

    rev = sequence[::-1]
    kpr = functools.partial(kmer_processor, rev)
    for i in map(kpr, range(0, k)):
        kmers_from_seq.append(i)

    return kmers_from_seq
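
# Illustrative check (added): each sequence yields 2*k k-mer lists -- k reading
# frames of the forward sequence plus k frames of the reversed sequence.
assert len(get_kmers_from_seq("ACTG" * 20)) == 2 * k   # 18 frames when k = 9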

data = list()

def load_fasta(filename):
    # Cache the expensive k-mer extraction to a pickle alongside the notebook.
    data = list()
    file_base_name = os.path.basename(filename)
    picklefilename = file_base_name + ".kmerdict.picklepickle"
    if os.path.isfile(picklefilename):
        print("Loading from pickle")
        with open(picklefilename, "rb") as f:
            data = pickle.load(f)
    else:
        print("Pickle not found, generating k-mer data: " + picklefilename)
        for seq_record in SeqIO.parse(filename, "fasta"):
            data.extend(get_kmers_from_seq(seq_record.seq.upper()))
        with open(picklefilename, "wb") as f:
            pickle.dump(data, f)
    return data
        
def get_kmers_from_file(filename):
    # Unique k-mers across every frame of every record in one FASTA file.
    kmer_list = list()
    for seq_record in SeqIO.parse(filename, "fasta"):
        kmer_list.extend(get_kmers_from_seq(seq_record.seq.upper()))
    return {item for sublist in kmer_list for item in sublist}

all_kmers = set()

# Still slow: parsing is CPU-bound, so the thread pool gives only a modest
# speedup. A process-based variant is sketched after this function.

def find_all_kmers(directory):
    kmer_master = set()
    files = [directory + "/" + f for f in os.listdir(directory)]
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        for i in executor.map(get_kmers_from_file, files):
            kmer_master.update(i)
            print("Total unique kmers: " + str(len(kmer_master)))
    return kmer_master
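
# Hedged sketch (added): since the parsing is CPU-bound, a process pool is
# more likely to help than threads (the GIL serializes the work above).
# find_all_kmers_mp is a hypothetical variant, not part of the original run;
# it assumes get_kmers_from_file is importable by the worker processes.
def find_all_kmers_mp(directory):
    files = [directory + "/" + f for f in os.listdir(directory)]
    kmer_master = set()
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        for file_kmers in executor.map(get_kmers_from_file, files):
            kmer_master |= file_kmers
            print("Total unique kmers: " + str(len(kmer_master)))
    return kmer_master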

# These were run at work on a smaller sample of files, so the subset
# save/load is kept for reference but commented out:
# with open("all_kmers_subset.txt", "w") as f:
#     for s in all_kmers:
#         f.write(str(s) + "\n")
#
# all_kmers = list()
# with open("all_kmers_subset.txt", "r") as f:
#     for line in f:
#         all_kmers.append(int(line.strip()))

# All k-mers in the dataset; already calculated, so load rather than recompute.
with open("all_kmers.p", "rb") as f:
    all_kmers = set(pickle.load(f))

# Bidirectional maps between base-5 k-mer codes and dense embedding indices.
kmer_dict = dict()
reverse_kmer_dict = dict()

for a, i in enumerate(all_kmers):
    kmer_dict[i] = a
    reverse_kmer_dict[a] = i
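
In [ ]:
# Illustrative round trip through the two dicts (added): a base-5 kmer code
# maps to a dense index and back.
code = next(iter(all_kmers))
idx = kmer_dict[code]
assert reverse_kmer_dict[idx] == code
print(idx, convert_to_kmer(code))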

In [186]:
def embedding_kmer_generator(directory, window_size):
    # Stream skip-gram (target, context) index pairs from every FASTA file.
    files = [directory + "/" + f for f in os.listdir(directory)]
    random.shuffle(files)

    # Context offsets; 0 is excluded so a kmer is never its own context.
    window_range = list(range(-window_size, 0))
    window_range.extend(list(range(1, window_size + 1)))

    def gen_training_data(idata, window_size):
        for data in idata:
            # Stay window_size away from both ends so data[i + x] is in bounds
            # (the original len(data) + 1 - window_size bound let i + x run one
            # past the end).
            for i in xrange(window_size, len(data) - window_size):
                for x in window_range:
                    yield kmer_dict[data[i]], kmer_dict[data[i + x]]

    for f in files:
        yield from gen_training_data(load_fasta(f), window_size)

In [187]:
a = embedding_kmer_generator("training-files/", 4)
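
In [ ]:
# Illustrative (added): peek at a few (target, context) index pairs without
# consuming the whole stream.
import itertools
print(list(itertools.islice(a, 5)))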

In [197]:
def my_input_fn():
    window_size = 4  # matches the generator created above
    kmer_gen = functools.partial(embedding_kmer_generator, "training-files/", window_size)

    # NB: the generator yields scalar ints; if from_generator rejects them
    # against shape (1,), declare tf.TensorShape([]) for both outputs instead.
    ds = tf.data.Dataset.from_generator(kmer_gen,
                                        (tf.int64,
                                         tf.int64),
                                        (tf.TensorShape(1),
                                         tf.TensorShape(None)))
                                        
    # Numbers reduced to run on my desktop
    #ds = ds.repeat(4)
    #ds = ds.prefetch(5000000)
    #ds = ds.shuffle(buffer_size=500000)
    #ds = ds.batch(8000)
    
    ds = ds.repeat(1)
    ds = ds.prefetch(1000)
    ds = ds.shuffle(buffer_size=500)
    ds = ds.batch(250)
    
    def add_labels(arr, lab):
        # Wrap the target kmer indices in a feature dict for the Estimator API.
        return {"x": arr}, lab

    ds = ds.map(add_labels)
    iterator = ds.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels
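
In [ ]:
# Hedged sketch (added, not the original model): one conventional way to
# consume these batches is a word2vec-style NCE loss in TF1. embedding_size
# and num_sampled are illustrative choices; vocabulary_size comes from
# kmer_dict.
vocabulary_size = len(kmer_dict)
embedding_size = 128   # assumption
num_sampled = 64       # assumption

features, labels = my_input_fn()
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

embed = tf.nn.embedding_lookup(embeddings, tf.reshape(features["x"], [-1]))
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
                   biases=nce_biases,
                   labels=tf.reshape(labels, [-1, 1]),
                   inputs=embed,
                   num_sampled=num_sampled,
                   num_classes=vocabulary_size))
train_op = tf.train.GradientDescentOptimizer(1.0).minimize(loss)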

In [219]:
my_input_fn()
Out[219]:
({'x': <tf.Tensor 'IteratorGetNext_19:0' shape=(?, 1) dtype=int64>},
 <tf.Tensor 'IteratorGetNext_19:1' shape=<unknown> dtype=int64>)
