In [ ]:
import tensorflow as tf
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from collections import Counter
import collections
import random
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import Bio
from Bio import SeqIO
import os
import concurrent.futures
import functools
from functools import partial
import math
import threading
import time
from random import shuffle
import pickle
import tempfile
import ntpath
import os.path

# k-mer size to use
k = 9

#
# NOTE!
#
# Could we shrink the problem space by collapsing each k-mer with its reverse complement
# and adding a strand bit? Not really -- the strand bit doubles the space right back up.
#
# Also -- build a recurrent network to predict the sequence that follows a given k-mer?
# Look at word2vec, dna2vec, bag of words, skip-gram.
#

# Problem space
space = 5 ** k

def partition(n, step, coll):
    # Yield successive windows of length n from coll, advancing by step;
    # a trailing partial window is dropped.
    for i in range(0, len(coll), step):
        if (i+n > len(coll)):
            break  # don't yield a partial window
        yield coll[i:i+n]
        
def get_kmers(k):
    return lambda sequence: partition(k, k, sequence)
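
# A minimal sanity check of the k-mer windowing (illustrative toy sequence, not from the data):
# non-overlapping 3-mers of "ACTGACTG" are "ACT" and "GAC"; the trailing "TG" is dropped.
assert list(get_kmers(3)("ACTGACTG")) == ["ACT", "GAC"]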

def convert_nt(c):
    return {"N": 0, "A": 1, "C": 2, "T": 3, "G": 4}.get(c, 0)

def convert_nt_complement(c):
    return {"N": 0, "A": 3, "C": 4, "T": 1, "G": 2}.get(c, 0)

def convert_kmer_to_int(kmer):
    return int(''.join(str(x) for x in (map(convert_nt, kmer))), 5)

def convert_kmer_to_int_complement(kmer):
    return int(''.join(str(x) for x in reversed(list(map(convert_nt_complement, kmer)))), 5)

def convert_base5(n):
    return {"0": "N", "1": "A", "2": "C", "3": "T", "4": "G"}.get(n,"N")

def convert_to_kmer(kmer):
    return ''.join(map(convert_base5, str(np.base_repr(kmer, 5))))
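
# Round-trip sanity check of the base-5 k-mer encoding (illustrative 9-mer; any k-mer not
# starting with "N" works, since leading zeros are lost in the integer representation):
example_kmer = "ACTGACTGA"
assert convert_to_kmer(convert_kmer_to_int(example_kmer)) == example_kmer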

# Not using sparse tensors anymore.
   
tf.logging.set_verbosity(tf.logging.INFO)

# Get all k-mers, in order: non-overlapping windows of size k for each of the k reading-frame offsets (0..k-1)
# The same is done for the reversed sequence (note: reversed only, not reverse-complemented)

def kmer_processor(seq,offset):
    return list(map(convert_kmer_to_int, get_kmers(k)(seq[offset:])))

def get_kmers_from_seq(sequence):
    kmers_from_seq = list()

    kp = functools.partial(kmer_processor, sequence)
    
    for i in map(kp, range(0,k)):
        kmers_from_seq.append(i)

    rev = sequence[::-1]
    kpr = functools.partial(kmer_processor, rev)
    
    for i in map(kpr, range(0,k)):
        kmers_from_seq.append(i)
            
#    for i in range(0,k):
#        kmers_from_seq.append(kmer_processor(sequence,i))
#    for i in range(0,k):
#        kmers_from_seq.append(kmer_processor(rev, i))
    return kmers_from_seq
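
# Shape check on a toy string (illustrative): each sequence yields 2*k lists of k-mer ints --
# one per reading frame for the forward sequence, plus one per frame for its reversed copy.
assert len(get_kmers_from_seq("ACGT" * 30)) == 2 * k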

data = list()

def load_fasta(filename):
    data = dict()
    file_base_name = ntpath.basename(filename)
    picklefilename = file_base_name + ".picklepickle"
    if os.path.isfile(picklefilename):
        print("Loading from pickle: " + filename)
        data = pickle.load(open(picklefilename, "rb"))
    else:
        print("File not found, generating new sequence: " + picklefilename)
        for seq_record in SeqIO.parse(filename, "fasta"):
            data.update({seq_record.id:
                         get_kmers_from_seq(seq_record.seq.upper())})
        pickle.dump(data, open(picklefilename, "wb"))
    return(data)
        
def get_kmers_from_file(filename):
    kmer_list = list()
    for seq_record in SeqIO.parse(filename, "fasta"):
        kmer_list.extend(get_kmers_from_seq(seq_record.seq.upper()))
    return set([item for sublist in kmer_list for item in sublist])

all_kmers = set()

# Still slow -- the thread pool below mostly overlaps file I/O; the k-mer conversion
# itself is CPU-bound, so a ProcessPoolExecutor might parallelize it better.

def find_all_kmers(directory):
    kmer_master_set = set()
    files = [directory + "/" + f for f in os.listdir(directory)]
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        for i in executor.map(get_kmers_from_file, files):
            kmer_master_set.update(i)
            print("Total unique kmers: " + str(len(kmer_master_set)))
    return kmer_master_set

def get_categories(directory):
    data = list()
    files = os.listdir(directory)
    for filename in files:
        for seq_record in SeqIO.parse(directory + "/" + filename, "fasta"):
            data.append(seq_record.id)
    data = sorted(list(set(data)))
    return(data)

def training_file_generator(directory):
    files = [directory + "/" + f for f in os.listdir(directory)]
    random.shuffle(files)
    def gen():
        nonlocal files
        if (len(files) == 0):
            files = [directory + "/" + f for f in os.listdir(directory)]
            random.shuffle(files)
        return(files.pop())
    return gen
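
# Example usage (illustrative; assumes a "training-files/" directory is present):
# filegen = training_file_generator("training-files/")
# filegen()  # returns one shuffled path per call, refilling and reshuffling once the pool is empty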

def gen_random_training_data(input_data, window_size):
    # Pick a random replicon, a random reading frame, and a random centre position,
    # then return a window of 2*window_size + 1 k-mer ids (via kmer_dict) plus the replicon name.
    rname = random.choice(list(input_data.keys()))
    rdata = random.choice(input_data[rname])
    idx = random.randrange(window_size + 1, len(rdata) - window_size - 1)
    tdata = list()
    for i in range(idx - window_size - 1, idx + window_size):
        if (i < 0): continue
        if (i >= len(rdata)): break
        if type(rdata[idx]) == list: break
        if type(rdata[i]) == list: break
        tdata.append(kmer_dict[rdata[i]])
    return tdata, rname

# Current state: each training batch comes from a single FASTA file (usually one strain).
# That is acceptable as long as the batch size is large.
# Reading the FASTA files needs to be faster, though -- maybe pyfaidx or something similar?

# Define the one-hot dictionary...

replicons_list = get_categories("training-files/")

oh = dict()
a = 0
for i in replicons_list:
    oh[i] = tf.one_hot(a, len(replicons_list))
    a += 1
    
oh = dict()
a = 0
for i in replicons_list:
    oh[i] = a
    a += 1
    
# The two mappings above are overwritten; only this hard-coded 3-class
# (Main / pSymA / pSymB) one-hot mapping survives, and it is only referenced
# by the first train_input_fn below (which is itself later redefined).
oh = dict()
oh['Main'] = [1.0, 0.0, 0.0]
oh['pSymA'] = [0.0, 1.0, 0.0]
oh['pSymB'] = [0.0, 0.0, 1.0]


def generate_training_batch(data, batch_size, window_size):
    training_batch_data = list()
    while len(training_batch_data) < batch_size:
         training_batch_data.append(gen_random_training_data(data, 
                                                             window_size))
    return training_batch_data

def train_input_fn():
    # (Superseded by the embedding-based train_input_fn below.)
    rdata = generate_training_batch(training_data, 1, window_size)[0]
    return rdata[0], oh[rdata[1]]
    # return {"train_input": rdata[0]}, oh[rdata[1]]

batch_size = 1024
embedding_size = 128
window_size = 7

replicons_list = get_categories("training-files/")

filegen = training_file_generator("training-files/")

repdict = dict()
a = 0
for i in replicons_list:
    repdict[i] = a
    a += 1

def train_input_fn(data):
    # Builds a single-example batch: the embedded k-mer window as features ('x')
    # and the replicon class index as the label.
    tbatch = generate_training_batch(data, 1, window_size)
    dat = {'x': tf.convert_to_tensor([tf.convert_to_tensor(get_kmer_embeddings(tbatch[0][0]))])}
    lab = tf.convert_to_tensor([repdict[tbatch[0][1]]])
    return dat, lab

def test_input_fn(data):
    tbatch = generate_training_batch(data, 1, window_size)
    dat = {'x': tf.convert_to_tensor([tf.convert_to_tensor(get_kmer_embeddings(tbatch[0][0]))])}
    lab = tf.convert_to_tensor([repdict[tbatch[0][1]]])
    return dat, lab

def train_input_fn_raw(data):
    tbatch = generate_training_batch(data, 1, window_size)
    dat = {'x': (get_kmer_embeddings(tbatch[0][0]))}
    lab = [repdict[tbatch[0][1]]]
    return dat, lab

# Because this was run at work on a smaller sample of files....
# with open("all_kmers_subset.txt", "w") as f:
#     for s in all_kmers:
#         f.write(str(s) +"\n")

sess = tf.Session()

# Because this was run at work on a smaller sample of files....
all_kmers = list()
# with open("all_kmers_subset.txt", "r") as f:
#     for line in f:
#         all_kmers.append(int(line.strip()))

all_kmers = pickle.load(open("all_kmers.p", "rb"))

all_kmers = set(all_kmers)
len(all_kmers)
# len(data)

# all_kmers = set([item for sublist in data for item in sublist])
unused_kmers = set(range(0, space)) - all_kmers

kmer_dict = dict()
reverse_kmer_dict = dict()

a = 0
for i in all_kmers:
    kmer_dict[i] = a
    reverse_kmer_dict[a] = i
    a += 1
    
kmer_count = len(all_kmers)

[len(all_kmers), len(unused_kmers), space]


Out[ ]:
[269132, 1683993, 1953125]

In [ ]:
all_kmers = find_all_kmers("kmer-files/")


Total unique kmers: 261991
Total unique kmers: 262148
Total unique kmers: 262174
Total unique kmers: 262207
Total unique kmers: 262257
Total unique kmers: 262257
Total unique kmers: 262257
Total unique kmers: 262285
Total unique kmers: 262311
Total unique kmers: 262339
Total unique kmers: 262339
Total unique kmers: 262367
Total unique kmers: 262367
Total unique kmers: 262381
Total unique kmers: 262399
Total unique kmers: 262423
Total unique kmers: 262447
Total unique kmers: 262469

In [2]:
# This fn now generates all possible combinations of training data....

def gen_training_data(input_data, window_size):
    total_data = list()

    for name in input_data.keys():   # replicon name ("name" avoids shadowing the global k-mer size k)
        for kdata in input_data[name]:
            for i in range(window_size + 1, len(kdata) - window_size):
                kentry = list()
                for x in range(i - window_size - 1, i + window_size):
                    kentry.append(kmer_dict[kdata[x]])
                total_data.append([kentry, name])
    return total_data

In [ ]:


In [3]:
feature_columns = [tf.feature_column.numeric_column("x", shape=[15,128])]

embeddings = np.load("final_embeddings.npy")

def get_kmer_embeddings(kmers):
    a = list() # numpy.empty(128 * 15)
    for k in kmers:
        a.append(embeddings[k])
    return a
    #return np.hstack(a)
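
# Shape check (illustrative; assumes final_embeddings.npy holds one 128-d row per k-mer id):
# a window of 2*window_size + 1 = 15 k-mer ids becomes a 15 x 128 matrix,
# matching the [15, 128] numeric feature column above.
# np.shape(get_kmer_embeddings(range(15)))  # -> (15, 128)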

def gen_training_data_generator(input_data, window_size, repdict):
    # Yields (embedded window of 2*window_size + 1 = 15 k-mers, [class index]) pairs
    # for every position in every reading frame of every replicon.
    for name in input_data.keys():
        for kdata in input_data[name]:
            for i in range(window_size + 1, len(kdata) - window_size):
                kentry = list()
                for x in range(i - window_size - 1, i + window_size):
                    kentry.append(kmer_dict[kdata[x]])
                yield(get_kmer_embeddings(kentry), [repdict[name]])

# Not infinite: one shuffled pass over the files in directory
def kmer_generator(directory, window_size):
    files = [directory + "/" + f for f in os.listdir(directory)]
    random.shuffle(files)
    
    replicons_list = get_categories(directory)  # was hard-coded to "training-files/"
    repdict = dict()
    a = 0
    for i in replicons_list:
        repdict[i] = a
        a += 1
    
    for f in files:
        yield from gen_training_data_generator(load_fasta(f), window_size, repdict)
        
# Plan to use tf.data.Dataset.from_generator
# ds = tf.contrib.data.Dataset.list_files("training-files/").map(tf_load_fasta)
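
# Illustrative peek at one yielded example (assumes the pickles and embeddings are available):
# gen = kmer_generator("training-files/", window_size)
# features, label = next(gen)
# np.shape(features)  # -> (15, 128); label is a one-element list such as [14]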

In [13]:
def my_input_fn():
    kmer_gen = functools.partial(kmer_generator, "training-files/", window_size)

    # Each generator element is a (15 x 128 float window, int64 label) pair.
    ds = tf.data.Dataset.from_generator(kmer_gen, 
                                        (tf.float32,
                                         tf.int64),
                                        (tf.TensorShape([15,128]),
                                         tf.TensorShape(None)))
    ds = ds.repeat(5)
    ds = ds.prefetch(300000)  # note: prefetch() is usually placed last, after batch()
    ds = ds.shuffle(buffer_size=150000)
    ds = ds.batch(15000)
    
    def add_labels(arr, lab):
        return({"x": arr}, lab)
    
    ds = ds.map(add_labels)
    iterator = ds.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels

#next_batch = my_input_fn()

# with tf.Session() as sess:
#     first_batch = sess.run(next_batch)
# print(first_batch)

In [ ]:
lc = tf.estimator.LinearClassifier(feature_columns=feature_columns,
                                   model_dir="linearclassifier.tanh.dropout0.05.adam.ecoli",
                                   n_classes=len(replicons_list),
                                   optimizer=tf.train.FtrlOptimizer(
                                       learning_rate=0.1,
                                       l1_regularization_strength=0.001))

In [5]:
lc = tf.estimator.LinearClassifier(feature_columns=feature_columns,
                                   model_dir="linearclassifier.tanh.dropout0.05.adam.ecoli",
                                   n_classes=len(replicons_list),
                                   optimizer=tf.train.FtrlOptimizer(
                                       learning_rate=0.1,
                                       l1_regularization_strength=0.001))


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'linearclassifier.tanh.dropout0.05.adam.ecoli', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001D86397E908>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

In [6]:
lc.train(input_fn = my_input_fn)


INFO:tensorflow:Create CheckpointSaverHook.
Loading from pickle: training-files//Rm41.final.fasta
INFO:tensorflow:Saving checkpoints for 1 into linearclassifier.tanh.dropout0.05.adam.ecoli\model.ckpt.
INFO:tensorflow:loss = 33673.0, step = 1
INFO:tensorflow:global_step/sec: 1.32186
INFO:tensorflow:loss = 4.49904, step = 101 (75.652 sec)
INFO:tensorflow:global_step/sec: 1.34027
INFO:tensorflow:loss = 3.10247, step = 201 (74.687 sec)
INFO:tensorflow:global_step/sec: 1.347
INFO:tensorflow:loss = 2.33194, step = 301 (74.165 sec)
INFO:tensorflow:global_step/sec: 1.35791
INFO:tensorflow:loss = 1.91844, step = 401 (73.718 sec)
INFO:tensorflow:global_step/sec: 1.35614
INFO:tensorflow:loss = 1.60657, step = 501 (73.663 sec)
INFO:tensorflow:global_step/sec: 1.35749
INFO:tensorflow:loss = 1.37372, step = 601 (73.737 sec)
INFO:tensorflow:global_step/sec: 1.36311
INFO:tensorflow:loss = 1.21957, step = 701 (73.291 sec)
INFO:tensorflow:global_step/sec: 1.35807
INFO:tensorflow:loss = 13.6567, step = 801 (73.648 sec)
INFO:tensorflow:Saving checkpoints for 811 into linearclassifier.tanh.dropout0.05.adam.ecoli\model.ckpt.
INFO:tensorflow:global_step/sec: 1.3571
INFO:tensorflow:loss = 5.25562, step = 901 (73.673 sec)
INFO:tensorflow:global_step/sec: 1.36183
INFO:tensorflow:loss = 3.24315, step = 1001 (73.504 sec)
INFO:tensorflow:global_step/sec: 1.36352
INFO:tensorflow:loss = 33.614, step = 1101 (73.265 sec)
INFO:tensorflow:global_step/sec: 1.36129
INFO:tensorflow:loss = 12.5095, step = 1201 (73.534 sec)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-6-61f89a16a422> in <module>()
----> 1 lc.train(input_fn = my_input_fn)

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
    300 
    301     saving_listeners = _check_listeners_type(saving_listeners)
--> 302     loss = self._train_model(input_fn, hooks, saving_listeners)
    303     logging.info('Loss for final step: %s.', loss)
    304     return self

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
    781         loss = None
    782         while not mon_sess.should_stop():
--> 783           _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
    784       return loss
    785 

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
    519                           feed_dict=feed_dict,
    520                           options=options,
--> 521                           run_metadata=run_metadata)
    522 
    523   def should_stop(self):

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
    890                               feed_dict=feed_dict,
    891                               options=options,
--> 892                               run_metadata=run_metadata)
    893       except _PREEMPTION_ERRORS as e:
    894         logging.info('An error was raised. This may be due to a preemption in '

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, *args, **kwargs)
    950   def run(self, *args, **kwargs):
    951     try:
--> 952       return self._sess.run(*args, **kwargs)
    953     except _PREEMPTION_ERRORS:
    954       raise

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
   1022                                   feed_dict=feed_dict,
   1023                                   options=options,
-> 1024                                   run_metadata=run_metadata)
   1025 
   1026     for hook in self._hooks:

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, *args, **kwargs)
    825 
    826   def run(self, *args, **kwargs):
--> 827     return self._sess.run(*args, **kwargs)
    828 
    829 

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
    887     try:
    888       result = self._run(None, fetches, feed_dict, options_ptr,
--> 889                          run_metadata_ptr)
    890       if run_metadata:
    891         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1118     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1119       results = self._do_run(handle, final_targets, final_fetches,
-> 1120                              feed_dict_tensor, options, run_metadata)
   1121     else:
   1122       results = []

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1315     if handle is None:
   1316       return self._do_call(_run_fn, self._session, feeds, fetches, targets,
-> 1317                            options, run_metadata)
   1318     else:
   1319       return self._do_call(_prun_fn, self._session, handle, feeds, fetches)

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
   1321   def _do_call(self, fn, *args):
   1322     try:
-> 1323       return fn(*args)
   1324     except errors.OpError as e:
   1325       message = compat.as_text(e.message)

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1300           return tf_session.TF_Run(session, options,
   1301                                    feed_dict, fetch_list, target_list,
-> 1302                                    status, run_metadata)
   1303 
   1304     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

In [7]:


In [11]:
# DNN Classifier

dnn = tf.estimator.DNNClassifier(feature_columns=feature_columns,
                                hidden_units = [128, 32],
                                activation_fn=tf.nn.tanh,
                                dropout=0.05,
                                model_dir="classifier.tanh.dropout0.05.momentum.0.6.nesterov.ecoli",
                                n_classes=len(replicons_list),
                                optimizer=tf.train.MomentumOptimizer(
                                    momentum=0.6,
                                    learning_rate=0.01,
                                    use_nesterov=True))


# Have to know the names of the tensors to do this level of logging....
# Custom estimator would allow it....
# tensors_to_log = {"probabilities": "softmax_tensor"}
# logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=100)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'classifier.tanh.dropout0.05.momentum.0.6.nesterov.ecoli', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001D8EAEFF198>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

In [14]:
dnn.train(input_fn = my_input_fn)


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from classifier.tanh.dropout0.05.momentum.0.6.nesterov.ecoli\model.ckpt-790
Loading from pickle: training-files//T073.final.fasta
INFO:tensorflow:Saving checkpoints for 791 into classifier.tanh.dropout0.05.momentum.0.6.nesterov.ecoli\model.ckpt.
INFO:tensorflow:loss = 1.781e+08, step = 791
INFO:tensorflow:global_step/sec: 0.880747
INFO:tensorflow:loss = 0.0, step = 891 (113.569 sec)
INFO:tensorflow:global_step/sec: 0.864494
INFO:tensorflow:loss = 0.0, step = 991 (115.647 sec)
INFO:tensorflow:global_step/sec: 0.866545
INFO:tensorflow:loss = 0.0, step = 1091 (115.417 sec)
INFO:tensorflow:global_step/sec: 0.882302
INFO:tensorflow:loss = 0.0, step = 1191 (113.324 sec)
INFO:tensorflow:global_step/sec: 0.876112
INFO:tensorflow:loss = 1.61081e+07, step = 1291 (114.219 sec)
INFO:tensorflow:Saving checkpoints for 1315 into classifier.tanh.dropout0.05.momentum.0.6.nesterov.ecoli\model.ckpt.
INFO:tensorflow:global_step/sec: 0.861494
INFO:tensorflow:loss = 14660.8, step = 1391 (115.999 sec)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-14-4e8d3a4cd6f9> in <module>()
----> 1 dnn.train(input_fn = my_input_fn)

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
    300 
    301     saving_listeners = _check_listeners_type(saving_listeners)
--> 302     loss = self._train_model(input_fn, hooks, saving_listeners)
    303     logging.info('Loss for final step: %s.', loss)
    304     return self

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
    781         loss = None
    782         while not mon_sess.should_stop():
--> 783           _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
    784       return loss
    785 

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
    519                           feed_dict=feed_dict,
    520                           options=options,
--> 521                           run_metadata=run_metadata)
    522 
    523   def should_stop(self):

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
    890                               feed_dict=feed_dict,
    891                               options=options,
--> 892                               run_metadata=run_metadata)
    893       except _PREEMPTION_ERRORS as e:
    894         logging.info('An error was raised. This may be due to a preemption in '

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, *args, **kwargs)
    950   def run(self, *args, **kwargs):
    951     try:
--> 952       return self._sess.run(*args, **kwargs)
    953     except _PREEMPTION_ERRORS:
    954       raise

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
   1022                                   feed_dict=feed_dict,
   1023                                   options=options,
-> 1024                                   run_metadata=run_metadata)
   1025 
   1026     for hook in self._hooks:

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, *args, **kwargs)
    825 
    826   def run(self, *args, **kwargs):
--> 827     return self._sess.run(*args, **kwargs)
    828 
    829 

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
    887     try:
    888       result = self._run(None, fetches, feed_dict, options_ptr,
--> 889                          run_metadata_ptr)
    890       if run_metadata:
    891         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1118     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1119       results = self._do_run(handle, final_targets, final_fetches,
-> 1120                              feed_dict_tensor, options, run_metadata)
   1121     else:
   1122       results = []

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1315     if handle is None:
   1316       return self._do_call(_run_fn, self._session, feeds, fetches, targets,
-> 1317                            options, run_metadata)
   1318     else:
   1319       return self._do_call(_prun_fn, self._session, handle, feeds, fetches)

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
   1321   def _do_call(self, fn, *args):
   1322     try:
-> 1323       return fn(*args)
   1324     except errors.OpError as e:
   1325       message = compat.as_text(e.message)

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1300           return tf_session.TF_Run(session, options,
   1301                                    feed_dict, fetch_list, target_list,
-> 1302                                    status, run_metadata)
   1303 
   1304     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

In [14]:
def my_eval_fn():
    kmer_gen = functools.partial(kmer_generator, "training-files/", window_size)

    ds = tf.data.Dataset.from_generator(kmer_gen, 
                                        (tf.float32,
                                         tf.int64),
                                        (tf.TensorShape([15,128]),
                                         tf.TensorShape(None)))
    ds = ds.repeat(1)
    ds = ds.prefetch(4000)
    ds = ds.shuffle(buffer_size=2000)
    ds = ds.batch(1000)
    
    def add_labels(arr, lab):
        return({"x": arr}, lab)
    
    ds = ds.map(add_labels)
    iterator = ds.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels

tf.logging.set_verbosity(tf.logging.ERROR)
# NOTE: this evaluates with my_input_fn (the training pipeline); my_eval_fn above was presumably intended.
nn.evaluate(input_fn = my_input_fn, steps = 1000)


Loading from pickle: training-files//KH35c.final.fasta
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-14-bfa8cc9dd882> in <module>()
      1 tf.logging.set_verbosity(tf.logging.ERROR)
----> 2 nn.evaluate(input_fn = my_input_fn, steps = 1000)

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in evaluate(self, input_fn, steps, hooks, checkpoint_path, name)
    353         hooks=hooks,
    354         checkpoint_path=checkpoint_path,
--> 355         name=name)
    356 
    357   def _convert_eval_steps_to_hooks(self, steps):

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in _evaluate_model(self, input_fn, hooks, checkpoint_path, name)
    837           final_ops=eval_dict,
    838           hooks=all_hooks,
--> 839           config=self._session_config)
    840 
    841       _write_dict_to_summary(

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\evaluation.py in _evaluate_once(checkpoint_path, master, scaffold, eval_ops, feed_dict, final_ops, final_ops_feed_dict, hooks, config)
    204     if eval_ops is not None:
    205       while not session.should_stop():
--> 206         session.run(eval_ops, feed_dict)
    207 
    208   logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
    519                           feed_dict=feed_dict,
    520                           options=options,
--> 521                           run_metadata=run_metadata)
    522 
    523   def should_stop(self):

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
    890                               feed_dict=feed_dict,
    891                               options=options,
--> 892                               run_metadata=run_metadata)
    893       except _PREEMPTION_ERRORS as e:
    894         logging.info('An error was raised. This may be due to a preemption in '

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, *args, **kwargs)
    950   def run(self, *args, **kwargs):
    951     try:
--> 952       return self._sess.run(*args, **kwargs)
    953     except _PREEMPTION_ERRORS:
    954       raise

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
   1022                                   feed_dict=feed_dict,
   1023                                   options=options,
-> 1024                                   run_metadata=run_metadata)
   1025 
   1026     for hook in self._hooks:

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, *args, **kwargs)
    825 
    826   def run(self, *args, **kwargs):
--> 827     return self._sess.run(*args, **kwargs)
    828 
    829 

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
    887     try:
    888       result = self._run(None, fetches, feed_dict, options_ptr,
--> 889                          run_metadata_ptr)
    890       if run_metadata:
    891         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1118     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1119       results = self._do_run(handle, final_targets, final_fetches,
-> 1120                              feed_dict_tensor, options, run_metadata)
   1121     else:
   1122       results = []

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1315     if handle is None:
   1316       return self._do_call(_run_fn, self._session, feeds, fetches, targets,
-> 1317                            options, run_metadata)
   1318     else:
   1319       return self._do_call(_prun_fn, self._session, handle, feeds, fetches)

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
   1321   def _do_call(self, fn, *args):
   1322     try:
-> 1323       return fn(*args)
   1324     except errors.OpError as e:
   1325       message = compat.as_text(e.message)

G:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1300           return tf_session.TF_Run(session, options,
   1301                                    feed_dict, fetch_list, target_list,
-> 1302                                    status, run_metadata)
   1303 
   1304     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

In [ ]:


In [31]:
# next(kmer_gen)

# value = ds.make_one_shot_iterator().get_next()
# sess.run(value) # It works!

# filegen = training_file_generator("training-files/")
# training_data = load_fasta(filegen())
# a = gen_training_data_generator(training_data, window_size, repdict)

# np.shape(next(a)[1])

# convert_to_kmer(reverse_kmer_dict[a[0][0][0]])

In [7]:
embeddings = np.load("final_embeddings.npy")

def get_kmer_embeddings(kmers):
    # Redefined: flatten the 15 x 128 embedding window into a single 1920-d vector
    # to match the shape=[1920] feature column used by the DNN below.
    a = list() # numpy.empty(128 * 15)
    for k in kmers:
        a.append(embeddings[k])
    return np.hstack(a)

# training_data = load_fasta(filegen())
# get_kmer_embeddings(tbatch[0][0])

# tf.convert_to_tensor([tf.convert_to_tensor(get_kmer_embeddings(tbatch[0][0]))])

In [8]:
feature_columns = [tf.feature_column.numeric_column("x", shape=[1920])]
nn = tf.estimator.DNNClassifier(feature_columns=feature_columns,
                                hidden_units = [5000,1000,100,50],
                                activation_fn=tf.nn.relu,
                                dropout=0.2,
                                n_classes=len(replicons_list),
                                optimizer="Adam")


INFO:tensorflow:Using default config.
WARNING:tensorflow:Using temporary folder as model directory: C:\Users\Joey\AppData\Local\Temp\tmpyxwxw13w
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Joey\\AppData\\Local\\Temp\\tmpyxwxw13w', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}

In [ ]:
tf.logging.set_verbosity(tf.logging.ERROR)

filegen = training_file_generator("training-files/")

for i in xrange(10):
    training_data = load_fasta(filegen())
    tfn = functools.partial(train_input_fn, training_data)
    nn.train(input_fn=tfn, steps=100000)


Loading from pickle: training-files//SM11.final.fasta
Loading from pickle: training-files//KH35c.final.fasta
Loading from pickle: training-files//USDA1157.final.fasta
Loading from pickle: training-files//HM006.final.fasta
Loading from pickle: training-files//Reference.final.fasta

In [ ]:
for i in xrange(10):
    filegen = training_file_generator("training-files/")
    training_data = load_fasta(filegen())
    tf.logging.set_verbosity(tf.logging.ERROR)
    tfn = functools.partial(train_input_fn, training_data)
    accuracy_score = nn.evaluate(input_fn=tfn, steps=10000)
    print("\nTest Accuracy: {0:f}\n".format(accuracy_score['accuracy']))
    print("Average Loss: {0:f}".format(accuracy_score['average_loss']))

In [37]:
train_input_fn_raw(training_data)


Out[37]:
({'x': array([-0.01598681,  0.08780892,  0.05043568, ...,  0.03577497,
          0.03044195,  0.04390863], dtype=float32)}, [2])

In [38]:
replicons_list


Out[38]:
['Emel_M2_Main',
 'Emel_M2_Uni1',
 'Emel_M2_Uni2',
 'Emel_M2_Uni3',
 'Emel_WSM419_Main',
 'Emel_WSM419_Uni1',
 'Emel_WSM419_Uni11',
 'Emel_WSM419_Uni2',
 'HM006_Accessory_A',
 'KH35c_Accessory_A',
 'M162_Accessory_A',
 'M270_Accessory_A',
 'M270_Accessory_B',
 'M270_Accessory_C',
 'Main',
 'Rm41_Accessory_A',
 'T073_Accessory_A',
 'USDA1021_Accessory_A',
 'USDA1157_Accessory_A',
 'pHRB800',
 'pHRC017',
 'pRmeGR4a',
 'pRmeGR4b',
 'pSINME01',
 'pSINME02',
 'pSMED03_WSM419',
 'pSymA',
 'pSymB']

In [ ]: