In [1]:
import functools

import numpy as np

k = 9

def convert_base5(n):
    # Map a base-5 digit character back to its nucleotide.
    return {"0": "N", "1": "A", "2": "C", "3": "T", "4": "G"}.get(n, "N")

def convert_nt(c):
    # Map a nucleotide to its base-5 digit (unknown bases fall back to N/0).
    return {"N": 0, "A": 1, "C": 2, "T": 3, "G": 4}.get(c, 0)

def convert_nt_complement(c):
    return {"N": 0, "A": 3, "C": 4, "T": 1, "G": 2}.get(c, 0)

def convert_kmer_to_int(kmer):
    # Encode a k-mer as the integer value of its base-5 digit string.
    return int(''.join(str(x) for x in map(convert_nt, kmer)), 5)

def kmer_processor(seq, offset):
    return list(map(convert_kmer_to_int, get_kmers(k)(seq[offset:])))

def partition(n, step, coll):
    # Yield successive windows of length n, advancing by step; drop any ragged tail.
    for i in range(0, len(coll), step):
        if i + n > len(coll):
            break  # raise StopIteration...
        yield coll[i:i + n]

def get_kmers(k):
    return lambda sequence: partition(k, k, sequence)

def get_kmers_from_seq(sequence):
    # Collect k-mer integer lists for every reading frame of the sequence
    # and of its reversed copy.
    kmers_from_seq = list()

    kp = functools.partial(kmer_processor, sequence)
    for i in map(kp, range(0, k)):
        kmers_from_seq.append(i)

    rev = sequence[::-1]
    kpr = functools.partial(kmer_processor, rev)
    for i in map(kpr, range(0, k)):
        kmers_from_seq.append(i)

    # for i in range(0, k):
    #     kmers_from_seq.append(kmer_processor(sequence, i))
    # for i in range(0, k):
    #     kmers_from_seq.append(kmer_processor(rev, i))
    return kmers_from_seq
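In [ ]:
# Illustrative sanity check (not run here): the helpers above cut a sequence into
# non-overlapping 9-mers and encode each as a base-5 integer (N=0, A=1, C=2, T=3,
# G=4); e.g. the 3-mer "ATG" would encode as int("134", 5) == 1*25 + 3*5 + 4 == 44.
demo_seq = "ATGACGATCATGACGATC"
print(list(get_kmers(k)(demo_seq)))                               # two 9-mers
print([convert_kmer_to_int(s) for s in get_kmers(k)(demo_seq)])   # their base-5 integer ids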
In [2]:
with open("Glove/vocab.txt", 'r') as f:
words = [x.rstrip().split(' ')[0] for x in f.readlines()]
with open("Glove/vectors.txt", 'r') as f:
vectors = {}
for line in f:
vals = line.rstrip().split(' ')
vectors[vals[0]] = [float(x) for x in vals[1:]]
vocab_size = len(words)
vocab = {w: idx for idx, w in enumerate(words)}
ivocab = {idx: w for idx, w in enumerate(words)}
vector_dim = len(vectors[ivocab[0]])
W = np.zeros((vocab_size, vector_dim))
for word, v in vectors.items():
if word == '<unk>':
continue
W[vocab[word], :] = v
# normalize each word vector to unit variance
W_norm = np.zeros(W.shape)
d = (np.sum(W ** 2, 1) ** (0.5))
W_norm = (W.T / d).T
def convert_to_kmer(kmer):
return ''.join(map(convert_base5, str(np.base_repr(kmer, 5))))
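In [ ]:
# Illustrative round trip (not run here): convert_to_kmer inverts convert_kmer_to_int,
# so encoding a 9-mer and decoding it should return the original string.
# (A leading 'N' would be lost, since its leading base-5 zero is dropped.)
convert_to_kmer(convert_kmer_to_int("ATGACGATC"))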
In [3]:
vocab['1008064']
Out[3]:
In [ ]:
In [ ]:
In [4]:
def distance(W, vocab, ivocab, input_term):
    for idx, term in enumerate(input_term.split(' ')):
        if term in vocab:
            print('Word: %s Position in vocabulary: %i' % (convert_to_kmer(int(term)), vocab[term]))
            if idx == 0:
                vec_result = np.copy(W[vocab[term], :])
            else:
                vec_result += W[vocab[term], :]
        else:
            print('Word: %s Out of dictionary!\n' % (convert_to_kmer(int(term))))
            return

    vec_norm = np.zeros(vec_result.shape)
    d = (np.sum(vec_result ** 2,) ** (0.5))
    vec_norm = (vec_result.T / d).T

    dist = np.dot(W, vec_norm.T)

    for term in input_term.split(' '):
        index = vocab[term]
        dist[index] = -np.Inf

    a = np.argsort(-dist)[:100]

    print("\n Word Cosine distance\n")
    print("---------------------------------------------------------\n")
    for x in a:
        print("%35s\t\t%f\n" % (convert_to_kmer(int(ivocab[x])), dist[x]))
In [5]:
distance(W_norm, vocab, ivocab, str(convert_kmer_to_int("ATGACGATC")))
In [6]:
# np.save("glove_embeddings.np", W_norm)
In [7]:
convert_to_kmer(1008064)
Out[7]:
In [8]:
# GCGGCGATC
In [9]:
## TensorFlow model
# Embedding vector length is 256
# Input is a window of 15 k-mers (can be altered)
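In [ ]:
# Quick check of the sizes implied above: a window_size of 7 gives 2*7 + 1 = 15
# k-mers per training example, and with 256-d GloVe vectors the flattened input
# to the dense layers is 15 * 256 = 3840 wide, matching the reshape in
# cnn_model_fn below.
print(2 * 7 + 1, (2 * 7 + 1) * 256)   # 15 k-mers, 3840 input features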
In [10]:
import tensorflow as tf
import functools
from functools import partial
import os.path
import Bio
from Bio import SeqIO
import random
from random import shuffle
import ntpath
import pickle
import sys
In [11]:
embedding_dim = 256
# sess = tf.Session()
sess = tf.InteractiveSession()
# Weights = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]), trainable=False, name="Weights")
# embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])
# embedding_init = Weights.assign(embedding_placeholder)
# sess.run(embedding_init, feed_dict={embedding_placeholder: W_norm})
In [12]:
def load_fasta(filename):
    # tf.summary.text("File", tf.as_string(filename))
    data = dict()
    file_base_name = ntpath.basename(filename)
    picklefilename = file_base_name + ".picklepickle"
    if os.path.isfile(picklefilename):
        print("Loading from pickle: " + filename)
        data = pickle.load(open(picklefilename, "rb"))
    else:
        print("Pickle not found, generating k-mers for: " + picklefilename)
        for seq_record in SeqIO.parse(filename, "fasta"):
            data.update({seq_record.id:
                         get_kmers_from_seq(seq_record.seq.upper())})
        pickle.dump(data, open(picklefilename, "wb"))
    sys.stdout.flush()
    return(data)
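In [ ]:
# Hypothetical usage (the file name is a placeholder, not from the original run):
# the first call parses the FASTA and writes <name>.picklepickle next to the
# notebook; later calls load the cached k-mer lists keyed by record id.
# data = load_fasta("training-files/example.fasta")
# list(data.keys())[:5]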
In [52]:
def gen_training_data_generator(input_data, window_size, repdict):
    # For each record, slide a window of 2 * window_size + 1 k-mers over every
    # frame and yield (vocab indices of the window, [integer label of the record]).
    for k in input_data.keys():
        for kdata in input_data[k]:
            for i in range(window_size + 1, len(kdata) - window_size):
                kentry = list()
                for x in range(i - window_size - 1, i + window_size):
                    kentry.append(vocab[str(kdata[x])])
                yield(kentry, [repdict[k]])

def get_categories(directory):
    # Sorted list of all record ids (replicons) found in the FASTA files of a directory.
    data = list()
    files = os.listdir(directory)
    for filename in files:
        for seq_record in SeqIO.parse(directory + "/" + filename, "fasta"):
            data.append(seq_record.id)
    data = sorted(list(set(data)))
    return(data)

replicons_list = get_categories("training-files/")

def kmer_generator(directory, window_size):
    files = [directory + "/" + f for f in os.listdir(directory)]
    random.shuffle(files)
    replicons_list = get_categories("training-files/")
    repdict = dict()
    a = 0
    for i in replicons_list:
        repdict[i] = a
        a += 1
    for f in files:
        yield from gen_training_data_generator(load_fasta(f), window_size, repdict)

def input_fn():
    kmer_gen = functools.partial(kmer_generator, "training-files/", 7)
    # k-mer ids come back as float32 here; the model_fn casts them back to int64
    # before the embedding lookup.
    ds = tf.data.Dataset.from_generator(kmer_gen,
                                        (tf.float32,
                                         tf.int64),
                                        (tf.TensorShape([15]),
                                         tf.TensorShape(None)))
    # # Numbers reduced to run on my desktop
    # ds = ds.repeat(5)
    # ds = ds.prefetch(5000)  # Each batch is only 2048, so prefetch 5000
    # ds = ds.shuffle(buffer_size=1000000)  # Large buffer size for better randomization
    # ds = ds.batch(2048)  # Reduced from 5000 so it runs quicker
    # ds = ds.repeat(1)
    # ds = ds.prefetch(2)
    # ds = ds.shuffle(buffer_size=500)
    ds = ds.batch(20)

    def add_labels(arr, lab):
        return({"kmers": arr}, lab)

    ds = ds.map(add_labels)
    iterator = ds.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels

def init():
    # Redefined later (In [27]) with the (shape, dtype, partition_info) signature
    # that embedding_column passes to its initializer.
    return W

replicons_fc = tf.feature_column.categorical_column_with_vocabulary_list(
    key='label',
    vocabulary_list=replicons_list)

kmers_fc = tf.feature_column.numeric_column(key="kmers", shape=15, dtype=tf.int64)

# kmers_dict = tf.feature_column.categorical_column_with_vocabulary_list(
#     key="kmers",
#     shape=15,
#     vocabulary_list=vocab.keys())
# kmers_fc_embed = tf.feature_column.embedding_column(
#     categorical_column=kmers_dict,
#     dimension=256,
#     initializer=init,
#     trainable=False)
In [ ]:
In [41]:
kmer_gen = functools.partial(kmer_generator, "training-files/", 7)
next(kmer_gen())
Out[41]:
In [15]:
# sess = tf.InteractiveSession()
v = input_fn()
#v[0]["kmers"].eval()
a = tf.feature_column.input_layer(v[0], [kmers_fc])
a = tf.Print(a, [a], message="This is a: ")
a.eval()
Out[15]:
In [16]:
#v = input_fn()
# v[0]["kmers"]
a = tf.feature_column.input_layer(v[0], kmers_fc)
a = tf.cast(a, tf.int64)
a.eval()
Out[16]:
In [17]:
# ks = tf.feature_column.input_layer(v[0], [kmers_fc])
embedding_dim = 256
Weights = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]), trainable=False, name="Weights")
embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])
embedding_init = Weights.assign(embedding_placeholder)
sess.run(embedding_init, feed_dict={embedding_placeholder: W_norm})
a = tf.feature_column.input_layer(v[0], kmers_fc)
a = tf.cast(a, tf.int64)
words = tf.nn.embedding_lookup(Weights, a)
b = tf.Print(words, [words])
b.eval()
Out[17]:
In [25]:
tf.shape(b).eval()
Out[25]:
In [ ]:
vectors[ivocab[5]]  # vectors is keyed by the word string, not by index
In [89]:
def cnn_model_fn(features, labels, mode):
    if mode == tf.estimator.ModeKeys.PREDICT:
        tf.logging.info("my_model_fn: PREDICT, {}".format(mode))
    elif mode == tf.estimator.ModeKeys.EVAL:
        tf.logging.info("my_model_fn: EVAL, {}".format(mode))
    elif mode == tf.estimator.ModeKeys.TRAIN:
        tf.logging.info("my_model_fn: TRAIN, {}".format(mode))

    # Frozen embedding table, filled with the pretrained GloVe matrix.
    Weights = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]), trainable=False, name="Weights")
    embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])
    embedding_init = Weights.assign(embedding_placeholder)

    def init_fn(scaffold, sess):
        sess.run(Weights.initializer, {Weights.initial_value: W})

    scaffold = tf.train.Scaffold(init_fn=init_fn)

    inputs = tf.feature_column.input_layer(features, [kmers_fc])
    input_i64 = tf.cast(inputs, tf.int64)
    embedded_kmers = tf.nn.embedding_lookup(Weights, input_i64)
    # input_layer = tf.reshape(embedded_kmers, [-1, 15, 256, 1])
    input_layer = tf.reshape(embedded_kmers, [-1, 3840])  # 15 k-mers * 256 dims
    input_layer = tf.cast(input_layer, tf.float32)

    # conv1 = tf.layers.conv2d(inputs=input_layer,
    #                          filters=32,
    #                          kernel_size=[-1, 2, 256],
    #                          strides=3,
    #                          padding="same",
    #                          name="Conv1",
    #                          activation=None)
    # avg_pool1 = tf.layers.average_pooling2d(conv1,
    #                                         pool_size=[-1, 4, 32],
    #                                         strides=[-1, 2, 16],
    #                                         padding="same",
    #                                         name="AvgPooling_1")
    # 29 is number of replicons
    # print(tf.shape(avg_pool1))
    # logits = tf.layers.dense(inputs=avg_pool1, units=len(replicons_list))

    h1 = tf.layers.Dense(1000, activation=tf.nn.relu)(input_layer)
    h2 = tf.layers.Dense(500, activation=tf.nn.relu)(h1)
    logits = tf.layers.Dense(29)(h2)  # 29 == len(replicons_list)

    predictions = {
        "class_ids": tf.argmax(input=logits, axis=1)
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                  logits=logits)
    accuracy = tf.metrics.accuracy(labels, predictions['class_ids'])

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            eval_metric_ops={'my_accuracy': accuracy})

    # If mode is not PREDICT or EVAL, then we must be in TRAIN
    assert mode == tf.estimator.ModeKeys.TRAIN, "TRAIN is only ModeKey left"

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(
        loss=loss,
        global_step=tf.train.get_global_step())
    tf.summary.scalar('my_accuracy', accuracy[1])
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    # eval_metric_ops = {
    #     "accuracy": tf.metrics.accuracy(
    #         labels=labels, predictions=predictions["classes"])}
    # return tf.estimator.EstimatorSpec(
    #     mode=mode,
    #     loss=loss,
    #     eval_metric_ops=eval_metric_ops)
In [90]:
len(replicons_list)
Out[90]:
In [ ]:
classifier = tf.estimator.Estimator(
    model_fn=cnn_model_fn,
    model_dir="classifier_glove_cnn4.2",
    config=tf.contrib.learn.RunConfig(
        save_checkpoints_steps=10,
        save_checkpoints_secs=None,
        save_summary_steps=5))

classifier.train(input_fn=input_fn, steps=10000)
classifier.evaluate(input_fn=input_fn, steps=1000)
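In [ ]:
# Possible follow-up (sketch, not run here): the trained estimator can also emit
# per-window class predictions through the same input_fn; cnn_model_fn returns
# "class_ids" in PREDICT mode.
# preds = classifier.predict(input_fn=input_fn)
# [p["class_ids"] for _, p in zip(range(5), preds)]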
In [ ]:
# Ignore below for now...
In [18]:
def main(unused_argv):
    classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir="classifier_glove_cnn4",
        config=tf.contrib.learn.RunConfig(
            save_checkpoints_steps=10,
            save_checkpoints_secs=None,
            save_summary_steps=5))
    classifier.train(input_fn=input_fn, steps=10)
    # eval_results = classifier.evaluate(input_fn=my_input_fn, steps=10)
    # print(eval_results)
In [ ]:
if __name__ == "__main__":
    tf.app.run()
In [ ]:
In [32]:
# Using pre-made models starts down here....
In [28]:
kmers_fc = tf.feature_column.numeric_column(key="kmers", shape=15, dtype=tf.int64)

kmers_dict = tf.feature_column.categorical_column_with_vocabulary_list(
    key="kmers",
    vocabulary_list=list(map(int, vocab.keys())))

# One categorical column per k-mer position in the 15-k-mer window, each paired
# with a frozen 256-d embedding column initialised from the GloVe matrix via init.
kmer_vocab_ints = list(map(int, vocab.keys()))

kmers_dicts = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        key="kmer_%d" % i,
        vocabulary_list=kmer_vocab_ints)
    for i in range(1, 16)]

kmers_fc_embeds = [
    tf.feature_column.embedding_column(
        categorical_column=col,
        dimension=256,
        initializer=init,
        trainable=False)
    for col in kmers_dicts]
In [29]:
#kmers_dict
In [30]:
columns = kmers_fc_embeds  # the 15 frozen per-position embedding columns from In [28]
In [31]:
estimator = tf.estimator.DNNClassifier(feature_columns=columns,
                                       hidden_units=[1024, 512, 256])
In [32]:
estimator.train(input_fn=input_fn_new, steps=10)
In [ ]:
In [ ]:
In [23]:
def input_fn_new():
    kmer_gen = functools.partial(kmer_generator, "training-files/", 7)
    ds = tf.data.Dataset.from_generator(kmer_gen,
                                        (tf.int64,
                                         tf.int64),
                                        (tf.TensorShape([15]),
                                         tf.TensorShape(None)))
    # # Numbers reduced to run on my desktop
    # ds = ds.repeat(5)
    # ds = ds.prefetch(5000)  # Each batch is only 2048, so prefetch 5000
    # ds = ds.shuffle(buffer_size=1000000)  # Large buffer size for better randomization
    # ds = ds.batch(2048)  # Reduced from 5000 so it runs quicker
    # ds = ds.repeat(1)
    # ds = ds.prefetch(2)
    # ds = ds.shuffle(buffer_size=500)

    def add_labels(arr, lab):
        # Split the 15-k-mer window into one scalar feature per position.
        return({"kmer_1": arr[0],
                "kmer_2": arr[1],
                "kmer_3": arr[2],
                "kmer_4": arr[3],
                "kmer_5": arr[4],
                "kmer_6": arr[5],
                "kmer_7": arr[6],
                "kmer_8": arr[7],
                "kmer_9": arr[8],
                "kmer_10": arr[9],
                "kmer_11": arr[10],
                "kmer_12": arr[11],
                "kmer_13": arr[12],
                "kmer_14": arr[13],
                "kmer_15": arr[14]}, lab)

    # Map before batching so each kmer_N feature is a scalar per example,
    # then batch for the estimator.
    ds = ds.map(add_labels)
    ds = ds.batch(1)
    iterator = ds.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels
In [24]:
def add_labels(arr, lab):
    return({"kmer_1": arr[0],
            "kmer_2": arr[1],
            "kmer_3": arr[2],
            "kmer_4": arr[3],
            "kmer_5": arr[4],
            "kmer_6": arr[5],
            "kmer_7": arr[6],
            "kmer_8": arr[7],
            "kmer_9": arr[8],
            "kmer_10": arr[9],
            "kmer_11": arr[10],
            "kmer_12": arr[11],
            "kmer_13": arr[12],
            "kmer_14": arr[13],
            "kmer_15": arr[14]}, lab)

ds = tf.data.Dataset.from_generator(kmer_gen,
                                    (tf.int64,
                                     tf.int64),
                                    (tf.TensorShape([15]),
                                     tf.TensorShape(None)))
ds = ds.map(add_labels)
In [25]:
iterator = ds.make_one_shot_iterator()
In [26]:
batch_features, batch_labels = iterator.get_next()
In [27]:
def init(shape=None,
         dtype=None,
         partition_info=None):
    # Initializer handed to embedding_column: ignore the requested shape/dtype
    # and return the pretrained GloVe matrix W ([vocab_size, 256]).
    return W
In [ ]:
# Previous classifier
def cnn_model_fn(features, labels, mode):
    """Model for CNN"""
    Weights = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]), trainable=False, name="Weights")
    embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])
    embedding_init = Weights.assign(embedding_placeholder)

    def init_fn(scaffold, sess):
        sess.run(Weights.initializer, {Weights.initial_value: W})

    scaffold = tf.train.Scaffold(init_fn=init_fn)

    inputs = tf.feature_column.input_layer(features, [kmers_fc])
    input_i64 = tf.cast(inputs, tf.int64)
    embedded_kmers = tf.nn.embedding_lookup(Weights, input_i64)
    input_layer = tf.reshape(embedded_kmers, [-1, 15, 256, 1])
    input_layer = tf.cast(input_layer, tf.float32)

    conv1 = tf.layers.conv2d(inputs=input_layer,
                             filters=32,
                             kernel_size=[2, 256],
                             strides=3,
                             padding="same",
                             name="Conv1",
                             activation=None)
    avg_pool1 = tf.layers.average_pooling2d(conv1,
                                            pool_size=[4, 32],
                                            strides=[2, 16],
                                            padding="same",
                                            name="AvgPooling_1")
    logits = tf.layers.dense(inputs=avg_pool1, units=len(replicons_list))

    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")}

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32),
                               depth=len(replicons_list))

    correct_prediction = tf.equal(tf.argmax(logits, 1), labels)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float64))
    tf.summary.scalar("Accuracy", accuracy)

    # labels = tf.squeeze(labels, 1)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                  logits=logits)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(
            labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        eval_metric_ops=eval_metric_ops)