In [1]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.datasets.data_utils import get_file
import numpy as np
import random
import sys
from Bio import SeqIO
from sklearn.utils import resample
from sklearn.preprocessing import LabelBinarizer
import custom_funcs as cf
import pandas as pd
In [2]:
# Read in the protease inhibitor resistance data (8 drug columns + sequence features)
data, drug_cols, feat_cols = cf.read_data('hiv-protease-data.csv', n_data_cols=8)
print(len(data))
# Read in the consensus (wild-type) protease sequence
consensus_map = cf.read_consensus('hiv-protease-consensus.fasta')
# Clean the data: normalize mutation columns against the consensus map
data = cf.clean_data(data, feat_cols, consensus_map)
# Drop sequences containing ambiguous residues, then rows missing any feature value.
# Assignment instead of inplace=True keeps this cell idempotent on re-run.
data = cf.drop_ambiguous_sequences(data, feat_cols)
data = data.dropna(subset=feat_cols)
data.head()
Out[2]:
In [3]:
# Audience choice: Which drug would you like?
# Show the available drug columns so the reader can pick one.
print(drug_cols)  # fixed: was `drug_colscols`, a typo that raised NameError
DRUG = 'FPV'
In [5]:
# Vectorization parameters: slide a fixed-width window over each
# amino-acid sequence, advancing by a fixed stride each step.
chunk_size = 20  # residues per window
jump_size = 5    # stride between successive windows
motifs = []      # extracted windows accumulate here
In [45]:
# Assemble a 2-layer stacked LSTM character model with dropout,
# ending in a softmax over the character vocabulary.
# NOTE(review): `maxlen` and `chars` are defined in a LATER cell (In[19]) —
# this notebook relies on out-of-order execution; reorder cells to fix.
print('Build model...')
model = Sequential()
stack = [
    LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))),
    Dropout(0.2),
    LSTM(512, return_sequences=False),
    Dropout(0.2),
    Dense(len(chars)),
    Activation('softmax'),
]
for layer in stack:
    model.add(layer)
Out[45]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [19]:
'''
Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# Use a context manager so the file handle is closed (was open(path).read()).
with open(path) as f:
    text = f.read().lower()
print('corpus length:', len(text))
# Sort the vocabulary: a bare set() has nondeterministic iteration order,
# so char <-> index mappings would differ between runs / saved models.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 20
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
print('Vectorization...')
# One-hot encode: X is (n_sequences, maxlen, n_chars), y is (n_sequences, n_chars).
# dtype=bool (not np.bool): the np.bool alias was removed in numpy >= 1.24.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
In [16]:
# Spot-check: the 11th training window (20 lowercased characters)
sentences[10]
Out[16]:
In [48]:
# ...and the character that immediately follows it (the prediction target)
next_chars[10]
Out[48]:
In [4]:
# Sanity-check the one-hot tensor shape: (n_sequences, maxlen, n_chars)
X.shape
Out[4]:
In [6]:
# NOTE(review): dumps the entire char->index dict; consider a small slice instead
char_indices
Out[6]:
In [ ]: