In [1]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.datasets.data_utils import get_file
import numpy as np
import random
import sys

from Bio import SeqIO
from sklearn.utils import resample
from sklearn.preprocessing import LabelBinarizer

import custom_funcs as cf
import pandas as pd


Using gpu device 0: GRID K520 (CNMeM is disabled)
/home/ubuntu/anaconda3/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
Using Theano backend.

In [2]:
# Read in the protease inhibitor data
data, drug_cols, feat_cols = cf.read_data('hiv-protease-data.csv', n_data_cols=8)
print(len(data))
# Read in the consensus data
consensus_map = cf.read_consensus('hiv-protease-consensus.fasta')

# Clean the data
data = cf.clean_data(data, feat_cols, consensus_map)

# Identify feature columns
data = cf.drop_ambiguous_sequences(data, feat_cols)
data.dropna(inplace=True, subset=feat_cols)
data.head()


1808
Out[2]:
FPV ATV IDV LPV NFV SQV TPV DRV P1 P2 ... P90 P91 P92 P93 P94 P95 P96 P97 P98 P99
SeqID
4432 1.5 NaN 1.0 NaN 2.2 1.1 NaN NaN P Q ... L T Q I G C T L N F
4664 3.1 NaN 8.7 NaN 32.0 16.9 NaN NaN P Q ... M T Q I G C T L N F
5221 NaN NaN 0.8 0.8 1.2 0.7 NaN NaN P Q ... L T Q I G C T L N F
5279 8.3 79 16.0 12.0 600.0 1000.0 NaN NaN P Q ... M T Q I G C T L N F
5444 2.7 21 24.0 6.1 42.0 132.0 NaN NaN P Q ... M T Q I G C T L N F

5 rows × 107 columns
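
`custom_funcs` isn't shown in this notebook, so as a rough sketch of what `drop_ambiguous_sequences` might be doing, assuming ambiguous positions are recorded as multi-letter mixtures (e.g. 'IV') or 'X' (a guess at the helper's logic, not the actual implementation):

def drop_ambiguous_sequences(data, feat_cols):
    # Hypothetical reimplementation: keep only rows where every sequence
    # position holds exactly one unambiguous amino-acid letter; NaNs pass
    # through for the dropna() call that follows.
    def unambiguous(row):
        return all(pd.isnull(aa) or (isinstance(aa, str) and len(aa) == 1 and aa != 'X')
                   for aa in row)
    return data[data[feat_cols].apply(unambiguous, axis=1)]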


In [3]:
# Audience choice: Which drug would you like?
print(drug_cols)

DRUG = 'FPV'


Index(['FPV', 'ATV', 'IDV', 'LPV', 'NFV', 'SQV', 'TPV', 'DRV'], dtype='object')

In [5]:
# Parameters for cutting sequences into chunks of 20 a.a.

chunk_size = 20
motifs = []
jump_size = 5
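
The chunking itself never made it into this cell. A minimal sketch of what it might look like, assuming each row of data[feat_cols] is joined into a single amino-acid string (only chunk_size, motifs, and jump_size come from the cell above; the rest is an assumption):

# Slide a window of chunk_size residues along each sequence,
# advancing jump_size positions at a time.
for seq in data[feat_cols].apply(''.join, axis=1):
    for start in range(0, len(seq) - chunk_size + 1, jump_size):
        motifs.append(seq[start:start + chunk_size])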

In [45]:
# build the model: 2 stacked LSTM layers
print('Build model...')
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))


Out[45]:
SeqID
59258     1.131402
117142    3.737670
27084     2.484907
205640    2.833213
60106     0.262364
81797    -0.105361
68353    -0.693147
54411     1.526056
56022    -0.223144
197096   -0.105361
13239    -2.302585
28215     0.095310
61105     0.182322
79370    -0.693147
148009    0.336472
45122    -1.203973
45065    -1.609438
143455    2.197225
46209    -0.510826
187169    0.000000
172136    1.887070
51225     0.641854
75112    -0.510826
109410   -0.105361
61147    -0.916291
7119      0.336472
116511    2.772589
46705    -0.693147
90010     1.648659
29045    -0.916291
            ...   
45080    -0.105361
68992     0.955511
197088   -0.223144
187195   -0.510826
216661   -0.916291
257947   -0.510826
41597    -0.916291
117142    3.737670
12650    -0.693147
54411     1.526056
13242    -1.609438
54170     3.437208
98603     5.991465
54412     2.890372
45040     0.000000
116501   -0.510826
45096     0.000000
75114    -0.916291
60104    -0.510826
56478    -0.223144
66640     0.095310
77379     3.912023
81843     1.667707
54398     1.064711
61151    -0.510826
60102     3.401197
27465     0.336472
257923    0.095310
29045    -0.916291
197092   -0.223144
Name: FPV, dtype: float64
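
Two notes on this cell. First, maxlen and chars are defined further down in In [19], which ran earlier (execution count 19 < 45), so the cell executes cleanly despite the document order. Second, the Out[45] series above looks like natural-log-transformed FPV fold-resistance values (e.g. ln 0.5 ≈ -0.693), presumably left over from an earlier revision of the cell rather than output of the model build, whose calls all return None. The network is also never compiled or trained here; a minimal sketch of that step, following the classic Keras text-generation example this notebook borrows from (batch size is an assumption; 20 epochs follows the docstring below):

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
# nb_epoch is the Keras 0.x/1.x keyword; assumed values, tune as needed.
model.fit(X, y, batch_size=128, nb_epoch=20)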

In [19]:
'''
    Example script to generate text from Nietzsche's writings.
    At least 20 epochs are required before the generated text
    starts sounding coherent.
    It is recommended to run this script on GPU, as recurrent
    networks are quite computationally intensive.
    If you try this script on new data, make sure your corpus
    has at least ~100k characters. ~1M is better.
'''

path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read().lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 20
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


corpus length: 600893
total chars: 57
nb sequences: 200291
Vectorization...

In [16]:
sentences[10]


Out[16]:
' is a woman--what th'

In [48]:
next_chars[10]


Out[48]:
'e'

In [4]:
X.shape


Out[4]:
(200291, 20, 57)
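
The three axes line up with the vectorization parameters above: 200,291 windows, each maxlen = 20 characters long, one-hot encoded over the 57-character alphabet.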

In [6]:
char_indices


Out[6]:
{'\n': 16,
 ' ': 22,
 '!': 14,
 '"': 39,
 "'": 13,
 '(': 47,
 ')': 17,
 ',': 9,
 '-': 36,
 '.': 1,
 '0': 45,
 '1': 27,
 '2': 2,
 '3': 28,
 '4': 44,
 '5': 48,
 '6': 37,
 '7': 24,
 '8': 51,
 '9': 35,
 ':': 54,
 ';': 29,
 '=': 40,
 '?': 49,
 '[': 43,
 ']': 30,
 '_': 25,
 'a': 5,
 'b': 56,
 'c': 53,
 'd': 26,
 'e': 31,
 'f': 3,
 'g': 12,
 'h': 46,
 'i': 20,
 'j': 15,
 'k': 21,
 'l': 19,
 'm': 8,
 'n': 41,
 'o': 23,
 'p': 32,
 'q': 10,
 'r': 18,
 's': 33,
 't': 52,
 'u': 6,
 'v': 55,
 'w': 7,
 'x': 50,
 'y': 38,
 'z': 34,
 'ä': 11,
 'æ': 0,
 'é': 42,
 'ë': 4}
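
Nothing in the notebook actually samples from the trained model yet. For completeness, here are the temperature-sampling helper and generation loop from the original Keras example (the temperature value and 100-character length are arbitrary choices here):

def sample(preds, temperature=1.0):
    # Rescale the predicted character distribution by temperature,
    # then draw one index from the resulting multinomial.
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Seed with a maxlen-character window, then repeatedly predict the next one.
seed = text[0:maxlen]
generated = seed
for _ in range(100):
    x = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(seed):
        x[0, t, char_indices[char]] = 1
    next_index = sample(model.predict(x, verbose=0)[0], temperature=0.5)
    generated += indices_char[next_index]
    seed = seed[1:] + indices_char[next_index]
print(generated)

Lower temperatures concentrate probability mass on the model's top predictions; temperatures near 1.0 sample closer to the raw distribution and produce more varied, noisier text.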
