In [1]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.datasets.data_utils import get_file
import numpy as np
import random
import sys

from Bio import SeqIO
from sklearn.utils import resample
from sklearn.preprocessing import LabelBinarizer

import custom_funcs as cf
import pandas as pd


Using gpu device 0: GRID K520 (CNMeM is disabled)
/home/ubuntu/anaconda3/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
Using Theano backend.

In [2]:
# Read in the protease inhibitor data
data, drug_cols, feat_cols = cf.read_data('hiv-protease-data.csv', n_data_cols=8)
print(len(data))
# Read in the consensus data
consensus_map = cf.read_consensus('hiv-protease-consensus.fasta')

# Clean the data
data = cf.clean_data(data, feat_cols, consensus_map)

# Identify feature columns
data = cf.drop_ambiguous_sequences(data, feat_cols)
data.dropna(inplace=True, subset=feat_cols)
data.head()


1808
Out[2]:
FPV ATV IDV LPV NFV SQV TPV DRV P1 P2 ... P90 P91 P92 P93 P94 P95 P96 P97 P98 P99
SeqID
4432 1.5 NaN 1.0 NaN 2.2 1.1 NaN NaN P Q ... L T Q I G C T L N F
4664 3.1 NaN 8.7 NaN 32.0 16.9 NaN NaN P Q ... M T Q I G C T L N F
5221 NaN NaN 0.8 0.8 1.2 0.7 NaN NaN P Q ... L T Q I G C T L N F
5279 8.3 79 16.0 12.0 600.0 1000.0 NaN NaN P Q ... M T Q I G C T L N F
5444 2.7 21 24.0 6.1 42.0 132.0 NaN NaN P Q ... M T Q I G C T L N F

5 rows × 107 columns
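
`custom_funcs` isn't shown in this notebook, so as a rough sketch of what `drop_ambiguous_sequences` might be doing, assuming ambiguous positions are recorded as multi-letter mixtures (e.g. 'IV') or 'X' (a guess at the helper's logic, not the actual implementation):

def drop_ambiguous_sequences(data, feat_cols):
    # Hypothetical reimplementation: keep only rows where every sequence
    # position holds exactly one unambiguous amino-acid letter; NaNs pass
    # through for the dropna() call that follows.
    def unambiguous(row):
        return all(pd.isnull(aa) or (isinstance(aa, str) and len(aa) == 1 and aa != 'X')
                   for aa in row)
    return data[data[feat_cols].apply(unambiguous, axis=1)]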


In [3]:
# Audience choice: Which drug would you like?
print(drug_cols)

DRUG = 'FPV'


Index(['FPV', 'ATV', 'IDV', 'LPV', 'NFV', 'SQV', 'TPV', 'DRV'], dtype='object')

In [5]:
# Parameters for cutting sequences into chunks of 20 a.a.

chunk_size = 20
motifs = []
jump_size = 5
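
The chunking itself never made it into this cell. A minimal sketch of what it might look like, assuming each row of data[feat_cols] is joined into a single amino-acid string (only chunk_size, motifs, and jump_size come from the cell above; the rest is an assumption):

# Slide a window of chunk_size residues along each sequence,
# advancing jump_size positions at a time.
for seq in data[feat_cols].apply(''.join, axis=1):
    for start in range(0, len(seq) - chunk_size + 1, jump_size):
        motifs.append(seq[start:start + chunk_size])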

In [45]:
# build the model: 2 stacked LSTM layers
print('Build model...')
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))


Out[45]:
SeqID
59258     1.131402
117142    3.737670
27084     2.484907
205640    2.833213
60106     0.262364
81797    -0.105361
68353    -0.693147
54411     1.526056
56022    -0.223144
197096   -0.105361
13239    -2.302585
28215     0.095310
61105     0.182322
79370    -0.693147
148009    0.336472
45122    -1.203973
45065    -1.609438
143455    2.197225
46209    -0.510826
187169    0.000000
172136    1.887070
51225     0.641854
75112    -0.510826
109410   -0.105361
61147    -0.916291
7119      0.336472
116511    2.772589
46705    -0.693147
90010     1.648659
29045    -0.916291
            ...   
45080    -0.105361
68992     0.955511
197088   -0.223144
187195   -0.510826
216661   -0.916291
257947   -0.510826
41597    -0.916291
117142    3.737670
12650    -0.693147
54411     1.526056
13242    -1.609438
54170     3.437208
98603     5.991465
54412     2.890372
45040     0.000000
116501   -0.510826
45096     0.000000
75114    -0.916291
60104    -0.510826
56478    -0.223144
66640     0.095310
77379     3.912023
81843     1.667707
54398     1.064711
61151    -0.510826
60102     3.401197
27465     0.336472
257923    0.095310
29045    -0.916291
197092   -0.223144
Name: FPV, dtype: float64
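
Two notes on this cell. First, maxlen and chars are defined further down in In [19], which ran earlier (execution count 19 < 45), so the cell executes cleanly despite the document order. Second, the Out[45] series above looks like natural-log-transformed FPV fold-resistance values (e.g. ln 0.5 ≈ -0.693), presumably left over from an earlier revision of the cell rather than output of the model build, whose calls all return None. The network is also never compiled or trained here; a minimal sketch of that step, following the classic Keras text-generation example this notebook borrows from (batch size is an assumption; 20 epochs follows the docstring below):

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
# nb_epoch is the Keras 0.x/1.x keyword; assumed values, tune as needed.
model.fit(X, y, batch_size=128, nb_epoch=20)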

In [19]:
'''
    Example script to generate text from Nietzsche's writings.
    At least 20 epochs are required before the generated text
    starts sounding coherent.
    It is recommended to run this script on GPU, as recurrent
    networks are quite computationally intensive.
    If you try this script on new data, make sure your corpus
    has at least ~100k characters. ~1M is better.
'''

path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read().lower()
print('corpus length:', len(text))

chars = set(text)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 20
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


corpus length: 600893
total chars: 57
nb sequences: 200291
Vectorization...

In [16]:
sentences[10]


Out[16]:
' is a woman--what th'

In [48]:
next_chars[10]


Out[48]:
'e'

In [4]:
X.shape


Out[4]:
(200291, 20, 57)
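
The three axes line up with the vectorization parameters above: 200,291 windows, each maxlen = 20 characters long, one-hot encoded over the 57-character alphabet.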

In [6]:
char_indices


Out[6]:
{'\n': 16,
 ' ': 22,
 '!': 14,
 '"': 39,
 "'": 13,
 '(': 47,
 ')': 17,
 ',': 9,
 '-': 36,
 '.': 1,
 '0': 45,
 '1': 27,
 '2': 2,
 '3': 28,
 '4': 44,
 '5': 48,
 '6': 37,
 '7': 24,
 '8': 51,
 '9': 35,
 ':': 54,
 ';': 29,
 '=': 40,
 '?': 49,
 '[': 43,
 ']': 30,
 '_': 25,
 'a': 5,
 'b': 56,
 'c': 53,
 'd': 26,
 'e': 31,
 'f': 3,
 'g': 12,
 'h': 46,
 'i': 20,
 'j': 15,
 'k': 21,
 'l': 19,
 'm': 8,
 'n': 41,
 'o': 23,
 'p': 32,
 'q': 10,
 'r': 18,
 's': 33,
 't': 52,
 'u': 6,
 'v': 55,
 'w': 7,
 'x': 50,
 'y': 38,
 'z': 34,
 'ä': 11,
 'æ': 0,
 'é': 42,
 'ë': 4}
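
Nothing in the notebook actually samples from the trained model yet. For completeness, here are the temperature-sampling helper and generation loop from the original Keras example (the temperature value and 100-character length are arbitrary choices here):

def sample(preds, temperature=1.0):
    # Rescale the predicted character distribution by temperature,
    # then draw one index from the resulting multinomial.
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Seed with a maxlen-character window, then repeatedly predict the next one.
seed = text[0:maxlen]
generated = seed
for _ in range(100):
    x = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(seed):
        x[0, t, char_indices[char]] = 1
    next_index = sample(model.predict(x, verbose=0)[0], temperature=0.5)
    generated += indices_char[next_index]
    seed = seed[1:] + indices_char[next_index]
print(generated)

Lower temperatures concentrate probability mass on the model's top predictions; temperatures near 1.0 sample closer to the raw distribution and produce more varied, noisier text.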
