In [42]:
import pandas as pd
import numpy as np
In [2]:
# Relative path of the markup CSV this notebook processes.
sample_file_path = "./markup/A/ADSL.csv"
In [3]:
# Load the markup table from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer=sample_file_path)
In [4]:
# Preview the first five rows of the loaded table.
data.head(n=5)
Out[4]:
In [5]:
# Pull out the four columns used below as separate Series.
tokens, positions, sentences, norms = (
    data[column] for column in ('token', 'pos', 'sentence', 'norm')
)
In [14]:
def _group_by_sentence(tokens, norms, sentences):
    """Join consecutive rows that share a sentence id into space-separated strings.

    Args:
        tokens: iterable of token strings, one per row.
        norms: iterable of normalised forms, aligned with ``tokens``.
        sentences: iterable of sentence ids, aligned with ``tokens``.

    Returns:
        dict with keys ``'X'`` (joined tokens) and ``'Y'`` (joined norms),
        each a list holding one string per run of equal sentence ids.
        Both lists are empty when the input is empty.
    """
    grouped = {'X': [], 'Y': []}
    sent_tokens, sent_norms = [], []
    previous_id = None
    for position, (token, norm, sentence_id) in enumerate(zip(tokens, norms, sentences)):
        if position > 0 and sentence_id != previous_id:
            # Sentence boundary: flush what was collected so far.
            grouped['X'].append(' '.join(sent_tokens))
            grouped['Y'].append(' '.join(sent_norms))
            sent_tokens, sent_norms = [], []
        # The current row always belongs to the sentence being collected.
        # (Previously the row that triggered the boundary was discarded, so
        # every sentence after the first lost its first token.)
        sent_tokens.append(token)
        sent_norms.append(norm)
        previous_id = sentence_id
    if sent_tokens:
        # Flush the last sentence, which has no boundary after it.
        grouped['X'].append(' '.join(sent_tokens))
        grouped['Y'].append(' '.join(sent_norms))
    return grouped


lines = _group_by_sentence(tokens, norms, sentences)
In [19]:
# Sanity check: show the first source sentence followed by its target.
for side in ('X', 'Y'):
    print(lines[side][0])
In [21]:
def _encode_sentences(sentences, symbols_table, codes_table, next_code):
    """Encode each sentence as a list of integer character codes ending in 0.

    Characters not yet present in ``symbols_table`` are assigned the next
    free code; both lookup tables are updated in place.

    Args:
        sentences: iterable of strings to encode.
        symbols_table: dict mapping character -> code (mutated).
        codes_table: dict mapping code -> character (mutated).
        next_code: first unused integer code.

    Returns:
        (encoded, next_code): the list of encoded sentences and the first
        code that is still unused after encoding.
    """
    encoded = []
    for sent in sentences:
        encoded_sentence = []
        for char in sent:
            if char not in symbols_table:
                symbols_table[char] = next_code
                codes_table[next_code] = char
                next_code += 1
            encoded_sentence.append(symbols_table[char])
        # A trailing 0 closes every sentence; real characters start at code 1.
        encoded_sentence.append(0)
        encoded.append(encoded_sentence)
    return encoded, next_code


symbols_table = {}
codes_table = {}
char_counter = 1
net_data = {}
# Sources and targets share one vocabulary; X is encoded first so its
# characters receive the lowest codes.
net_data['X'], char_counter = _encode_sentences(
    lines['X'], symbols_table, codes_table, char_counter)
net_data['Y'], char_counter = _encode_sentences(
    lines['Y'], symbols_table, codes_table, char_counter)
In [27]:
# Length of the longest encoded sequence across sources and targets.
# The trailing [0] keeps the result at 0 when both lists are empty.
max_len = max(
    [len(sent) for sent in net_data['X']]
    + [len(sent) for sent in net_data['Y']]
    + [0]
)
# Function-call form of print: valid on Python 3 and, for a single argument,
# prints the same text on Python 2.
print(max_len)
In [28]:
# Right-pad every encoded sequence with zeros, in place, up to max_len.
# Sequences already at least max_len long are left untouched, because
# multiplying a list by a non-positive count yields an empty list.
for side in ('X', 'Y'):
    for encoded in net_data[side]:
        encoded.extend([0] * (max_len - len(encoded)))
In [103]:
from keras.layers import LSTM, Dense, Embedding, TimeDistributed, Activation,RepeatVector
from keras.models import Sequential
In [60]:
# Encoder-decoder over character codes:
#   Embedding -> LSTM (final output only) -> Dense -> RepeatVector(max_len)
#   -> LSTM (full sequence) -> per-timestep Dense over the vocabulary.
model = Sequential()
model.add(Embedding(input_dim=char_counter, output_dim=10, input_length=max_len, mask_zero=True))
model.add(LSTM(50))
model.add(Dense(50))
# Repeat the encoded vector once per output timestep for the decoder LSTM.
model.add(RepeatVector(max_len))
model.add(LSTM(50, return_sequences=True))
# Softmax turns the per-timestep scores into a probability distribution,
# which is what 'sparse_categorical_crossentropy' expects by default.
# The layer previously had no activation and emitted raw scores.
model.add(TimeDistributed(Dense(char_counter, activation='softmax')))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.summary()
In [46]:
# Convert the padded source sequences into a NumPy array for Keras.
net_data.update({'X': np.array(net_data['X'])})
In [57]:
# Reshape the padded targets to (n_sentences, sequence_length, 1): one
# integer label per timestep. The shape is taken from the data instead of
# the hard-coded (42, 1023, 1), so other input files work too; re-running
# this cell leaves the array unchanged.
targets = np.asarray(net_data['Y'])
net_data['Y'] = targets.reshape(targets.shape[0], targets.shape[1], 1)
In [64]:
# Train for five epochs on the encoded source/target pairs.
model.fit(x=net_data['X'], y=net_data['Y'], epochs=5)
Out[64]:
In [65]:
# Run the trained model on the same inputs it was fitted on.
net_answer = model.predict(x=net_data['X'])
In [100]:
# Greedy decoding: pick the highest-scoring code at every timestep, skip
# code 0 (never assigned to a character) and map the rest back to text.
predicted_codes = np.argmax(net_answer, axis=-1)
answer_sents = [
    ''.join(codes_table[code] for code in sent_codes if code > 0)
    for sent_codes in predicted_codes
]
In [102]:
# Show the first 20 characters of the first decoded sentence.
print(answer_sents[0][0:20])