In [42]:
import pandas as pd
import numpy as np

In [2]:
sample_file_path = './markup/A/ADSL.csv'

In [3]:
data = pd.read_csv(sample_file_path)

In [4]:
data.head()


Out[4]:
Unnamed: 0 id norm pos sentence token
0 0 0 adsl 0 0 ADSL
1 1 1 adsl 0 0 ADSL
2 2 2 ( 11 0 (
3 3 3 13 0
4 4 4 асимметричный 15 0 асимметричная

In [5]:
tokens = data.token
positions = data.pos
sentences = data.sentence
norms = data.norm

In [14]:
current_sentence = sentences[0]
lines = {}
lines['X'] = []
lines['Y'] = []
sent_tokens = []
sent_norms = []
for i in range(len(tokens)):    
    if sentences[i] == current_sentence:
        sent_tokens.append(tokens[i])
        sent_norms.append(norms[i])
    else:
        current_sentence = sentences[i]
#         print(sent_tokens)
        lines['X'].append(' '.join(sent_tokens))
        lines['Y'].append(' '.join(sent_norms))
        sent_tokens = []
        sent_norms = []
else:
    lines['X'].append(' '.join(sent_tokens))
    lines['Y'].append(' '.join(sent_norms))

In [19]:
print(lines['X'][0])
print(lines['Y'][0])


ADSL ADSL ( — асимметричная цифровая абонентская линия ) — модемная технология , в которой доступная полоса пропускания канала распределена между исходящим и входящим трафиком асимметрично .
adsl adsl ( — асимметричный цифровой абонентский линия ) — модемный технология , в который доступный полоса пропускание канал распределить между исходящая и входящая трафик асимметричный .

In [21]:
symbols_table = {}
codes_table = {}
char_counter = 1
net_data = {}
net_data['X'] = []
net_data['Y'] = []
for sent in lines['X']:
    encoded_sentence = []
    for char in sent:
        if char not in symbols_table:
            symbols_table[char] = char_counter
            codes_table[char_counter] = char
            char_counter += 1
        encoded_sentence.append(symbols_table[char])
    encoded_sentence.append(0)
    net_data['X'].append(encoded_sentence)
    
for sent in lines['Y']:
    encoded_sentence = []
    for char in sent:
        if char not in symbols_table:
            symbols_table[char] = char_counter
            codes_table[char_counter] = char
            char_counter += 1
        encoded_sentence.append(symbols_table[char])        
    encoded_sentence.append(0)
    net_data['Y'].append(encoded_sentence)

In [27]:
max_len = 0
for sent in net_data['X']:
    if len(sent) > max_len:
        max_len = len(sent)
        
for sent in net_data['Y']:
    if len(sent) > max_len:
        max_len = len(sent)
print max_len


1023

In [28]:
for sent in net_data['X']:
    while len(sent) < max_len:
        sent.append(0)    
        
for sent in net_data['Y']:
    while len(sent) < max_len:
        sent.append(0)

In [103]:
from keras.layers import LSTM, Dense, Embedding, TimeDistributed, Activation,RepeatVector
from keras.models import Sequential

In [60]:
model = Sequential()
model.add(Embedding(input_dim=char_counter, output_dim=10, input_length=max_len, mask_zero=True))
model.add(LSTM(50))
model.add(Dense(50))
model.add(RepeatVector(max_len))
model.add(LSTM(50, return_sequences=True))
model.add(TimeDistributed(Dense(char_counter)))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
             metrics=['accuracy'])

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_11 (Embedding)     (None, 1023, 10)          1170      
_________________________________________________________________
lstm_19 (LSTM)               (None, 50)                12200     
_________________________________________________________________
dense_15 (Dense)             (None, 50)                2550      
_________________________________________________________________
repeat_vector_10 (RepeatVect (None, 1023, 50)          0         
_________________________________________________________________
lstm_20 (LSTM)               (None, 1023, 50)          20200     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 1023, 117)         5967      
=================================================================
Total params: 42,087
Trainable params: 42,087
Non-trainable params: 0
_________________________________________________________________

In [46]:
net_data['X'] = np.array(net_data['X'])

In [57]:
net_data['Y'] = np.reshape(net_data['Y'],(42,1023,1))

In [64]:
model.fit(net_data['X'], net_data['Y'], epochs=5)


Epoch 1/5
42/42 [==============================] - 3s 68ms/step - loss: 6.1160 - acc: 0.0408
Epoch 2/5
42/42 [==============================] - 3s 66ms/step - loss: 9.0023 - acc: 0.0408
Epoch 3/5
42/42 [==============================] - 3s 65ms/step - loss: 4.6388 - acc: 0.0408
Epoch 4/5
42/42 [==============================] - 3s 66ms/step - loss: 3.2893 - acc: 0.0408
Epoch 5/5
42/42 [==============================] - 3s 70ms/step - loss: 5.5618 - acc: 0.0408
Out[64]:
<keras.callbacks.History at 0x26942c50>

In [65]:
net_answer = model.predict(net_data['X'])

In [100]:
answer_sents = []
for sent in net_answer:
    answer_sent = ''
    for char in sent:
        res = np.argmax(char)        
        if res > 0:
            letter = codes_table[res]
            answer_sent += codes_table[res]
    answer_sents.append(answer_sent)

In [102]:
print(answer_sents[0][:20])


��������������������