In [3]:
    
# standard
from IPython import embed
import pandas as pd
import numpy as np
# frameworks
from frameworks.seq2seq_keras.models import AttentionSeq2Seq
from gensim.models import Word2Vec
# custom
from data_utils import get_train_data
from word2vec import get_word_embedding
from vocab import get_vocab
    
In [7]:
    
_BATCH_SIZE = 64
_VOCAB_SIZE = 6000
_WORD_DIM = 128
_MODEL_DEPTH = 4
_INPUT_LENGTH = 25
_OUTPUT_LENGTH = 10
    
In [3]:
    
model = AttentionSeq2Seq(input_length=_INPUT_LENGTH, 
                         input_dim=_WORD_DIM, 
                         hidden_dim=_WORD_DIM, 
                         output_length=_OUTPUT_LENGTH, 
                         output_dim=_WORD_DIM, 
                         depth=_MODEL_DEPTH)
model.compile(loss='mse', optimizer='rmsprop')
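The model maps fixed-length sequences of word vectors to fixed-length sequences of word vectors. A quick shape check with random placeholder data (an illustrative sketch only, not project data) confirms the wiring:

dummy_x = np.random.rand(2, _INPUT_LENGTH, _WORD_DIM)
print(model.predict(dummy_x).shape)  # expected (2, _OUTPUT_LENGTH, _WORD_DIM)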
    
In [8]:
    
embedding = get_word_embedding(_WORD_DIM)
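get_word_embedding is project-specific; it is assumed here to return one 128-dimensional vector per vocabulary id, which a quick check (illustrative) can confirm:

print(np.shape(embedding))  # expected roughly (_VOCAB_SIZE, _WORD_DIM)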
    
In [7]:
    
train_data = get_train_data()
_, ch2int = get_vocab()
    
In [8]:
    
len(train_data)
    
In [9]:
    
def pad_to(lst, length, value):
    """Pad lst in place with value until it reaches the given length."""
    for i in range(len(lst), length):
        lst.append(value)
    return lst

def clean_train_data(train_data):
    """Build (keyword + preceding lines -> current line) pairs of character ids.

    Lines are grouped in poems of four, so idx % 4 gives the position of the
    current line within its poem and only lines from the same poem are used
    as context.
    """
    X_train = []
    Y_train = []
    for idx in range(len(train_data)):
        line_number = idx % 4
        
        keyword = train_data[idx]['keyword']
        current_sentence = train_data[idx]['sentence']
        previous_sentences = ''.join([train_data[idx - i]['sentence'] for i in range(line_number, 0, -1)])
        
        # encode characters as ids and pad to the model's fixed lengths,
        # using the last vocabulary id as the padding symbol
        X_entry = pad_to([[ch2int[ch]] for ch in (keyword + previous_sentences)], _INPUT_LENGTH, [_VOCAB_SIZE - 1])
        Y_entry = pad_to([[ch2int[ch]] for ch in current_sentence], _OUTPUT_LENGTH, [_VOCAB_SIZE - 1])
        
        X_train.append(X_entry)
        Y_train.append(Y_entry)
        
    return X_train, Y_train
    
In [10]:
    
X_train, Y_train = clean_train_data(train_data)
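pad_to only pads and never truncates, so it is worth checking that no keyword-plus-context sequence exceeds the model's fixed lengths (an illustrative check):

print(max(len(x) for x in X_train), max(len(y) for y in Y_train))  # expect at most 25 and 10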
    
In [13]:
    
# replace each character id with its word embedding vector
X_train_embedded = [[embedding[x[0]] for x in sample] for sample in X_train]
    
In [14]:
    
Y_train_embedded = [[embedding[x[0]] for x in sample] for sample in Y_train]
    
In [15]:
    
# Keras expects numpy arrays of shape (samples, time steps, word dim)
model.fit(np.array(X_train_embedded), np.array(Y_train_embedded),
          batch_size=_BATCH_SIZE, epochs=1, verbose=1)
    
    
In [16]:
    
kw = u'山水'  # keyword: "landscape" (mountains and water)
    
In [17]:
    
kw_pad = [pad_to([[ch2int[ch]] for ch in kw], _INPUT_LENGTH, [_VOCAB_SIZE - 1])]
    
In [18]:
    
kw_embed = [[embedding[x[0]] for x in sample] for sample in kw_pad]
    
In [19]:
    
kw_embed_array = np.array(kw_embed)
    
In [20]:
    
pred = model.predict(kw_embed_array)
pred
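pred contains one embedding vector per generated character position; a quick shape check (illustrative):

print(pred.shape)  # expected (1, _OUTPUT_LENGTH, _WORD_DIM)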
    
In [21]:
    
w2v_model = Word2Vec.load('data/word2vec.model')
    
In [22]:
    
# decode each predicted embedding to its nearest vocabulary word
# (gensim < 4.0 API; use w2v_model.wv.most_similar in gensim 4.x)
result = []
for i in range(len(pred[0])):
    result.append(w2v_model.most_similar(positive=[pred[0][i]], topn=1))
    
In [4]:
    
# most_similar returns a list of (word, similarity) pairs, so r[0][0] is the character
for r in result:
    print(r[0][0])
    
    