In [1]:
#! /usr/bin/env python
#-*- coding:utf-8 -*-

from utils import *
from segment import Segmenter
from vocab import get_vocab, VOCAB_SIZE
from quatrains import get_quatrains
from gensim import models
from numpy.random import uniform
import numpy as np  # np.save is used in cell 3; imported explicitly rather than relying on the star import

ndim = 128


Using TensorFlow backend.
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.240 seconds.
Prefix dict has been built succesfully.

New Word2Vec


In [2]:
# print "Generating %d-dim word embedding ..." %ndim
# int2ch, ch2int = get_vocab()
# ch_lists = []
# quatrains = get_quatrains()
# for idx, poem in enumerate(quatrains):
#     for sentence in poem['sentences']:
#         ch_lists.append(filter(lambda ch: ch in ch2int, sentence))
#     # the i-th characters in the poem, used to boost Dui Zhang
#     i_characters = [[sentence[j] for sentence in poem['sentences']] for j in range(len(poem['sentences'][0]))]
#     for characters in i_characters:
#         ch_lists.append(filter(lambda ch: ch in ch2int, characters))
#     if 0 == (idx+1)%10000:
#         print "[Word2Vec] %d/%d poems have been processed." %(idx+1, len(quatrains))

In [3]:
# print "Hold on. This may take some time ..."
# model = models.Word2Vec(ch_lists, size = ndim, min_count = 5)
# embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])
# for idx, ch in enumerate(int2ch):
#     if ch in model.wv:
#         embedding[idx,:] = model.wv[ch]
# np.save(_w2v_path, embedding)
# print "Word embedding is saved."

Seq to Seq


In [ ]:
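# A minimal sketch of the character-level encoder-decoder this section is
# headed toward, assuming Keras 2 on the TensorFlow backend loaded above.
# The layer layout, hidden size, and file path are illustrative assumptions,
# not the author's actual design; the embedding is the matrix saved earlier.
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense

hidden_dim = 128                                  # illustrative choice
embedding_matrix = np.load('data/word2vec.npy')   # placeholder for _w2v_path
vocab_size, embed_dim = embedding_matrix.shape

# encoder: embed the input characters and keep the LSTM's final state
encoder_inputs = Input(shape=(None,))
encoder_embed = Embedding(vocab_size, embed_dim,
                          weights=[embedding_matrix])(encoder_inputs)
_, state_h, state_c = LSTM(hidden_dim, return_state=True)(encoder_embed)

# decoder: predict the next character of the output line, conditioned on
# the encoder's final state (teacher forcing at training time)
decoder_inputs = Input(shape=(None,))
decoder_embed = Embedding(vocab_size, embed_dim,
                          weights=[embedding_matrix])(decoder_inputs)
decoder_outputs = LSTM(hidden_dim, return_sequences=True)(
    decoder_embed, initial_state=[state_h, state_c])
char_probs = Dense(vocab_size, activation='softmax')(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], char_probs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')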