In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np

In [2]:
# 設定環境變數來控制 keras, theano
import os
os.environ['KERAS_BACKEND']="tensorflow"
os.environ['THEANO_FLAGS']="floatX=float32, device=cuda"

用 jieba 斷詞


In [3]:
import jieba
jieba.set_dictionary('dict.txt.big')
" - ".join(jieba.cut("今天天氣很好"))


Building prefix dict from /home/tjw/src/HackNTU_Data_2017/Week08/dict.txt.big ...
Loading model from cache /tmp/jieba.uabe345d172aa2c353a2c9ed1c2ff896a.cache
Loading model cost 0.979 seconds.
Prefix dict has been built succesfully.
Out[3]:
'今天天氣 - 很 - 好'

下載資料


In [4]:
if not os.path.isfile("sdyxz_all.txt"):    
    with open("sdyxz_all.txt","w") as outf:
        for i in range(1, 41):
            r = urlopen('http://www.millionbook.net/wx/j/jingyong/sdyxz/%03d.htm'%i)
            html = r.read().decode('cp950', "ignore")
            bs = BeautifulSoup(html, 'lxml')
            text = bs.findAll('td')[6].get_text()
            if len(text)<100:
                print("error")
                break
            print (i, len(text))
            outf.write(text)

In [5]:
# 忽略不要的詞
ignore_words = set("""the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
that











一個
沒有




我們
你們
妳們
他們
她們
是否""".split("\n"))|set(",。*「」:?\n\u3000!、…』『《》-")

# 自訂辭典
jieba.add_word("黃蓉")
jieba.suggest_freq("黃蓉", True)
jieba.add_word("郭靖")
jieba.suggest_freq("郭靖", True)
with open("sdyxz_all.txt", "r") as f:
    words = [w for w in jieba.cut(f.read()) if w not in ignore_words]
print("len=", len(words))
print(words[:100])


len= 384051
['第一回', '風雪', '驚變', '錢塘江', '浩浩', '江水', '日日夜夜', '無窮', '無休', '從', '臨安', '牛家村', '邊', '繞過', '東流', '入海', '江畔', '一排', '數十株', '烏', '柏樹', '葉子', '似', '火燒', '般紅', '正是', '八月', '天時', '村前村後', '野草', '剛', '起始', '變黃', '一抹', '斜陽', '映照', '之下', '更增', '幾分', '蕭索', '兩株', '大', '松樹', '下圍', '一堆', '村民', '男男女女', '十幾個', '小孩', '正自', '聚精會神', '聽', '瘦削', '老者', '說話', '那', '說話', '人五', '十來', '歲', '年紀', '一件', '青布', '長袍', '早洗', '得', '褪成', '藍灰色', '只', '聽', '兩片', '梨花', '木板', '碰', '幾下', '左手', '中', '竹棒', '在', '一面', '小', '羯鼓', '上', '敲起', '得', '得', '連聲', '唱道', '小桃', '無主自', '開花', '煙草', '茫茫', '帶', '晚鴉', '幾處', '敗垣圍', '故井', '向來', '一一']

先處理掉少用字


In [6]:
# 總共有多少種字?
len(set(words))


Out[6]:
47134

我們只考慮最常用的 10000 字, 其他字用 UNK 取代


In [7]:
import collections
# 先統計字數
counter = collections.Counter(words)
# 可以看一下 counter 是的內容

In [8]:
def sep_words(words):
    for w in words:
        if counter[w]<3:
            for c in w:
                yield c
        else:
            yield w
words = list(sep_words(words))
len(words)


Out[8]:
434741

In [9]:
counter = collections.Counter(words)

In [10]:
# 最常見的 30 個字
counter.most_common(30)


Out[10]:
[('道', 7119),
 ('在', 5042),
 ('郭靖', 3364),
 ('也', 3086),
 ('不', 3050),
 ('得', 2640),
 ('又', 2622),
 ('這', 2561),
 ('那', 2524),
 ('黃蓉', 2493),
 ('去', 2225),
 ('上', 2098),
 ('一', 2001),
 ('人', 1916),
 ('中', 1867),
 ('說', 1857),
 ('卻', 1777),
 ('有', 1728),
 ('來', 1697),
 ('已', 1682),
 ('到', 1678),
 ('見', 1559),
 ('聽', 1548),
 ('但', 1504),
 ('要', 1452),
 ('叫', 1320),
 ('向', 1300),
 ('大', 1234),
 ('之', 1225),
 ('好', 1187)]

In [11]:
vocabulary_size = sum(x>2 for x in counter.values())
print(vocabulary_size)
wordfreq = counter.most_common(vocabulary_size-1)


# 建立 編號: 字 的對照表
num2word = ['UNK'] + [w for (w, _) in wordfreq]
freq = np.array([0]+[n for (_, n) in wordfreq], dtype="float64")
freq[0] = len(words) - freq.sum()
# 建立一個 字: 編號 的對照表
word2num = {w: i for i, w in enumerate(num2word)}

# 把 words 轉成對定的編號
data = np.array([word2num.get(word, 0) for word in words])
# 不需要 words 了
del words
del wordfreq
freq[:10]


17179
Out[11]:
array([ 1381.,  7119.,  5042.,  3364.,  3086.,  3050.,  2640.,  2622.,
        2561.,  2524.])

看一下目前的狀況


In [12]:
print(data[:20])
print(" - ".join(num2word[n] for n in data[:20]))


[  499    13   143  8023   467   923 12139  4536  4536  4662 14639  4768
    64  2511    56   681   866   437  4192   577]
第 - 一 - 回 - 風雪 - 驚 - 變 - 錢塘江 - 浩 - 浩 - 江水 - 日日夜夜 - 無窮 - 無 - 休 - 從 - 臨安 - 牛家村 - 邊 - 繞過 - 東

生成 skip-gram 模型的 batch

keywords skipgram, cbow, n-gram


In [13]:
import keras.backend as K


Using TensorFlow backend.

In [14]:
from keras.layers import Embedding, Dense, Flatten, Input
from keras.models import Sequential, Model
import keras.backend as K
import tensorflow as tf

# vector 的維度
embedding_size = 64

# 這其實只是線性映射,只不過輸入不是 one hot 而是 integer, 所以等同查表
word2vec = Sequential()
word2vec.add(Embedding(vocabulary_size, embedding_size, input_length=1))
word2vec.add(Flatten())
train_input = word2vec.inputs[0]
embeddings = word2vec.layers[0].embeddings 

# 對應到的 context
train_labels = Input(shape=(1,), dtype="int32")

# 這裡利用 tensorflow 的 nce_loss
nce_W = K.variable(K.random_normal((vocabulary_size, embedding_size),stddev=(embedding_size)**-0.5))
loss = K.mean(tf.nn.nce_loss(
                     weights=nce_W,
                     biases=K.zeros((vocabulary_size,)),
                     labels=train_labels,
                     inputs=word2vec.output,
                     num_sampled=64, # Number of negative examples to sample.
                     num_classes=vocabulary_size))

# 利用 tensorflow 的 optimizer
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

# 之後要拿來檢驗的例子
valid_examples = np.array([word2num[x] for x in ["郭靖", "黃蓉", "聽", "梅超風", "自己", "武功"]])
#valid_examples = np.array(np.random.choice(100, size=16, replace=False)+1)
valid_size = len(valid_examples)
valid_dataset = K.constant(valid_examples[:, None], "int32")
valid_embeddings = word2vec(valid_dataset)

# 正規化 embeddings, 來找 nearest neighbor 
normalized_embeddings = K.l2_normalize(embeddings, 1)
similarity = K.dot(valid_embeddings, K.transpose(normalized_embeddings))

# Add variable initializer.
init = tf.global_variables_initializer()

In [15]:
def skipgram_batch(data, batch_size, num_skips, skip_window):
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window    
    context_length = skip_window*2+1
    X = np.ndarray(shape=batch_size, dtype=np.int32)
    Y = np.ndarray(shape=batch_size, dtype=np.int32)
    idx = 0
    while True:
        for i in range(0, batch_size, num_skips):
            X[i:i+num_skips] = data[idx+skip_window]            
            context = data[idx:idx+context_length][np.arange(context_length) != skip_window]        
            # subsampling 機率
            #p = np.ones(2*skip_window)/2/skip_window
            Y[i:i+num_skips] = np.random.choice(context, size=num_skips, replace=False)
            idx = (idx+1)%(len(data)-context_length)
        yield X[:, None], Y
# 測試看看
X,Y = next(skipgram_batch(data, 20, 4, 3))
for x,y in zip(X, Y):
    print("{} -> {}".format(num2word[x[0]], num2word[y]) )


風雪 -> 一
風雪 -> 變
風雪 -> 錢塘江
風雪 -> 第
驚 -> 回
驚 -> 一
驚 -> 錢塘江
驚 -> 風雪
變 -> 回
變 -> 浩
變 -> 錢塘江
變 -> 風雪
錢塘江 -> 浩
錢塘江 -> 驚
錢塘江 -> 浩
錢塘江 -> 江水
浩 -> 錢塘江
浩 -> 驚
浩 -> 江水
浩 -> 變

In [16]:
import time
t0 = time.time()
batch_gen = skipgram_batch(data, batch_size=128, num_skips=4, skip_window=3)
with tf.Session() as sess:
    sess.run(init)
    average_loss = 0
    for step in range(0,200001):
        X,Y = next(batch_gen)
        feed_dict = {train_input: X, train_labels: Y[:, None]}
        _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step >0 and step %10000 == 0:
            print(step, "average loss", average_loss/2000, time.time()-t0)
            average_loss = 0
        if step % 50000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = num2word[valid_examples[i]]
                nearest = (-sim[i, :]).argsort()[1:8 + 1]
                print(valid_word, [num2word[x] for x in nearest])
    final_embeddings = normalized_embeddings.eval()


郭靖 ['一方', '開眼界', '開外', '相伴', '有勞', '還不給', '汗巾', '偏偏']
黃蓉 ['頭破血流', '農夫', '遺命', '撿拾', '碰上', '褲腰', '分神', '這根']
聽 ['四拜', '陪同', '身高', '奇特', '鑽研', '隨他', '上門', '怎是']
梅超風 ['走開', '手忙腳亂', '妙手', '移', '駛近', '自身', '昂首', '十成']
自己 ['累死', '擒住', '花槍', '一碗', '昏睡', '顯見', '有事', '拿些']
武功 ['數十枚', '再也不會', '愛憐', '多遠', '爐火純青', '噹噹', '每次', '自然']
10000 average loss 117.482979189 19.07629156112671
20000 average loss 26.6575458162 37.70765018463135
30000 average loss 25.3875504308 59.566888093948364
40000 average loss 25.2453875465 80.43117189407349
50000 average loss 25.0363836396 98.54785776138306
郭靖 ['歐陽克', '陸冠英', '魯有腳', '書畫', '陸莊主', '微一', '沉吟', '了頭']
黃蓉 ['一燈', '陸冠英', '傻姑', '穆念慈', '微笑', '洪七公', '了頭', '陸莊主']
聽 ['有人', '盼', '聽得', '聽到', '一人', '聲音', '傾聽', '簫聲']
梅超風 ['離座', '簡管家', '一交', '壓住', '歐陽克見', '一登', '轎子', '已自']
自己 ['此刻', '內力', '深湛', '難保', '抵擋不住', '緊迫', '一燈大師', '精兵']
武功 ['武藝', '功夫', '內功', '武學', '雖然', '正宗', '畢竟', '本領']
60000 average loss 24.8738398213 116.52232456207275
70000 average loss 24.7359488605 134.91940927505493
80000 average loss 24.6463236263 152.87690782546997
90000 average loss 24.4874018786 170.96612906455994
100000 average loss 24.3574825135 188.77369213104248
郭靖 ['歐陽克', '招呼', '當下', '郭靖拉', '細細', '那小王爺', '忽變', '兩人']
黃蓉 ['穆念慈', '華箏', '搖手', '黃蓉微微', '一燈', '黃蓉忙', '瞪視', '那道士']
聽 ['聽得', '傾聽', '郭靖聽', '聲音', '幾聲', '有人', '靜夜', '黃蓉聽']
梅超風 ['歐陽鋒', '歐陽克', '已自', '周伯通', '柯鎮惡', '一燈大師', '微弱', '彭連虎']
自己 ['不顧', '此刻', '那丫頭', '一燈大師', '除非', '忘卻', '給人', '死不瞑目']
武功 ['武藝', '內功', '武學', '功夫', '本領', '畢竟', '全真教', '功力']
110000 average loss 24.2703796308 206.72057342529297
120000 average loss 24.1616960599 225.14439344406128
130000 average loss 24.0465097893 243.3114731311798
140000 average loss 23.940991281 261.03778862953186
150000 average loss 23.8917481182 278.7579357624054
郭靖 ['歐陽克', '黃蓉', '那小王爺', '陸冠英', '盛情', '難決', '來問', '說知']
黃蓉 ['穆念慈', '搖手', '華箏', '完顏康', '說知', '郭靖微', '噢', '喜不喜歡']
聽 ['聽得', '幾聲', '傾聽', '郭靖聽', '數聲', '聲音', '有人', '馬蹄聲']
梅超風 ['黃藥師', '歐陽鋒', '歐陽克', '柯鎮惡', '已自', '正以', '郭靖單', '劇震']
自己 ['情深', '重責', '提不起來', '但六怪', '驚詫', '翻來覆去', '不活', '自傷']
武功 ['武藝', '功夫', '功力', '武學', '本領', '內功', '要旨', '師弟']
160000 average loss 23.7584630456 296.74143528938293
170000 average loss 23.673121639 314.8308792114258
180000 average loss 23.5767048212 332.63936829566956
190000 average loss 23.5586743369 350.54309487342834
200000 average loss 23.421080289 368.4532642364502
郭靖 ['歐陽克', '黃蓉', '那小王爺', '急欲', '要分', '收力', '陸冠英', '兩人']
黃蓉 ['搖手', '郭靖微', '忽變', '掉頭', '畫像', '簡管家', '低眉', '來看']
聽 ['聽得', '傾聽', '嘲諷', '嘶啞', '擦擦', '郭靖聽', '一聽', '聽聽']
梅超風 ['歐陽鋒', '黃藥師', '正以', '柯鎮惡', '歐陽克', '推', '骨骼', '彭連虎']
自己 ['不慎', '重責', '我親', '提不起來', '情深', '滿肚子', '成全', '正以']
武功 ['武藝', '功力', '功夫', '武學', '內功', '本領', '要旨', '陣法']

In [17]:
def find_sim(v, num=10):
    if isinstance(v, str):
        v = w2v(v)
    return [num2word[x] for x in  (final_embeddings @ v).argsort()[-num-1:-1][::-1]]
def w2v(w):
    return final_embeddings[word2num.get(w, 0)] 
find_sim(w2v('黃藥師'))


Out[17]:
['梅超風', '歐陽鋒', '問那', '安好', '旁觀者清', '黃蓉', '硬闖', '中計', '二弟', '快馬一鞭']

In [18]:
find_sim('吃')


Out[18]:
['餵', '吃飯', '不吃', '吃過', '煮', '粥', '猛吃', '餓', '碗飯', '照面']

In [19]:
find_sim(w2v('西毒')+w2v('洪七公')-w2v('北丐'))


Out[19]:
['歐陽鋒', '穆易', '黃蓉', '一燈', '周伯通', '隨機應變', '生雙靨', '全愈', '七分', '郭靖']

In [20]:
find_sim( w2v('郭靖')+(w2v('穆念慈')-w2v('楊康')) )


Out[20]:
['穆念慈', '華箏', '黃蓉', '嗔', '陸冠英', '兩人', '沉吟', '分開', '靖哥哥', '先前']

In [32]:
find_sim( w2v('洪七公')+(w2v('蛤蟆功')-w2v('歐陽鋒')) )


Out[32]:
['包惜弱', '天邊', '大踏步', '走出', '傻姑', '走', '歇歇', '完顏洪烈', '歐陽鋒', '巖洞']

In [30]:
find_sim(w2v('吃')- w2v("飯")+w2v("酒"))


Out[30]:
['喝', '酒', '餵', '一杯', '一大口', '吃過', '生得', '骨嘟', '咦', '咕嘟']

In [23]:
# 用 t-sne 降低維度
from sklearn.manifold import TSNE
samples = 500
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
low_dim_embs = tsne.fit_transform(final_embeddings[:samples])
labels = num2word[1:samples]

In [24]:
# 從 00Download.ipynb 來的
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import os.path
from urllib.request import urlopen
def download_req(req, filename):
    total = int(req.getheader("Content-Length"))
    with open(filename,'wb') as f:
        i = 0
        for data in iter(lambda: req.read(8192), b""):  
            i+=1
            f.write(data)
            print("\rdownloading: %5.1f%%"%(i*8192*100.0/total), end="")

# 字體下載
font_filename = 'NotoSansCJKtc-hinted.zip'
font_url = "https://noto-website-2.storage.googleapis.com/pkgs/"+font_filename
# 改變這行才能真正下載
if not (os.path.isfile(font_filename) and os.path.getsize(font_filename)==121247052):
    with urlopen(font_url) as req:
        download_req(req, "NotoSansCJKtc-hinted.zip")
# Extract Font files
import zipfile
with zipfile.ZipFile(font_filename) as zf:
    for f in zf.namelist():
        if f.endswith('.otf'):
            print("extract", f)
            zf.extract(f)

fp = matplotlib.font_manager.FontProperties(fname = 'NotoSansCJKtc-Regular.otf')
matplotlib.font_manager.fontManager.ttffiles.append(fp.get_file())
font_entry = matplotlib.font_manager.FontEntry(fp.get_file(), name=fp.get_name(),
                                               style=fp.get_style(), variant=fp.get_variant(),
                                              weight=fp.get_weight(), stretch=fp.get_stretch(), size=fp.get_size())

matplotlib.font_manager.fontManager.ttflist.append(font_entry)
plt.rcParams['font.family'] = fp.get_name()


extract NotoSansCJKtc-Black.otf
extract NotoSansCJKtc-Bold.otf
extract NotoSansCJKtc-DemiLight.otf
extract NotoSansCJKtc-Light.otf
extract NotoSansCJKtc-Medium.otf
extract NotoSansCJKtc-Regular.otf
extract NotoSansCJKtc-Thin.otf
extract NotoSansMonoCJKtc-Bold.otf
extract NotoSansMonoCJKtc-Regular.otf

In [25]:
# 畫出來

plt.figure(figsize=(20,20))
plt.scatter(low_dim_embs[:, 0], low_dim_embs[:, 1])
for i, label in enumerate(labels):
    x, y = low_dim_embs[i]
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 fontsize=14,
                 ha='right',
                 va='bottom')



In [ ]: