In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
In [2]:
# Set environment variables to control the Keras / Theano backends
import os
os.environ['KERAS_BACKEND']="tensorflow"
os.environ['THEANO_FLAGS']="floatX=float32, device=cuda"
In [3]:
import jieba
jieba.set_dictionary('dict.txt.big')
" - ".join(jieba.cut("今天天氣很好"))
Out[3]:
In [4]:
if not os.path.isfile("sdyxz_all.txt"):
    with open("sdyxz_all.txt", "w") as outf:
        for i in range(1, 41):
            r = urlopen('http://www.millionbook.net/wx/j/jingyong/sdyxz/%03d.htm' % i)
            html = r.read().decode('cp950', "ignore")
            bs = BeautifulSoup(html, 'lxml')
            text = bs.findAll('td')[6].get_text()
            if len(text) < 100:
                print("error")
                break
            print(i, len(text))
            outf.write(text)
In [5]:
# Stop words to ignore
ignore_words = set("""the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
that
的
了
和
是
就
都
而
及
與
著
或
一個
沒有
你
我
他
她
我們
你們
妳們
他們
她們
是否""".split("\n"))|set(",。*「」:?\n\u3000!、…』『《》-")
# Custom dictionary entries
jieba.add_word("黃蓉")
jieba.suggest_freq("黃蓉", True)
jieba.add_word("郭靖")
jieba.suggest_freq("郭靖", True)
with open("sdyxz_all.txt", "r") as f:
    words = [w for w in jieba.cut(f.read()) if w not in ignore_words]
print("len=", len(words))
print(words[:100])
In [6]:
# How many distinct tokens are there?
len(set(words))
Out[6]:
We only consider the most common 10000 tokens; all other tokens are replaced with UNK.
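As a toy illustration of this replacement (hypothetical tokens, unrelated to the corpus below): keep only the most frequent tokens, and map anything outside the vocabulary to index 0, which stands for UNK.
# Toy sketch of UNK replacement (illustrative only)
import collections
toy = ["a", "b", "a", "c", "a", "b", "rare"]
toy_counter = collections.Counter(toy)
toy_num2word = ['UNK'] + [w for w, _ in toy_counter.most_common(2)]   # pretend the vocabulary holds only 2 tokens
toy_word2num = {w: i for i, w in enumerate(toy_num2word)}
print([toy_word2num.get(w, 0) for w in toy])   # out-of-vocabulary tokens map to 0 (UNK)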
In [7]:
import collections
# Count token frequencies first
counter = collections.Counter(words)
# You can take a look at the contents of counter
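For example, a quick peek could look like this (outputs not shown here):
counter.most_common(5)   # the five most frequent tokens and their counts
len(counter)             # number of distinct tokens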
In [8]:
def sep_words(words):
    # Split rare words (count < 3) into their individual characters
    for w in words:
        if counter[w] < 3:
            for c in w:
                yield c
        else:
            yield w
words = list(sep_words(words))
len(words)
Out[8]:
In [9]:
counter = collections.Counter(words)
In [10]:
# The 30 most common tokens
counter.most_common(30)
Out[10]:
In [11]:
vocabulary_size = sum(x>2 for x in counter.values())
print(vocabulary_size)
wordfreq = counter.most_common(vocabulary_size-1)
# Build the index -> word lookup table
num2word = ['UNK'] + [w for (w, _) in wordfreq]
freq = np.array([0]+[n for (_, n) in wordfreq], dtype="float64")
freq[0] = len(words) - freq.sum()
# Build the word -> index lookup table
word2num = {w: i for i, w in enumerate(num2word)}
# Convert words into their corresponding indices
data = np.array([word2num.get(word, 0) for word in words])
# words is no longer needed
del words
del wordfreq
freq[:10]
Out[11]:
Take a look at the current state
In [12]:
print(data[:20])
print(" - ".join(num2word[n] for n in data[:20]))
Generate training batches for the skip-gram model
keywords: skip-gram, CBOW, n-gram
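Before writing the batch generator, here is what skip-gram training pairs look like on a toy token list (a sketch for illustration only; the generator below additionally samples num_skips context words per center word):
# Toy illustration: each center word predicts the words within a small window on either side
toy_tokens = ["郭靖", "拜", "洪七公", "為", "師"]
toy_window = 1
for i, center in enumerate(toy_tokens):
    lo, hi = max(0, i - toy_window), min(len(toy_tokens), i + toy_window + 1)
    for j in range(lo, hi):
        if j != i:
            print(center, "->", toy_tokens[j])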
In [13]:
import keras.backend as K
In [14]:
from keras.layers import Embedding, Dense, Flatten, Input
from keras.models import Sequential, Model
import keras.backend as K
import tensorflow as tf
# dimension of the word vectors
embedding_size = 64
# This is really just a linear map; the input is an integer instead of a one-hot vector, so it amounts to a table lookup
word2vec = Sequential()
word2vec.add(Embedding(vocabulary_size, embedding_size, input_length=1))
word2vec.add(Flatten())
train_input = word2vec.inputs[0]
embeddings = word2vec.layers[0].embeddings
# the corresponding context word
train_labels = Input(shape=(1,), dtype="int32")
# use TensorFlow's nce_loss here
nce_W = K.variable(K.random_normal((vocabulary_size, embedding_size), stddev=(embedding_size)**-0.5))
loss = K.mean(tf.nn.nce_loss(
    weights=nce_W,
    biases=K.zeros((vocabulary_size,)),
    labels=train_labels,
    inputs=word2vec.output,
    num_sampled=64,  # Number of negative examples to sample.
    num_classes=vocabulary_size))
# use a TensorFlow optimizer
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
# examples used later for sanity checks
valid_examples = np.array([word2num[x] for x in ["郭靖", "黃蓉", "聽", "梅超風", "自己", "武功"]])
#valid_examples = np.array(np.random.choice(100, size=16, replace=False)+1)
valid_size = len(valid_examples)
valid_dataset = K.constant(valid_examples[:, None], "int32")
valid_embeddings = word2vec(valid_dataset)
# normalize the embeddings for nearest-neighbor lookup
normalized_embeddings = K.l2_normalize(embeddings, 1)
similarity = K.dot(valid_embeddings, K.transpose(normalized_embeddings))
# Add variable initializer.
init = tf.global_variables_initializer()
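tf.nn.nce_loss avoids computing a softmax over the full vocabulary: for each (input, context) pair it samples a small number of noise words (num_sampled=64 above) and trains a logistic classifier to separate the true context word from the noise. A rough numpy sketch of that idea, not TensorFlow's exact sampling scheme or bias handling:
# Rough sketch of NCE / negative sampling for a single training pair (illustrative only)
def sketch_nce_loss(input_vec, true_vec, noise_vecs):
    # input_vec:  embedding of the input word, shape (d,)
    # true_vec:   output-weight row of the real context word, shape (d,)
    # noise_vecs: output-weight rows of sampled noise words, shape (k, d)
    sigmoid = lambda x: 1/(1 + np.exp(-x))
    pos = -np.log(sigmoid(input_vec @ true_vec))               # pull the true pair together
    neg = -np.log(sigmoid(-(noise_vecs @ input_vec))).sum()    # push the noise pairs apart
    return pos + neg
# e.g. sketch_nce_loss(np.random.randn(64), np.random.randn(64), np.random.randn(5, 64))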
In [15]:
def skipgram_batch(data, batch_size, num_skips, skip_window):
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    context_length = skip_window*2+1
    X = np.ndarray(shape=batch_size, dtype=np.int32)
    Y = np.ndarray(shape=batch_size, dtype=np.int32)
    idx = 0
    while True:
        for i in range(0, batch_size, num_skips):
            X[i:i+num_skips] = data[idx+skip_window]
            context = data[idx:idx+context_length][np.arange(context_length) != skip_window]
            # subsampling probability
            #p = np.ones(2*skip_window)/2/skip_window
            Y[i:i+num_skips] = np.random.choice(context, size=num_skips, replace=False)
            idx = (idx+1) % (len(data)-context_length)
        yield X[:, None], Y
# quick test
X, Y = next(skipgram_batch(data, 20, 4, 3))
for x, y in zip(X, Y):
    print("{} -> {}".format(num2word[x[0]], num2word[y]))
In [16]:
import time
t0 = time.time()
batch_gen = skipgram_batch(data, batch_size=128, num_skips=4, skip_window=3)
with tf.Session() as sess:
    sess.run(init)
    average_loss = 0
    for step in range(0, 200001):
        X, Y = next(batch_gen)
        feed_dict = {train_input: X, train_labels: Y[:, None]}
        _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step > 0 and step % 10000 == 0:
            # average over the 10000 steps since the last report
            print(step, "average loss", average_loss/10000, time.time()-t0)
            average_loss = 0
        if step % 50000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = num2word[valid_examples[i]]
                nearest = (-sim[i, :]).argsort()[1:8 + 1]
                print(valid_word, [num2word[x] for x in nearest])
    final_embeddings = normalized_embeddings.eval()
In [17]:
def find_sim(v, num=10):
    if isinstance(v, str):
        v = w2v(v)
    return [num2word[x] for x in (final_embeddings @ v).argsort()[-num-1:-1][::-1]]
def w2v(w):
    return final_embeddings[word2num.get(w, 0)]
find_sim(w2v('黃藥師'))
Out[17]:
In [18]:
find_sim('吃')
Out[18]:
In [19]:
find_sim(w2v('西毒')+w2v('洪七公')-w2v('北丐'))
Out[19]:
In [20]:
find_sim( w2v('郭靖')+(w2v('穆念慈')-w2v('楊康')) )
Out[20]:
In [32]:
find_sim( w2v('洪七公')+(w2v('蛤蟆功')-w2v('歐陽鋒')) )
Out[32]:
In [30]:
find_sim(w2v('吃')- w2v("飯")+w2v("酒"))
Out[30]:
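The analogy queries above all follow the same pattern: vector arithmetic of the form w2v(b) - w2v(a) + w2v(c), then nearest neighbors. A small hypothetical wrapper (using the w2v and find_sim defined earlier) makes the pattern explicit:
# Hypothetical helper for the "a is to b as c is to ?" pattern used above
def analogy(a, b, c, num=10):
    return find_sim(w2v(b) - w2v(a) + w2v(c), num=num)
# e.g. analogy('北丐', '洪七公', '西毒') mirrors In [19] above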
In [23]:
# Reduce dimensionality with t-SNE
from sklearn.manifold import TSNE
samples = 500
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
low_dim_embs = tsne.fit_transform(final_embeddings[:samples])
labels = num2word[:samples]  # label i corresponds to row i of low_dim_embs
In [24]:
# Taken from 00Download.ipynb
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import os.path
from urllib.request import urlopen
def download_req(req, filename):
    total = int(req.getheader("Content-Length"))
    with open(filename, 'wb') as f:
        i = 0
        for data in iter(lambda: req.read(8192), b""):
            i += 1
            f.write(data)
            print("\rdownloading: %5.1f%%" % (i*8192*100.0/total), end="")
# Download the font
font_filename = 'NotoSansCJKtc-hinted.zip'
font_url = "https://noto-website-2.storage.googleapis.com/pkgs/"+font_filename
# change this line for the download to actually happen
if not (os.path.isfile(font_filename) and os.path.getsize(font_filename) == 121247052):
    with urlopen(font_url) as req:
        download_req(req, "NotoSansCJKtc-hinted.zip")
# Extract Font files
import zipfile
with zipfile.ZipFile(font_filename) as zf:
    for f in zf.namelist():
        if f.endswith('.otf'):
            print("extract", f)
            zf.extract(f)
fp = matplotlib.font_manager.FontProperties(fname='NotoSansCJKtc-Regular.otf')
matplotlib.font_manager.fontManager.ttffiles.append(fp.get_file())
font_entry = matplotlib.font_manager.FontEntry(fp.get_file(), name=fp.get_name(),
                                               style=fp.get_style(), variant=fp.get_variant(),
                                               weight=fp.get_weight(), stretch=fp.get_stretch(),
                                               size=fp.get_size())
matplotlib.font_manager.fontManager.ttflist.append(font_entry)
plt.rcParams['font.family'] = fp.get_name()
In [25]:
# Plot the result
plt.figure(figsize=(20, 20))
plt.scatter(low_dim_embs[:, 0], low_dim_embs[:, 1])
for i, label in enumerate(labels):
    x, y = low_dim_embs[i]
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 fontsize=14,
                 ha='right',
                 va='bottom')
In [ ]: