In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
In [2]:
# Set environment variables to control the keras and theano backends
import os
os.environ['KERAS_BACKEND']="tensorflow"
os.environ['THEANO_FLAGS']="floatX=float32, device=cuda"
In [3]:
import jieba
jieba.set_dictionary('dict.txt.big')
" - ".join(jieba.cut("今天天氣很好"))
Out[3]:
In [4]:
if not os.path.isfile("sdyxz_all.txt"):
    with open("sdyxz_all.txt", "w") as outf:
        for i in range(1, 41):
            # Each chapter lives at .../001.htm through .../040.htm
            r = urlopen('http://www.millionbook.net/wx/j/jingyong/sdyxz/%03d.htm' % i)
            html = r.read().decode('cp950', "ignore")
            bs = BeautifulSoup(html, 'lxml')
            # The chapter text sits in the 7th <td> of the page
            text = bs.findAll('td')[6].get_text()
            if len(text) < 100:
                print("error")
                break
            print(i, len(text))
            outf.write(text)
In [5]:
# Stop words to ignore
ignore_words = set("""the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
的
了
和
是
就
都
而
及
與
著
或
一個
沒有
你
我
他
她
我們
你們
妳們
他們
她們
是否""".split("\n"))|set(",。*「」:?\n\u3000!、…』『《》-")
# Custom dictionary: make sure jieba keeps these character names as single tokens
jieba.add_word("黃蓉")
jieba.suggest_freq("黃蓉", True)
jieba.add_word("郭靖")
jieba.suggest_freq("郭靖", True)
with open("sdyxz_all.txt", "r") as f:
    words = [w for w in jieba.cut(f.read()) if w not in ignore_words]
print("len=", len(words))
print(words[:100])
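To confirm the custom dictionary took effect, a quick illustrative check (the sample sentence is made up):
In [ ]:
# The two names should now come out as single tokens
" / ".join(jieba.cut("郭靖和黃蓉一起吃飯"))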
In [6]:
# How many distinct tokens are there?
len(set(words))
Out[6]:
We only keep commonly used words; rare words (those appearing fewer than 3 times) are split into individual characters rather than replaced with UNK.
In [7]:
import collections
# Count token frequencies first
counter = collections.Counter(words)
# You can take a look at the contents of counter
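For example, an illustrative peek (these particular lookups are just examples):
In [ ]:
# The five most frequent tokens, and the count of one specific word
counter.most_common(5), counter["郭靖"]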
In [8]:
# Split words that appear fewer than 3 times into individual characters
def sep_words(words):
    for w in words:
        if counter[w] < 3:
            for c in w:
                yield c
        else:
            yield w
words = list(sep_words(words))
len(words)
Out[8]:
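A minimal, self-contained sketch of what sep_words does (the rare token below is hypothetical):
In [ ]:
# A token seen fewer than 3 times is emitted character by character;
# a common token passes through unchanged.
demo_counter = collections.Counter({"郭靖": 100, "罕見詞": 1})
def demo_sep(ws, cnt):
    for w in ws:
        if cnt[w] < 3:
            yield from w
        else:
            yield w
list(demo_sep(["郭靖", "罕見詞"], demo_counter))  # ['郭靖', '罕', '見', '詞']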
In [9]:
counter = collections.Counter(words)
In [10]:
# The 30 most common tokens
counter.most_common(30)
Out[10]:
In [11]:
words
Out[11]:
In [12]:
from gensim.models import word2vec
import logging
In [23]:
# gensim's Word2Vec expects an iterable of token lists; the corpus is one long
# stream, so chop it into pseudo-sentences of 1000 tokens each
sentences = [words[i:i+1000] for i in range(0, len(words), 1000)]
In [53]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Skip-gram (sg=1), 100-dimensional vectors, window of 8, 25 epochs, 5 negative samples
model = word2vec.Word2Vec(sentences, sg=1, size=100, min_count=1, window=8, iter=25, negative=5)
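Note that size and iter are gensim 3.x parameter names; if you run gensim >= 4 (an assumption about your environment), the equivalent call is:
In [ ]:
# Equivalent call on gensim >= 4.0, where size/iter were renamed
model = word2vec.Word2Vec(sentences, sg=1, vector_size=100, min_count=1,
                          window=8, epochs=25, negative=5)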
In [55]:
# Nearest neighbours of 吃 (to eat)
model.wv.most_similar('吃')
Out[55]:
In [56]:
# Analogy: 北丐 is to 洪七公 as 西毒 is to ? (expect 歐陽鋒)
model.wv.most_similar(positive=['西毒','洪七公'], negative=['北丐'])
Out[56]:
In [66]:
# Analogy: 楊康 is to 穆念慈 as 郭靖 is to ? (expect 黃蓉)
model.wv.most_similar_cosmul(positive=['郭靖','穆念慈'], negative=['楊康'])
Out[66]:
In [65]:
# Analogy: 歐陽鋒 is to 蛤蟆功 as 洪七公 is to ? (expect 降龍十八掌)
model.wv.most_similar_cosmul(positive=['洪七公', '蛤蟆功'], negative=['歐陽鋒'])
Out[65]:
In [62]:
# Analogy: 飯 is to 吃 as 酒 is to ? (expect 喝)
model.wv.most_similar(positive=['吃', "酒"], negative=["飯"])
Out[62]:
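For reference, most_similar scores analogies additively (3CosAdd), while most_similar_cosmul uses the multiplicative 3CosMul objective of Levy and Goldberg (2014), which tends to be more robust when one similarity term dominates.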
In [71]:
# Collect the vectors of the 500 most frequent tokens
samples = 500
labels = [w[0] for w in counter.most_common(samples)]
vec = np.array([model.wv[w] for w in labels])
In [73]:
# Reduce the vectors to 2-D with t-SNE
from sklearn.manifold import TSNE
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
low_dim_embs = tsne.fit_transform(vec)
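On recent scikit-learn releases (an assumption about the installed version), n_iter has been renamed max_iter; the equivalent call would be:
In [ ]:
# Same t-SNE settings with the renamed parameter on recent scikit-learn
tsne = TSNE(perplexity=30, n_components=2, init='pca', max_iter=5000)
low_dim_embs = tsne.fit_transform(vec)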
In [74]:
# Taken from 00Download.ipynb
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import os.path
from urllib.request import urlopen
def download_req(req, filename):
    total = int(req.getheader("Content-Length"))
    with open(filename, 'wb') as f:
        i = 0
        for data in iter(lambda: req.read(8192), b""):
            i += 1
            f.write(data)
            print("\rdownloading: %5.1f%%" % (i*8192*100.0/total), end="")
# Download the font
font_filename = 'NotoSansCJKtc-hinted.zip'
font_url = "https://noto-website-2.storage.googleapis.com/pkgs/"+font_filename
# Change this condition to force a re-download
if not (os.path.isfile(font_filename) and os.path.getsize(font_filename) == 121247052):
    with urlopen(font_url) as req:
        download_req(req, font_filename)
# Extract Font files
import zipfile
with zipfile.ZipFile(font_filename) as zf:
    for f in zf.namelist():
        if f.endswith('.otf'):
            print("extract", f)
            zf.extract(f)
# Register the extracted font with matplotlib and make it the default family
fp = matplotlib.font_manager.FontProperties(fname='NotoSansCJKtc-Regular.otf')
matplotlib.font_manager.fontManager.ttffiles.append(fp.get_file())
font_entry = matplotlib.font_manager.FontEntry(fp.get_file(), name=fp.get_name(),
                                               style=fp.get_style(), variant=fp.get_variant(),
                                               weight=fp.get_weight(), stretch=fp.get_stretch(),
                                               size=fp.get_size())
matplotlib.font_manager.fontManager.ttflist.append(font_entry)
plt.rcParams['font.family'] = fp.get_name()
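On matplotlib >= 3.2 (an assumption about the installed version), the manual FontEntry registration above can be replaced by a single call:
In [ ]:
# Simpler font registration on newer matplotlib
matplotlib.font_manager.fontManager.addfont('NotoSansCJKtc-Regular.otf')
plt.rcParams['font.family'] = 'Noto Sans CJK TC'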
In [75]:
# Plot the 500 most common tokens in the 2-D embedding
plt.figure(figsize=(20, 20))
plt.scatter(low_dim_embs[:, 0], low_dim_embs[:, 1])
for i, label in enumerate(labels):
    x, y = low_dim_embs[i]
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 fontsize=14,
                 ha='right',
                 va='bottom')
In [ ]: