In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np

In [2]:
# Configure Keras / Theano through environment variables.
# NOTE(review): neither keras nor theano is imported in the visible cells.
import os
os.environ.update(
    KERAS_BACKEND="tensorflow",
    THEANO_FLAGS="floatX=float32, device=cuda",
)

用 jieba 斷詞


In [3]:
# Point jieba at the dictionary file dict.txt.big (a relative path, so it is
# looked up from the working directory), then segment a short sample sentence
# and show the tokens joined by " - ".
import jieba
jieba.set_dictionary('dict.txt.big')
" - ".join(jieba.cut("今天天氣很好"))


Building prefix dict from /home/tjw/src/HackNTU_Data_2017/Week08/dict.txt.big ...
Loading model from cache /tmp/jieba.uabe345d172aa2c353a2c9ed1c2ff896a.cache
Loading model cost 0.986 seconds.
Prefix dict has been built succesfully.
Out[3]:
'今天天氣 - 很 - 好'

下載資料


In [4]:
# Download the 40 chapter pages once and cache their text in sdyxz_all.txt.
# The cache file is written only after every chapter has been fetched, so a
# failed or truncated run can no longer leave a partial file behind that the
# isfile() guard would treat as complete on the next run.
# NOTE: the file is written with the platform default encoding, the same way
# the tokenization cell reads it back.
if not os.path.isfile("sdyxz_all.txt"):
    chapters = []
    for i in range(1, 41):
        # Close each HTTP response as soon as its body has been read.
        with urlopen('http://www.millionbook.net/wx/j/jingyong/sdyxz/%03d.htm'%i) as r:
            html = r.read().decode('cp950', "ignore")
        bs = BeautifulSoup(html, 'lxml')
        # The text of the 7th <td> element is taken as the chapter body.
        text = bs.findAll('td')[6].get_text()
        if len(text)<100:
            # Suspiciously short page: abort without writing the cache.
            print("error")
            break
        print (i, len(text))
        chapters.append(text)
    else:
        # Reached only when the loop finished without `break`.
        with open("sdyxz_all.txt","w") as outf:
            outf.write("".join(chapters))

In [5]:
# Tokens to discard from the corpus: one stop word per line of the literal
# below, united with every individual character of the punctuation/whitespace
# string at the end.
# Blank lines inside the literal only contribute the empty string "" to the set.
# NOTE(review): confirm that no stop-word entries were lost where the literal
# contains runs of blank lines.
ignore_words = set("""the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
that











一個
沒有




我們
你們
妳們
他們
她們
是否""".split("\n"))|set(",。*「」:?\n\u3000!、…』『《》-")

# Custom dictionary: register the two character names with jieba and tune
# their frequency so each is segmented as one token.
for name in ("黃蓉", "郭靖"):
    jieba.add_word(name)
    jieba.suggest_freq(name, True)

# Segment the whole cached text and drop every token found in ignore_words.
with open("sdyxz_all.txt", "r") as corpus_file:
    raw_text = corpus_file.read()
words = list(filter(lambda token: token not in ignore_words, jieba.cut(raw_text)))
print("len=", len(words))
print(words[:100])


len= 384051
['第一回', '風雪', '驚變', '錢塘江', '浩浩', '江水', '日日夜夜', '無窮', '無休', '從', '臨安', '牛家村', '邊', '繞過', '東流', '入海', '江畔', '一排', '數十株', '烏', '柏樹', '葉子', '似', '火燒', '般紅', '正是', '八月', '天時', '村前村後', '野草', '剛', '起始', '變黃', '一抹', '斜陽', '映照', '之下', '更增', '幾分', '蕭索', '兩株', '大', '松樹', '下圍', '一堆', '村民', '男男女女', '十幾個', '小孩', '正自', '聚精會神', '聽', '瘦削', '老者', '說話', '那', '說話', '人五', '十來', '歲', '年紀', '一件', '青布', '長袍', '早洗', '得', '褪成', '藍灰色', '只', '聽', '兩片', '梨花', '木板', '碰', '幾下', '左手', '中', '竹棒', '在', '一面', '小', '羯鼓', '上', '敲起', '得', '得', '連聲', '唱道', '小桃', '無主自', '開花', '煙草', '茫茫', '帶', '晚鴉', '幾處', '敗垣圍', '故井', '向來', '一一']

先處理掉少用字


In [6]:
# How many distinct tokens are there in total?
len(set(words))


Out[6]:
47134

這裡不採用「只保留最常用的 10000 字、其他字用 UNK 取代」的做法,而是把出現次數少於 3 次的詞拆成單字


In [7]:
import collections
# First count how often each token occurs
counter = collections.Counter(words)
# Inspect `counter` here to look at its contents

In [8]:
def sep_words(words, counts=None, min_count=3):
    """Yield tokens from ``words``, breaking rare tokens into single characters.

    A token whose frequency is below ``min_count`` is replaced by its
    individual characters; every other token is yielded unchanged.

    Args:
        words: Iterable of string tokens.
        counts: Counter-like mapping from token to frequency, indexed as
            ``counts[token]``. Defaults to the module-level ``counter``, which
            keeps the original one-argument call working.
        min_count: Minimum frequency a token needs in order to be kept whole.
            Defaults to 3, the threshold that used to be hard-coded.

    Yields:
        str: Kept tokens and the characters of split tokens, in input order.
    """
    if counts is None:
        # Backward-compatible default: fall back to the notebook's global Counter.
        counts = counter
    for w in words:
        if counts[w] < min_count:
            # Rare token: emit it character by character instead.
            yield from w
        else:
            yield w
# Rebind `words` to the token list produced by sep_words, then show its new length.
words = list(sep_words(words))
len(words)


Out[8]:
434741

In [9]:
# Recount frequencies now that `words` has been rebuilt by sep_words.
counter = collections.Counter(words)

In [10]:
# The 30 most frequent tokens
counter.most_common(30)


Out[10]:
[('道', 7119),
 ('在', 5042),
 ('郭靖', 3364),
 ('也', 3086),
 ('不', 3050),
 ('得', 2640),
 ('又', 2622),
 ('這', 2561),
 ('那', 2524),
 ('黃蓉', 2493),
 ('去', 2225),
 ('上', 2098),
 ('一', 2001),
 ('人', 1916),
 ('中', 1867),
 ('說', 1857),
 ('卻', 1777),
 ('有', 1728),
 ('來', 1697),
 ('已', 1682),
 ('到', 1678),
 ('見', 1559),
 ('聽', 1548),
 ('但', 1504),
 ('要', 1452),
 ('叫', 1320),
 ('向', 1300),
 ('大', 1234),
 ('之', 1225),
 ('好', 1187)]

In [11]:
# Display the processed token list (the saved output below is cut off with '...').
words


Out[11]:
['第',
 '一',
 '回',
 '風雪',
 '驚',
 '變',
 '錢塘江',
 '浩',
 '浩',
 '江水',
 '日日夜夜',
 '無窮',
 '無',
 '休',
 '從',
 '臨安',
 '牛家村',
 '邊',
 '繞過',
 '東',
 '流',
 '入海',
 '江',
 '畔',
 '一排',
 '數',
 '十',
 '株',
 '烏',
 '柏樹',
 '葉',
 '子',
 '似',
 '火燒',
 '般',
 '紅',
 '正是',
 '八月',
 '天時',
 '村',
 '前',
 '村',
 '後',
 '野',
 '草',
 '剛',
 '起始',
 '變',
 '黃',
 '一抹',
 '斜',
 '陽',
 '映照',
 '之下',
 '更增',
 '幾分',
 '蕭',
 '索',
 '兩株',
 '大',
 '松樹',
 '下',
 '圍',
 '一堆',
 '村民',
 '男',
 '男',
 '女',
 '女',
 '十幾個',
 '小孩',
 '正自',
 '聚精會神',
 '聽',
 '瘦削',
 '老者',
 '說話',
 '那',
 '說話',
 '人',
 '五',
 '十來',
 '歲',
 '年紀',
 '一件',
 '青布',
 '長袍',
 '早',
 '洗',
 '得',
 '褪',
 '成',
 '藍',
 '灰',
 '色',
 '只',
 '聽',
 '兩',
 '片',
 '梨',
 '花',
 '木板',
 '碰',
 '幾下',
 '左手',
 '中',
 '竹棒',
 '在',
 '一面',
 '小',
 '羯',
 '鼓',
 '上',
 '敲',
 '起',
 '得',
 '得',
 '連聲',
 '唱道',
 '小',
 '桃',
 '無',
 '主',
 '自',
 '開',
 '花',
 '煙',
 '草',
 '茫茫',
 '帶',
 '晚',
 '鴉',
 '幾',
 '處',
 '敗',
 '垣',
 '圍',
 '故',
 '井',
 '向來',
 '一一',
 '人家',
 '那',
 '說話',
 '人',
 '將',
 '木板',
 '敲',
 '幾下',
 '說道',
 '這首',
 '七',
 '言',
 '詩',
 '說',
 '兵',
 '火',
 '過後',
 '原來',
 '家',
 '家',
 '戶',
 '戶',
 '變成',
 '斷',
 '牆',
 '殘',
 '瓦',
 '破敗',
 '之地',
 '小人',
 '剛才',
 '說',
 '到',
 '那',
 '葉',
 '老',
 '漢',
 '一家',
 '四',
 '口',
 '悲',
 '歡',
 '離',
 '合',
 '聚',
 '又',
 '散',
 '散',
 '又',
 '聚',
 '四',
 '人',
 '給',
 '金兵',
 '衝',
 '散',
 '好容易',
 '又',
 '再',
 '團聚',
 '歡天喜地',
 '回到',
 '故鄉',
 '卻',
 '見',
 '房屋',
 '已給',
 '金兵',
 '燒',
 '得',
 '乾乾淨淨',
 '無可奈何',
 '只得',
 '去',
 '到',
 '汴梁',
 '想',
 '覓',
 '個',
 '生',
 '計',
 '不',
 '料想',
 '天',
 '有',
 '不',
 '測',
 '風',
 '雲',
 '人',
 '有',
 '旦',
 '夕',
 '禍',
 '福',
 '他',
 '四',
 '人',
 '剛',
 '進',
 '汴梁',
 '城',
 '迎面',
 '便',
 '過來',
 '一隊',
 '金兵',
 '帶兵',
 '頭兒',
 '一雙',
 '三',
 '角',
 '眼',
 '覷',
 '將',
 '過去',
 '見',
 '那',
 '葉三姐',
 '生',
 '得',
 '美貌',
 '跳',
 '下',
 '馬來',
 '當即',
 '一把',
 '抱住',
 '哈哈大笑',
 '便將',
 '放',
 '上',
 '馬鞍',
 '說道',
 '小姑娘',
 '跟',
 '回家',
 '服侍',
 '老爺',
 '那',
 '葉三姐',
 '如何',
 '肯',
 '從',
 '拚命',
 '掙扎',
 '那金兵',
 '長官',
 '喝道',
 '不肯',
 '從',
 '便',
 '殺',
 '父母',
 '兄弟',
 '提起',
 '狼牙棒',
 '一',
 '棒打',
 '在',
 '那',
 '葉',
 '三',
 '郎',
 '頭上',
 '登時',
 '腦漿',
 '迸裂',
 '一',
 '命',
 '鳴',
 '呼',
 '正是',
 '陰世',
 '新',
 '添',
 '枉',
 '死',
 '鬼',
 '陽',
 '間',
 '不見',
 '少年人',
 '葉',
 '老',
 '漢',
 '媽媽',
 '嚇得',
 '呆',
 '撲將',
 '上去',
 '摟住',
 '兒子',
 '死',
 '屍',
 '放聲大哭',
 '那',
 '長官',
 '提起',
 '狼牙棒',
 '一棒',
 '又',
 '了',
 '帳',
 '那',
 '葉三姐',
 '卻',
 '不',
 '啼哭',
 '說道',
 '長官',
 '休',
 '得',
 '兇惡',
 '跟',
 '回家',
 '便',
 '那',
 '長官',
 '大喜',
 '將',
 '葉三姐',
 '帶',
 '得',
 '回家',
 '不料',
 '葉三姐',
 '覷',
 '不',
 '防',
 '突然',
 '搶步',
 '過去',
 '拔出',
 '那',
 '長官',
 '腰刀',
 '對準',
 '心口',
 '一刀',
 '刺',
 '將',
 '過去',
 '說',
 '時',
 '遲',
 '那',
 '時',
 '快',
 '這',
 '一刀',
 '刺去',
 '眼見',
 '便',
 '可',
 '報',
 '得',
 '父母',
 '兄弟',
 '大仇',
 '不料',
 '那',
 '長官',
 '久經',
 '戰陣',
 '武藝',
 '精熟',
 '順手',
 '一推',
 '葉三姐',
 '登時',
 '摔',
 '出去',
 '那',
 '長官',
 '剛',
 '罵',
 '得',
 '一聲',
 '小',
 '賤人',
 '葉三姐',
 '已',
 '舉起',
 '鋼刀',
 '在',
 '脖子',
 '中',
 '一',
 '勒',
 '可憐',
 '花',
 '容',
 '月',
 '貌',
 '無雙',
 '女',
 '惆',
 '悵',
 '芳',
 '魂',
 '赴',
 '九',
 '泉',
 '說',
 '一段',
 '唱',
 '一段',
 '只',
 '聽',
 '得',
 '眾',
 '村民',
 '無不',
 '咬牙切齒',
 '憤怒',
 '歎',
 '息',
 '那人',
 '又',
 '道',
 '眾位',
 '看',
 '官',
 '常言道',
 '得',
 '好',
 '為',
 '人',
 '切',
 '莫',
 '用',
 '欺',
 '心',
 '舉頭',
 '三尺',
 '有',
 '神',
 '明',
 '若',
 '還',
 '作惡',
 '無',
 '報應',
 '天下',
 '兇',
 '徒',
 '人',
 '吃',
 '人',
 '可是',
 '那金兵',
 '佔',
 '大宋',
 '天下',
 '殺',
 '人',
 '放',
 '火',
 '姦淫擄掠',
 '無惡不作',
 '卻',
 '又',
 '不見',
 '遭到',
 '什麼',
 '報應',
 '只怪',
 '大宋',
 '官家',
 '不爭氣',
 '中國',
 '本來',
 '兵',
 '多',
 '將',
 '廣',
 '可是',
 '一',
 '見到',
 '金兵',
 '到來',
 '便',
 '遠遠',
 '逃',
 '之',
 '夭',
 '夭',
 '只',
 '剩下',
 '老百姓',
 '遭殃',
 '好似',
 '那',
 '葉三姐',
 '一家',
 '慘',
 '禍',
 '江北',
 '之地',
 '實',
 '成千成萬',
 '便',
 '如',
 '家',
 '常',
 '便',
 '飯',
 '一般',
 '諸',
 '君',
 '住',
 '在',
 '江南',
 '當真',
 '在',
 '天堂',
 '裡',
 '了',
 '怕只怕',
 '金兵',
 '何日',
 '到來',
 '正是',
 '寧',
 '作',
 '太平',
 '犬',
 '莫',
 '為',
 '亂',
 '世',
 '人',
 '小人',
 '張十五',
 '今日',
 '路經',
 '貴',
 '地',
 '服侍',
 '眾位',
 '看',
 '官',
 '這',
 '一段',
 '說話',
 '叫作',
 '葉三姐',
 '節烈',
 '記',
 '話',
 '本',
 '說',
 '徹',
 '權',
 '作',
 '散',
 '場',
 '將',
 '兩',
 '片',
 '梨',
 '花',
 '木板',
 '拍拍',
 '拍',
 '亂',
 '敲',
 '一陣',
 '托',
 '出',
 '一隻',
 '盤子',
 '眾',
 '村民',
 '便',
 '有人',
 '拿出',
 '兩',
 '文',
 '三',
 '文',
 '放入',
 '木盤',
 '霎時間',
 '得',
 '六',
 '七',
 '十',
 '文',
 '張十五',
 '謝',
 '將',
 '銅錢',
 '放入',
 '囊中',
 '便',
 '欲',
 '起',
 '行',
 '村民',
 '中',
 '走出',
 '一',
 '個',
 '二',
 '十',
 '來',
 '歲',
 '大漢',
 '說道',
 '張',
 '先生',
 '可',
 '是從',
 '北方',
 '來',
 '嗎',
 '張十五',
 '見',
 '身材',
 '魁梧',
 '濃眉大眼',
 '便',
 '道',
 '正是',
 '那',
 '大漢',
 '道',
 '小弟',
 '作東',
 '請',
 '先生',
 '去',
 '飲',
 '上',
 '三杯',
 '如何',
 '張十五',
 '大喜',
 '說道',
 '素不相識',
 '怎敢',
 '叨擾',
 '那',
 '大漢',
 '笑',
 '道',
 '喝',
 '上',
 '三杯',
 '那',
 '便',
 '相識',
 '我姓',
 '郭',
 '名叫',
 '郭嘯天',
 '指著',
 '身旁',
 '白淨',
 '面皮',
 '漢子',
 '道',
 '這位',
 '楊鐵心',
 '楊兄弟',
 '適才',
 '二人',
 '聽',
 '先生',
 '說',
 '唱',
 '葉三姐',
 '節烈',
 '記',
 '果然',
 '說得好',
 '卻',
 '有',
 '幾句話',
 '想要',
 '請問',
 '張十五',
 '道',
 '好',
 '說',
 '好',
 '說',
 '今日',
 '得遇',
 '郭楊',
 '二位',
 '也',
 '有緣',
 '郭嘯天',
 '帶著',
 '張十五',
 '來到',
 '村頭',
 '一家',
 '小',
 '酒店',
 '中',
 '在',
 '張',
 '飯',
 '桌',
 '旁',
 '坐',
 '小',
 '酒店',
 '主人',
 '個',
 '跛子',
 '撐',
 '兩根',
 '枴杖',
 '慢慢',
 '燙',
 '兩',
 '壺',
 '黃',
 '酒',
 '擺',
 '出',
 '一碟',
 '蠶豆',
 '一碟',
 '鹹',
 '花生',
 '一碟',
 '豆',
 '腐',
 '乾',
 '另有',
 '三個',
 '切',
 '開',
 '鹹蛋',
 '自行',
 '在',
 '門口',
 '板凳',
 '上',
 '坐',
 '抬頭',
 '瞧',
 '天邊',
 '正要',
 '落',
 '山',
 '太陽',
 '卻',
 '不',
 '更',
 '向',
 '三',
 '人',
 '望',
 '上',
 '一眼',
 '郭嘯天',
 '斟',
 '酒',
 '勸',
 '張十五',
 '喝',
 '兩杯',
 '說道',
 '鄉下',
 '地方',
 '只',
 '初二',
 '十六',
 '方有',
 '肉',
 '賣',
 '沒',
 '下酒',
 '之物',
 '先生',
 '莫怪',
 '張十五',
 '道',
 '有',
 '酒',
 '便',
 '好',
 '聽',
 '兩位',
 '口音',
 '遮',
 '莫',
 '也',
 '北方',
 '人',
 '楊鐵心',
 '道',
 '兩',
 '兄弟',
 '原',
 '山東',
 '人氏',
 '只',
 '因',
 '受不了',
 '金狗',
 '骯髒',
 '氣',
 '三年',
 '前來',
 '到',
 '此間',
 '愛',
 '這',
 '裡',
 '人情',
 '厚',
 '便',
 '住',
 '下來',
 '剛才',
 '聽',
 '得',
 '先生',
 '說道',
 '住',
 '在',
 '江南',
 '猶似',
 '在',
 '天堂',
 '裡',
 '一般',
 '怕只怕',
 '金兵',
 '何日',
 '到來',
 '說',
 '金兵',
 '會',
 '不會',
 '打過',
 '江',
 '來',
 '張十五',
 '歎',
 '道',
 '江南',
 '花花世界',
 '遍地',
 '皆',
 '金銀',
 '放眼',
 '但',
 '見',
 '美女',
 '金兵',
 '又',
 '有',
 '哪',
 '一日',
 '下',
 '想',
 '過來',
 '只是',
 '他來',
 '不來',
 '拿',
 '主',
 '意',
 '卻',
 '不是',
 '金國',
 '而是',
 '臨安',
 '大',
 '宋',
 '朝',
 '廷',
 '郭嘯天',
 '楊鐵心',
 '齊感',
 '詫異',
 '同聲',
 '問道',
 '這',
 '卻是',
 '怎生',
 '說',
 '張十五',
 '道',
 '中國',
 '百姓',
 '比',
 '女',
 '真',
 '人',
 '多',
 '上',
 '一',
 '百',
 '倍',
 '也還',
 '不止',
 '只要',
 '朝廷',
 '肯',
 '用',
 '忠臣',
 '良將',
 '咱們',
 '一百個',
 '打',
 '金兵',
 '如何',
 '能夠',
 '抵擋',
 '我大宋',
 '北方',
 '這',
 '半壁江山',
 '當年',
 '徽宗',
 '欽宗',
 '高宗',
 '父子',
 '三人',
 '奉',
 '送給',
 '金人',
 '這',
 '三個',
 '皇帝',
 '任',
 '用',
 '奸臣',
 '欺',
 '壓',
 '百',
 '姓',
 '把',
 '出力',
 '抵抗',
 '金兵',
 '大將',
 '罷免',
 '罷免',
 '殺頭',
 '殺頭',
 '花',
 '花',
 '江山',
 '雙手',
 '送',
 '將',
 '過去',
 '金',
 ...]

In [12]:
from gensim.models import word2vec
import logging

In [23]:
# Cut the token stream into consecutive pseudo-sentences of at most 1000 tokens each.
chunk_starts = range(0, len(words), 1000)
sentences = [words[start:start+1000] for start in chunk_starts]

In [53]:
# Send gensim's INFO-level progress messages to the cell output.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')
# Training options, passed through to Word2Vec unchanged.
w2v_params = dict(
    sg=1,          # skip-gram training
    size=100,      # 100-dimensional vectors
    min_count=1,   # keep every token, however rare
    window=8,
    iter=25,       # 25 passes over the corpus
    negative=5,    # negative sampling with 5 noise words
)
model = word2vec.Word2Vec(sentences, **w2v_params)


2017-05-09 16:43:43,265 : INFO : collecting all words and their counts
2017-05-09 16:43:43,265 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-09 16:43:43,336 : INFO : collected 18172 word types from a corpus of 434741 raw words and 435 sentences
2017-05-09 16:43:43,337 : INFO : Loading a fresh vocabulary
2017-05-09 16:43:43,423 : INFO : min_count=1 retains 18172 unique words (100% of original 18172, drops 0)
2017-05-09 16:43:43,424 : INFO : min_count=1 leaves 434741 word corpus (100% of original 434741, drops 0)
2017-05-09 16:43:43,480 : INFO : deleting the raw counts dictionary of 18172 items
2017-05-09 16:43:43,481 : INFO : sample=0.001 downsamples 31 most-common words
2017-05-09 16:43:43,482 : INFO : downsampling leaves estimated 409275 word corpus (94.1% of prior 434741)
2017-05-09 16:43:43,482 : INFO : estimated required memory for 18172 words and 100 dimensions: 23623600 bytes
2017-05-09 16:43:43,553 : INFO : resetting layer weights
2017-05-09 16:43:43,706 : INFO : training model with 3 workers on 18172 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=8
2017-05-09 16:43:44,731 : INFO : PROGRESS: at 3.03% examples, 303427 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:43:45,749 : INFO : PROGRESS: at 6.16% examples, 308874 words/s, in_qsize 4, out_qsize 1
2017-05-09 16:43:46,790 : INFO : PROGRESS: at 9.38% examples, 311434 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:43:47,831 : INFO : PROGRESS: at 12.60% examples, 312657 words/s, in_qsize 6, out_qsize 0
2017-05-09 16:43:48,894 : INFO : PROGRESS: at 15.91% examples, 313832 words/s, in_qsize 6, out_qsize 0
2017-05-09 16:43:49,928 : INFO : PROGRESS: at 18.67% examples, 307052 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:43:50,936 : INFO : PROGRESS: at 21.61% examples, 305918 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:43:51,940 : INFO : PROGRESS: at 24.74% examples, 307450 words/s, in_qsize 6, out_qsize 0
2017-05-09 16:43:52,944 : INFO : PROGRESS: at 27.86% examples, 308624 words/s, in_qsize 6, out_qsize 0
2017-05-09 16:43:53,952 : INFO : PROGRESS: at 30.99% examples, 309515 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:43:54,953 : INFO : PROGRESS: at 34.11% examples, 310403 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:43:55,963 : INFO : PROGRESS: at 37.33% examples, 311704 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:43:57,010 : INFO : PROGRESS: at 40.46% examples, 311199 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:43:58,037 : INFO : PROGRESS: at 43.68% examples, 311875 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:43:59,051 : INFO : PROGRESS: at 46.71% examples, 311494 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:00,088 : INFO : PROGRESS: at 49.93% examples, 311872 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:01,131 : INFO : PROGRESS: at 52.78% examples, 309949 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:02,135 : INFO : PROGRESS: at 55.82% examples, 309914 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:03,147 : INFO : PROGRESS: at 58.94% examples, 310242 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:04,148 : INFO : PROGRESS: at 62.07% examples, 310698 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:05,152 : INFO : PROGRESS: at 65.20% examples, 311078 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:06,158 : INFO : PROGRESS: at 68.32% examples, 311374 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:07,166 : INFO : PROGRESS: at 71.45% examples, 311634 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:08,199 : INFO : PROGRESS: at 74.57% examples, 311559 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:09,260 : INFO : PROGRESS: at 77.89% examples, 311887 words/s, in_qsize 6, out_qsize 0
2017-05-09 16:44:10,262 : INFO : PROGRESS: at 81.01% examples, 312171 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:11,304 : INFO : PROGRESS: at 84.23% examples, 312315 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:12,329 : INFO : PROGRESS: at 87.36% examples, 312312 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:13,345 : INFO : PROGRESS: at 90.57% examples, 312716 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:14,349 : INFO : PROGRESS: at 93.70% examples, 312903 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:15,372 : INFO : PROGRESS: at 96.09% examples, 310518 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:16,374 : INFO : PROGRESS: at 98.85% examples, 309637 words/s, in_qsize 5, out_qsize 0
2017-05-09 16:44:16,864 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-05-09 16:44:16,866 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-05-09 16:44:16,901 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-05-09 16:44:16,902 : INFO : training on 10868525 raw words (10232314 effective words) took 33.2s, 308249 effective words/s

In [55]:
# Tokens whose vectors are most similar to that of '吃'.
model.wv.most_similar('吃')


Out[55]:
[('一驚', 0.7296633720397949),
 ('糲', 0.687698245048523),
 ('五珍', 0.6864144802093506),
 ('蘑', 0.677970826625824),
 ('煨', 0.6770339012145996),
 ('麵', 0.6668568253517151),
 ('膾', 0.6651573777198792),
 ('苦頭', 0.6593719720840454),
 ('殘菜', 0.6576591730117798),
 ('好吃', 0.6303290724754333)]

In [56]:
# Analogy query: '西毒' and '洪七公' contribute positively, '北丐' negatively.
model.wv.most_similar(positive=['西毒','洪七公'], negative=['北丐'])


Out[56]:
[('黃蓉', 0.5206567049026489),
 ('歐陽鋒', 0.49580344557762146),
 ('郭靖', 0.4456004202365875),
 ('沉肩', 0.43416541814804077),
 ('陰沉', 0.42809873819351196),
 ('果真如此', 0.4270661175251007),
 ('一線', 0.41873422265052795),
 ('叫化', 0.4184786379337311),
 ('黃蓉忙', 0.4172767996788025),
 ('七公', 0.4136349558830261)]

In [66]:
# Same kind of analogy query, scored with gensim's most_similar_cosmul variant:
# '郭靖' and '穆念慈' positive, '楊康' negative.
model.wv.most_similar_cosmul(positive=['郭靖','穆念慈'], negative = ['楊康'] )


Out[66]:
[('黃蓉', 0.9208547472953796),
 ('靖哥哥', 0.9063557386398315),
 ('柔聲道', 0.8894891738891602),
 ('廝守', 0.8872582316398621),
 ('永遠', 0.8833402991294861),
 ('蹙眉', 0.879569947719574),
 ('手道', 0.8783940076828003),
 ('礙事', 0.8731006383895874),
 ('拿藥', 0.8672891855239868),
 ('永不', 0.8671412467956543)]

In [65]:
# Analogy query (cosmul variant): '洪七公' and '蛤蟆功' positive, '歐陽鋒' negative.
model.wv.most_similar_cosmul(positive=['洪七公', '蛤蟆功'], negative=['歐陽鋒'])


Out[65]:
[('一陽指', 0.8603169322013855),
 ('降龍十八掌', 0.8508104085922241),
 ('空明拳', 0.8481013178825378),
 ('快又準', 0.8092823624610901),
 ('破去', 0.8084005117416382),
 ('第二招', 0.8070525527000427),
 ('青龍', 0.8029625415802002),
 ('算是', 0.8017399311065674),
 ('位居', 0.7996737360954285),
 ('不愧', 0.7967389225959778)]

In [62]:
# Analogy query: '吃' and "酒" positive, "飯" negative.
model.wv.most_similar(positive=['吃', "酒"], negative=["飯"])


Out[62]:
[('喝', 0.6327210664749146),
 ('七碗', 0.5234313011169434),
 ('一大口', 0.5065366625785828),
 ('半碗', 0.49932488799095154),
 ('唱著', 0.4989270865917206),
 ('程二女', 0.4868518114089966),
 ('妙哉', 0.4837992191314697),
 ('蘑', 0.4831634759902954),
 ('煨', 0.48288285732269287),
 ('蠶豆', 0.4808100461959839)]

In [71]:
# Collect the embedding vectors of the most frequent tokens for plotting.
samples = 500
labels = [w[0] for w in counter.most_common(samples)]
# Look vectors up through model.wv, as the query cells do, instead of the
# deprecated model[...] shortcut on the Word2Vec object itself.
vec = np.array([model.wv[w] for w in labels])

In [73]:
# Reduce the word vectors to two dimensions with t-SNE.
from sklearn.manifold import TSNE
tsne_options = dict(
    n_components=2,
    perplexity=30,
    init='pca',
    n_iter=5000,
)
tsne = TSNE(**tsne_options)
low_dim_embs = tsne.fit_transform(vec)

In [74]:
# 從 00Download.ipynb 來的
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import os.path
from urllib.request import urlopen
def download_req(req, filename):
    """Stream an open HTTP response into ``filename`` while printing progress.

    Args:
        req: Response-like object providing ``getheader(name)`` and
            ``read(size)``, e.g. the object returned by ``urlopen``.
        filename: Path of the file to create or overwrite with the body.

    Progress is shown as a percentage of the ``Content-Length`` header when
    the server sends one, and as a running byte count otherwise.
    """
    content_length = req.getheader("Content-Length")
    # The header may be absent; then there is no total to divide by.
    total = int(content_length) if content_length else 0
    done = 0
    with open(filename,'wb') as f:
        for data in iter(lambda: req.read(8192), b""):
            f.write(data)
            # Count the bytes actually received: a read may return fewer than
            # 8192 bytes, so counting chunks would overshoot 100%.
            done += len(data)
            if total:
                print("\rdownloading: %5.1f%%"%(done*100.0/total), end="")
            else:
                print("\rdownloading: %d bytes"%done, end="")

# Font download
font_filename = 'NotoSansCJKtc-hinted.zip'
font_url = "https://noto-website-2.storage.googleapis.com/pkgs/"+font_filename
# The archive counts as already downloaded only if it exists with exactly the
# expected size; edit this condition to force a real download.
have_font_zip = os.path.isfile(font_filename) and os.path.getsize(font_filename)==121247052
if not have_font_zip:
    with urlopen(font_url) as req:
        download_req(req, "NotoSansCJKtc-hinted.zip")
# Unpack every .otf member of the archive into the working directory.
import zipfile
with zipfile.ZipFile(font_filename) as zf:
    for member in (entry for entry in zf.namelist() if entry.endswith('.otf')):
        print("extract", member)
        zf.extract(member)

# Describe the extracted Regular-weight font file with a FontProperties object.
fp = matplotlib.font_manager.FontProperties(fname = 'NotoSansCJKtc-Regular.otf')
# Register the file with matplotlib's global font manager by hand: append its
# path to `ttffiles` and a FontEntry built from the same properties to `ttflist`.
# NOTE(review): this touches font-manager internals; verify that `ttffiles` and
# `ttflist` exist in the installed matplotlib version.
matplotlib.font_manager.fontManager.ttffiles.append(fp.get_file())
font_entry = matplotlib.font_manager.FontEntry(fp.get_file(), name=fp.get_name(),
                                               style=fp.get_style(), variant=fp.get_variant(),
                                              weight=fp.get_weight(), stretch=fp.get_stretch(), size=fp.get_size())

matplotlib.font_manager.fontManager.ttflist.append(font_entry)
# Make the registered font the default family for subsequent figures
# (presumably so the Chinese labels in the plot can be rendered).
plt.rcParams['font.family'] = fp.get_name()


extract NotoSansCJKtc-Black.otf
extract NotoSansCJKtc-Bold.otf
extract NotoSansCJKtc-DemiLight.otf
extract NotoSansCJKtc-Light.otf
extract NotoSansCJKtc-Medium.otf
extract NotoSansCJKtc-Regular.otf
extract NotoSansCJKtc-Thin.otf
extract NotoSansMonoCJKtc-Bold.otf
extract NotoSansMonoCJKtc-Regular.otf

In [75]:
# Draw the 2-D projection and label every point with its token.

plt.figure(figsize=(20,20))
xs, ys = low_dim_embs[:, 0], low_dim_embs[:, 1]
plt.scatter(xs, ys)
# Shared placement/appearance options for every text label.
annotation_style = dict(xytext=(5, 2),
                        textcoords='offset points',
                        fontsize=14,
                        ha='right',
                        va='bottom')
for label, (x, y) in zip(labels, low_dim_embs):
    plt.annotate(label, xy=(x, y), **annotation_style)



In [ ]: