This is a test file to prove the idea.

I first tried to build a JSON-formatted corpus, but that turned out to be hard work; then I found that word2vec lets me avoid it.


In [1]:
# -*- coding: utf-8 -*-

import os
import re
import time
import codecs
import argparse

TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
BASE_FOLDER = "C:/Users/sethf/source/repos/chinesepoem/" # os.path.abspath(os.path.dirname(__file__))
DATA_FOLDER = os.path.join(BASE_FOLDER, 'data')
DEFAULT_FIN = os.path.join(DATA_FOLDER, '唐诗语料库.txt')
DEFAULT_FOUT = os.path.join(DATA_FOLDER, 'poem.txt')
reg_noisy = re.compile('[^\u3000-\uffee]')
reg_note = re.compile('（.*?）')  # strip full-width parenthesized notes; cannot handle （ and ） on separate lines
# Chinese characters and full-width punctuation fall in \u3000-\u301e\ufe10-\ufe19\ufe30-\ufe44\ufe50-\ufe6b\uff01-\uffee
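
# Quick sanity check of the two regexes (illustrative strings only, run by hand):
#   reg_noisy.sub('', 'abc唐诗123')           ->  '唐诗'          (ASCII stripped, CJK kept)
#   reg_note.sub('', '饮马长城窟行（乐府）')   ->  '饮马长城窟行'  (full-width parenthesized note removed)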

Read the data and drop the entries we don't need.


In [11]:
if __name__ == '__main__':
  #  parser = set_arguments()
  #  cmd_args = parser.parse_args()

    print('{} START'.format(time.strftime(TIME_FORMAT)))

    fd = codecs.open(DEFAULT_FIN, 'r', 'utf-8')
    fw = codecs.open(DEFAULT_FOUT, 'w', 'utf-8')
    reg = re.compile('〖(.*)〗')
    start_flag = False
    for line in fd:
        line = line.strip()
        if not line or '《全唐诗》' in line or '<http' in line or '□' in line:
            continue
        elif '〖' in line and '〗' in line:
            if start_flag:
                fw.write('\n')
            start_flag = True
            g = reg.search(line)
            if g:
                fw.write(g.group(1))
                fw.write('\n')
            else:
                # noisy data
                print(line)
        else:
            line = reg_noisy.sub('', line)
            line = reg_note.sub('', line)
            line = line.replace(' .', '')
            fw.write(line)

    fd.close()
    fw.close()

    print('{} STOP'.format(time.strftime(TIME_FORMAT)))


2017-10-15 14:34:54 START
2017-10-15 14:34:56 STOP

Word segmentation experiment

DEFAULT_FOUT = os.path.join(DATA_FOLDER, 'poem.txt')

thu1 = thulac.thulac(seg_only=True)  # segmentation only, no POS tagging

text = thu1.cut("我爱北京天安门", text=True)  # segment a single sentence

print(text)

thu1 = thulac.thulac(seg_only=True)  # segmentation only, no POS tagging
thu1.cut_f(DEFAULT_FOUT, outp)  # segment a whole file, writing the result to outp (the output path)


In [17]:
print('{} START'.format(time.strftime(TIME_FORMAT)))

import thulac 
DEFAULT_Segment = os.path.join(DATA_FOLDER, 'wordsegment.txt')

fd = codecs.open(DEFAULT_FOUT, 'r', 'utf-8')
fw = codecs.open(DEFAULT_Segment, 'w', 'utf-8')

thu1 = thulac.thulac(seg_only=True)   # segmentation only, no POS tagging


for line in fd:
    #print(line)
    fw.write(thu1.cut(line, text=True))
    fw.write('\n')
    
fd.close()
fw.close()

print('{} STOP'.format(time.strftime(TIME_FORMAT)))


2017-10-15 16:26:15 START
Model loaded succeed
2017-10-15 16:27:58 STOP

In [19]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
from gensim.models import word2vec


#DEFAULT_Segment = os.path.join(DATA_FOLDER, 'wordsegment.txt')
DEFAULT_Word2Vec = os.path.join(DATA_FOLDER, 'Word2Vec150.bin')

sentences = word2vec.Text8Corpus(DEFAULT_Segment)

model = word2vec.Word2Vec(sentences, size=150)

model.save(DEFAULT_Word2Vec)

print('{} STOP'.format(time.strftime(TIME_FORMAT)))


2017-10-15 16:30:20 START
C:\Anaconda3\lib\site-packages\gensim\utils.py:862: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
2017-10-15 16:30:31 STOP
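
If the notebook is restarted, the saved model can be reloaded instead of retrained (a minimal sketch using gensim's Word2Vec.load):

from gensim.models import word2vec

model = word2vec.Word2Vec.load(DEFAULT_Word2Vec)  # reload the 150-dim model saved above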

In [21]:
model[u'男']


Out[21]:
array([ 0.30962595,  0.16889741, -0.01463027, -0.15809815,  0.09206317,
       -0.1456935 ,  0.16657346, -0.16048834,  0.03577007, -0.13513733,
       -0.09294472, -0.11723404, -0.12365381, -0.02067957,  0.1038581 ,
        0.00641506, -0.0062934 ,  0.23415405,  0.37439978, -0.0564473 ,
       -0.23397736, -0.19426669,  0.06946895, -0.3208392 ,  0.19368722,
        0.02603251, -0.00743247, -0.22094592,  0.01184341, -0.12694272,
       -0.32603887, -0.20273098, -0.07396571,  0.01315944, -0.10838111,
       -0.0909251 ,  0.00180263, -0.03625318, -0.2046182 , -0.09922028,
        0.34920788,  0.08904874, -0.25203493, -0.09772593, -0.03779411,
       -0.17694817,  0.07821831,  0.08035509,  0.25622529, -0.08985876,
        0.03270766, -0.19293341, -0.30891556,  0.05773695, -0.03148178,
        0.33995509, -0.22352351,  0.09742409,  0.14914362, -0.07318434,
        0.03735919, -0.08370081, -0.16495866,  0.14458466, -0.04542416,
       -0.24301586,  0.08908165,  0.06313832,  0.0586113 , -0.15221816,
        0.06224625,  0.08598434, -0.0115755 , -0.09099659,  0.06226088,
       -0.07644724,  0.02220215,  0.07566795,  0.04833851,  0.00838657,
       -0.05597517, -0.06397859,  0.03784521,  0.02023427, -0.12724152,
       -0.01048566,  0.1487288 ,  0.08827937, -0.17855296,  0.31425136,
        0.06090816, -0.16096003, -0.07982934,  0.10440107, -0.04465724,
        0.06235282, -0.1461063 ,  0.22972585, -0.02483237,  0.1252525 ,
       -0.17958631,  0.04755906,  0.26136953,  0.16259584,  0.11282863,
        0.10273369, -0.1521662 , -0.11136056,  0.44112033, -0.1723136 ,
        0.08373854,  0.16581547, -0.06470159, -0.14097695,  0.07161622,
        0.22370109,  0.26647383,  0.24355215, -0.11299301,  0.14951281,
       -0.05022607,  0.196927  , -0.06548793,  0.50461113,  0.18641786,
       -0.2149298 , -0.05788758,  0.28251058,  0.14605965,  0.4527784 ,
        0.00892602,  0.08880702,  0.16401401, -0.03404955, -0.3267473 ,
        0.14250852,  0.20599096,  0.13325472, -0.12572202,  0.02558975,
       -0.06050026, -0.09717743, -0.20002677,  0.14861256,  0.22908178,
       -0.05484885,  0.08654279,  0.07304503,  0.17076297,  0.38086078], dtype=float32)
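
Besides reading off raw vectors, nearest-neighbour queries give a quick sanity check (a sketch only; its output was not recorded in this run):

model.wv.most_similar(u'男', topn=5)  # the five tokens closest to 男 in the 150-dim space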

In [3]:
DEFAULT_FIN = os.path.join(DATA_FOLDER, '唐诗语料库.txt')
DEFAULT_FOUT = os.path.join(DATA_FOLDER, 'poem.txt')
DEFAULT_Segment = os.path.join(DATA_FOLDER, 'wordsegment.txt')
def GetFirstNline(filePath, linesNumber):
    # Note: range(1, linesNumber) runs linesNumber - 1 times, so this prints
    # the first linesNumber - 1 lines of the file.
    fd = codecs.open(filePath, 'r', 'utf-8')
    for i in range(1, linesNumber):
        print(fd.readline())
    fd.close()

GetFirstNline(DEFAULT_Segment, 3)
GetFirstNline(DEFAULT_FOUT, 3)


饮马长城 窟行

塞外 悲风切 , 交河 冰 已 结 。 瀚海 百重波 , 阴山 千 里 雪 。 迥戍危 烽火 , 层峦 引高节 。 悠悠 卷 旆旌 , 饮马 出 长城 。 寒沙 连 骑迹 , 朔吹断 边声 。 胡尘清玉塞 , 羌 笛韵 金钲 。 绝漠 干戈戢 , 车徒 振 原隰 。 都 尉反龙堆 , 将 军旋 马邑 。 扬 麾氛 雾静 , 纪石 功名 立 。 荒裔 一戎衣 , 灵台 凯歌 入 。

饮马长城窟行

塞外悲风切,交河冰已结。瀚海百重波,阴山千里雪。迥戍危烽火,层峦引高节。悠悠卷旆旌,饮马出长城。寒沙连骑迹,朔吹断边声。胡尘清玉塞,羌笛韵金钲。绝漠干戈戢,车徒振原隰。都尉反龙堆,将军旋马邑。扬麾氛雾静,纪石功名立。荒裔一戎衣,灵台凯歌入。

The word segmentation is not very successful, so we switch to using individual Chinese characters as tokens instead, keeping the punctuation.


In [10]:
print('{} START'.format(time.strftime(TIME_FORMAT)))

DEFAULT_FOUT = os.path.join(DATA_FOLDER, 'poem.txt')
DEFAULT_charSegment = os.path.join(DATA_FOLDER, 'Charactersegment.txt')

fd = codecs.open(DEFAULT_FOUT, 'r', 'utf-8')
fw = codecs.open(DEFAULT_charSegment, 'w', 'utf-8')

for line in fd:
    if len(line) > 0:
        for c in line:
            if c != '\n':
                fw.write(c)
                fw.write(' ')
    fw.write('\n')

fd.close()
fw.close()

print('{} STOP'.format(time.strftime(TIME_FORMAT)))


2017-10-15 17:22:55 START
2017-10-15 17:23:02 STOP

In [11]:
GetFirstNline(DEFAULT_charSegment, 3)


饮 马 长 城 窟 行 

塞 外 悲 风 切 , 交 河 冰 已 结 。 瀚 海 百 重 波 , 阴 山 千 里 雪 。 迥 戍 危 烽 火 , 层 峦 引 高 节 。 悠 悠 卷 旆 旌 , 饮 马 出 长 城 。 寒 沙 连 骑 迹 , 朔 吹 断 边 声 。 胡 尘 清 玉 塞 , 羌 笛 韵 金 钲 。 绝 漠 干 戈 戢 , 车 徒 振 原 隰 。 都 尉 反 龙 堆 , 将 军 旋 马 邑 。 扬 麾 氛 雾 静 , 纪 石 功 名 立 。 荒 裔 一 戎 衣 , 灵 台 凯 歌 入 。 


In [10]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
from gensim.models import word2vec


DEFAULT_Char2Vec = os.path.join(DATA_FOLDER, 'Char2Vec100.bin')

fd = codecs.open(DEFAULT_charSegment, 'r', 'utf-8')

sentences = fd.readlines()

fd.close()

# Each element of sentences is a raw string, so gensim iterates it character
# by character: every character (including the separating spaces) becomes a token.
model = word2vec.Word2Vec(sentences, size=100)

model.save(DEFAULT_Char2Vec)

print('{} STOP'.format(time.strftime(TIME_FORMAT)))


2017-10-16 22:17:17 START
2017-10-16 22:17:32 STOP

In [11]:
model[u'男']


Out[11]:
array([ 0.2900829 , -0.04809159, -0.46607766, -0.60195959, -0.79692709,
        1.45317233, -0.73875636, -0.23516993,  0.52468306, -0.4141095 ,
        0.31254441,  0.06157973,  0.52587473,  0.98117661,  0.76936024,
        0.17090531,  0.54503411,  0.89224559,  0.63628626, -0.65704244,
        0.19324228, -2.19337821, -0.0736718 , -1.12545574,  0.36714867,
       -0.23592179,  0.65851527,  1.97759676,  0.0664974 ,  0.34336987,
        0.16321452, -0.45230347, -1.16129088, -1.37885571, -0.70058161,
       -2.71629333, -0.47714323, -1.35716736, -0.5040586 ,  0.84255946,
        0.29387042,  0.96084136,  0.5980038 ,  1.53590572,  0.78642726,
       -0.70572197,  2.15199852, -0.09091973,  0.70999056, -1.26367903,
       -0.23834354,  0.40385616,  0.76464611, -0.65731245,  0.3340157 ,
        0.97213268,  1.46448743,  1.32762229,  0.21536438, -0.69748122,
       -1.24047554,  0.52763128,  0.48480916, -0.98241204, -0.71260804,
       -0.54136884, -1.04192448,  1.04139686,  0.46493888,  0.94138777,
        0.21847701, -0.44784865, -1.06913686, -1.06480539, -0.28641865,
       -0.57710785, -0.42219958,  0.06467494,  0.29220659,  0.56308562,
       -0.69409251, -1.28817475,  0.24338399, -0.0228632 ,  0.33695638,
        0.73314172,  0.78557426,  0.78446829,  0.42267925, -0.7360608 ,
       -0.18527743,  0.4405438 ,  1.22639728,  1.25485229,  1.98212445,
        0.5071575 , -0.30095363, -0.10453363, -0.94564468,  0.3795009 ], dtype=float32)

In [3]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
from gensim.models import word2vec

DEFAULT_charSegment = os.path.join(DATA_FOLDER, 'Charactersegment.txt')
DEFAULT_Char2Vec50 = os.path.join(DATA_FOLDER, 'Char2Vec50.bin')

fd = codecs.open(DEFAULT_charSegment, 'r', 'utf-8')

sentences = fd.readlines()

fd.close()

model = word2vec.Word2Vec(sentences, size=50)

model.save(DEFAULT_Char2Vec50)

print('{} STOP'.format(time.strftime(TIME_FORMAT)))


2017-10-16 20:25:31 START
2017-10-16 20:25:41 STOP

In [9]:
model.wv.most_similar([u'好'])


Out[9]:
[('最', 0.7617394924163818),
 ('爱', 0.7001036405563354),
 ('共', 0.6234053373336792),
 ('赏', 0.5743197202682495),
 (' ', 0.5637354850769043),
 ('似', 0.560402512550354),
 ('近', 0.5548217296600342),
 ('谢', 0.5457607507705688),
 ('伴', 0.5440549850463867),
 ('待', 0.5435962677001953)]
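
Note the bare space token ' ' among the neighbours above: the raw lines, separating spaces included, were fed straight to Word2Vec, so the space itself entered the vocabulary. A minimal fix (a sketch reusing the paths defined above) is to split each line into tokens before training:

import codecs
from gensim.models import word2vec

with codecs.open(DEFAULT_charSegment, 'r', 'utf-8') as fd:
    sentences = [line.split() for line in fd]  # character tokens only; spaces never enter the vocabulary

model = word2vec.Word2Vec(sentences, size=50)  # same 50-dim setting as above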
