In [250]:
from __future__ import absolute_import #导入3.x的特征函数
from __future__ import print_function
from imp import reload
import pandas as pd #导入Pandas
import numpy as np #导入Numpy
import jieba #导入结巴分词
import gensim
import h5py
import sys
import re
reload(sys)
#sys.setdefaultencoding('utf-8')
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
#from keras.utils import plot_model
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, RepeatVector
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.wrappers import Bidirectional, TimeDistributed #包装器,把一个层应用到输入的每一个时间步上
from keras.callbacks import TensorBoard #TensorBoard是TensorFlow提供的可视化工具,
#该回调函数将日志信息写入TensorBorad,
#使得你可以动态的观察训练和测试指标的图像以及不同层的激活值直方图。
from keras.metrics import binary_accuracy #对二分类问题,计算在所有预测值上的平均正确率
from keras import backend as K #keras后端
from pickle import dump, load
In [416]:
# Tokenise with 32 parallel jieba worker processes.
jieba.enable_parallel(32)

def cw(x):
    """Segment ``x`` (coerced to str) into a list of words using jieba."""
    return list(jieba.cut(str(x)))
In [417]:
# Strip every non-word character (punctuation, whitespace, symbols).
# ``\W`` keeps Unicode word characters, so CJK text is preserved.
# Raw string fixes the invalid escape sequence in "\W" (DeprecationWarning,
# a SyntaxWarning/error in newer Python versions).
re_sub = lambda x: re.sub(r"\W", "", x)
# Vectorised form so it can be applied to a whole pandas/numpy column at once.
re_sub_vec = np.vectorize(re_sub)
In [418]:
def _read_documents(path_template, count):
    """Read ``count`` numbered text files into a cleaned, tokenised DataFrame.

    Args:
        path_template: path with one ``%d`` placeholder for the 1-based index.
        count: number of files to read.

    Returns:
        DataFrame with columns ``content`` (cleaned text, non-word characters
        stripped) and ``words`` (jieba token list).
    """
    docs = []
    for i in range(1, count + 1):
        # NOTE(review): callers pass hardcoded absolute paths; consider making
        # the data directory configurable.
        with open(path_template % i) as f:
            docs.append(''.join(f.readlines()))
    frame = pd.DataFrame({'content': docs})
    frame['content'] = re_sub_vec(frame['content'])
    frame['words'] = frame['content'].apply(cw)  # tokenise cleaned text
    return frame

# Full news articles and their reference abstracts, 140 documents each.
text = _read_documents('/home/yuangxue/abstract/news/training%d.txt', 140)
abstract = _read_documents('/home/yuangxue/abstract/abstract/training%db.txt', 140)

# Maximum sequence lengths used for padding/truncation below.
text_len = 1200
abstract_len = 20
In [419]:
# wordvec_model = gensim.models.word2vec.Word2Vec.load('/home/yuangxue/word2vec_from_weixin/word2vec/word2vec_wx')
# Pretrained Chinese word2vec model (wiki corpus).
# NOTE(review): hardcoded absolute path; make configurable.
wordvec_model = gensim.models.word2vec.Word2Vec.load('/home/jeffmxh/word2vec/wiki.zh.text.model')
# Raw embedding matrix, shape (vocab_size, embedding_dim).
wordvec_weight = wordvec_model.wv.syn0
# word -> row index in the embedding matrix, and the inverse mapping.
vocab = {k: v.index for k, v in wordvec_model.wv.vocab.items()}
vocab_inv = {idx: word for word, idx in vocab.items()}
# Embedding layer frozen to the pretrained vectors (not trained further).
word_embedding_layer = Embedding(
    input_dim=wordvec_weight.shape[0],
    output_dim=wordvec_weight.shape[1],
    weights=[wordvec_weight],
    trainable=False)
# Map a word to its embedding row, falling back to 0 for OOV words.
# Replaces the fragile ``a and b or c`` idiom; equivalent, because a present
# word whose index is 0 also yielded 0 in the original.
# NOTE(review): OOV words are conflated with the real word at index 0.
word_to_id = lambda word: vocab.get(word, 0)
words_to_ids = lambda words: list(map(word_to_id, words))
# text['sent'] = list(sequence.pad_sequences(text['words'], maxlen=text_len, padding='post', truncating='post'))
In [420]:
# Sanity check: dimensionality of a single pretrained word vector.
len(wordvec_model['开心'])
Out[420]:
In [422]:
def _to_padded_ids(words_col, maxlen):
    # Map each token list to embedding-row ids, then pad/truncate at the tail.
    id_seqs = words_col.apply(words_to_ids)
    return list(sequence.pad_sequences(id_seqs, maxlen=maxlen, padding='post', truncating='post'))

text['sent'] = _to_padded_ids(text['words'], text_len)
abstract['sent'] = _to_padded_ids(abstract['words'], abstract_len)
In [423]:
def vectorize(sent_list, length, dim=256):
    """Convert a sequence of embedding-row ids into a (length, dim) matrix.

    Args:
        sent_list: iterable of vocabulary indices (already padded/truncated).
        length: number of rows in the output matrix.
        dim: word-vector dimensionality. Callers in this notebook pass 300;
            the default of 256 is kept for backward compatibility.

    Returns:
        float ndarray of shape (length, dim); rows beyond the input stay zero.
    """
    vec = np.zeros((length, dim), dtype=float)
    # Slice guards against inputs longer than ``length`` (the original would
    # raise IndexError on them).
    for i, ch in enumerate(sent_list[:length]):
        # NOTE(review): id 0 is both the padding value and a real vocabulary
        # entry, so padded positions receive that word's vector rather than
        # zeros — confirm this is intended.
        vec[i] = wordvec_model[vocab_inv[ch]]
    return vec
In [424]:
# Dense 3-D tensors: one (seq_len, 300) word-vector matrix per document.
xa = np.stack([vectorize(ids, 1200, 300) for ids in text.sent])
ya = np.stack([vectorize(ids, 20, 300) for ids in abstract.sent])
In [425]:
# Even-indexed documents form the training split, odd-indexed the test split.
x, y = xa[0::2], ya[0::2]    # training set
xt, yt = xa[1::2], ya[1::2]  # test set
In [429]:
# Sequence-to-sequence summariser over word vectors: encode the article with
# an LSTM, repeat the encoding once per output timestep, and decode each step
# back into a 300-d word vector.
# Requires ``RepeatVector`` and ``TimeDistributed``, which were missing from
# the original import cell (NameError on a fresh kernel).
model = Sequential()
# Encoder: variable-length sequences of 300-d word vectors -> 128-d state.
model.add(LSTM(128, input_shape=(None, 300), return_sequences=False))
model.add(Dense(128, activation="relu"))
# Repeat the encoded vector for each of the 20 abstract timesteps.
model.add(RepeatVector(20))
# Decoder LSTM emits one 128-d vector per timestep.
model.add(LSTM(128, return_sequences=True))
# Project every timestep back into the 300-d word-vector space.
model.add(TimeDistributed(Dense(300, activation="linear")))
# MSE in embedding space with the adam optimiser.
model.compile(loss="mse", optimizer='adam')
In [430]:
# ``model.summary()`` prints the table itself and returns None; wrapping it
# in print() added a stray "None" line to the output.
model.summary()
In [431]:
# Train on the training split only. The original fitted on the full set
# ``xa``/``ya``, which contains every validation sample — data leakage into
# ``validation_data=(xt, yt)`` made the validation loss meaningless.
model.fit(x, y, validation_data=(xt, yt), epochs=10, batch_size=16)
Out[431]:
In [432]:
# Run the model on the first training document (batch of one).
xtest = x[:1]
b = model.predict(xtest)[0]  # predicted (20, 300) word-vector matrix
In [441]:
# Inspect the cleaned raw content of the first article.
text.content[0]
Out[441]:
In [439]:
# For each of the first 20 input word vectors, show the 8th-nearest
# vocabulary entry (index 7 of the similarity list).
for step in range(20):
    nearest = wordvec_model.similar_by_vector(x[0][step])
    print(nearest[7])
In [356]:
# Sanity check: nearest neighbours of the vector for '!'.
wordvec_model.similar_by_vector(wordvec_model['!'])
Out[356]:
In [216]:
# Id-sequence variants of the splits (token ids instead of word vectors).
text_ids = np.array(list(text['sent']))
abstract_ids = np.array(list(abstract['sent']))
xa1, ya1 = text_ids, abstract_ids                              # full sets
x1, y1 = text_ids[::2].copy(), abstract_ids[::2].copy()        # training split
xt1, yt1 = text_ids[1::2].copy(), abstract_ids[1::2].copy()    # test split
In [217]:
# Alternative model that consumes token ids directly: the frozen pretrained
# embedding layer feeds the encoder instead of precomputed vector tensors.
# Requires ``RepeatVector`` and ``TimeDistributed``, which were missing from
# the original import cell (NameError on a fresh kernel).
model = Sequential()
model.add(word_embedding_layer)  # frozen pretrained embeddings
model.add(LSTM(128, return_sequences=False))
model.add(Dense(128, activation="relu"))
# NOTE(review): this decoder repeats 100 timesteps while the abstracts are
# padded to 20, and the output is 256-d while the loaded word vectors appear
# to be 300-d — confirm these sizes before training this model.
model.add(RepeatVector(100))
model.add(LSTM(128, return_sequences=True))
model.add(TimeDistributed(Dense(256, activation="linear")))
model.compile(loss="mse", optimizer='adam')