In [5]:
from collections import namedtuple
class A():
    """Tiny scratch class comparing instance attributes with a static defaults dict."""

    def __init__(self):
        # Instance attributes mirror the values returned by default_params().
        self.a = 1
        self.b = 2

    @staticmethod
    def default_params():
        """Return the default parameter mapping as a plain dict."""
        return dict(a=1, b=2)
# Smoke test: construct an instance and fetch the static defaults.
try1 = A()
# NOTE(review): `a` is rebound by several later cells (numpy array in In [16],
# DataFrame in In [89]) — reusing one-letter names across cells invites
# hidden-state bugs.
a = A.default_params()

class Params():
    """Hyper-parameter container for the sentiment RNN training script.

    Parameters
    ----------
    rnn_type : str
        One of 'lstm', 'bilstm' or 'gru'; selects the recurrent layer.

    Raises
    ------
    KeyError
        If ``rnn_type`` is not a supported layer name.  (KeyError is kept
        for backward compatibility; it now carries a descriptive message
        instead of being raised bare.)
    """
    def __init__(self, rnn_type):
        if rnn_type not in ['lstm', 'bilstm', 'gru']:
            raise KeyError(
                "unsupported rnn_type: %r (expected 'lstm', 'bilstm' or 'gru')"
                % (rnn_type,))
        self.num_classes = 8   # number of emotion classes
        self.maxlen = 50       # padded sequence length
        self.batch_size = 16
        self.epochs = 10
        self.layer = rnn_type  # which recurrent layer to build
        self.train_data_path = 'train_data/train_data.xlsx'
        self.word2vec_path = 'word2vec/word2vec_wx'
        self.model_name = 'lstm_seven_senti'
        self.embedding_train = False  # keep pretrained embeddings frozen

# Instantiate with an LSTM layer.  The bare expressions below rely on
# Jupyter's rich display; only the LAST one (params.model_name) is echoed
# as Out[5] — params.num_classes produces no visible output here.
params = Params('lstm')
params.num_classes
# 'log/' + params.model_name + "_train.log"
params.model_name


Out[5]:
'lstm_seven_senti'

In [16]:
import numpy as np

# An empty float array of shape (0, 1) — zero rows, one column.
a = np.zeros(shape=(0, 1))
# np.concatenate((a, [1,1,2]))
a


Out[16]:
array([], shape=(0, 1), dtype=float64)

In [35]:
import keras.backend as K

# Shape experiment: batch-dot a (None, 256, 80) tensor against a per-sample
# vector expanded to (None, 80, 1); contracting axis 2 of x with axis 1 of y
# should give (None, 256, 1), as Out[35] confirms.
x_batch = K.placeholder(shape=(None, 256, 80))
y_batch = K.expand_dims(K.placeholder(shape=(None, 80)), axis=-1)
xy_batch_dot = K.batch_dot(x_batch, y_batch, axes=[2, 1])
K.int_shape(xy_batch_dot)

# def get_R(X):
#     Y, alpha = X[0], X[1]
#     ans = K.batch_dot(Y, alpha, axes=[2,1])
#     return ans


Out[35]:
(None, 256, 1)

In [40]:
# K.dot of a 3-D (32, 28, 3) tensor with a 2-D (3, 4) matrix contracts the
# last axis of x with the first axis of y -> shape (32, 28, 4), per Out[40].
x = K.placeholder(shape=(32, 28, 3))
y = K.placeholder(shape=(3, 4))
xy = K.dot(x, y)
xy


Out[40]:
<tf.Tensor 'Reshape_2:0' shape=(32, 28, 4) dtype=float32>

In [89]:
# Prediction-time paths: pretrained word2vec, model architecture (json),
# and trained weights (h5).
# NOTE(review): 'wordvec/' here vs 'word2vec/' elsewhere — confirm which
# directory actually exists.
params = dict(
    word2vec_path='wordvec/word2vec_wx',
    model_path='models/lstm_seven_senti_0519.json',
    weight_path='models/lstm_seven_senti_0519_weight.h5',
)

def load_predict_data(data_path, target_column, params, dict_w):
    """Read comments from an xlsx file, tokenize, map tokens to ids, and pad.

    Parameters
    ----------
    data_path : str
        Path to the Excel file to score.
    target_column : str
        Column holding the raw comment text.
    params : dict
        Configuration; ``params['maxlen']`` (padded length) is honoured,
        defaulting to 50.  (Bug fix: the original accepted ``params`` but
        ignored it and hard-coded maxlen = 50.)
    dict_w : pandas.DataFrame
        Word table indexed by token with an integer 'id' column
        (built in a later cell — presumably frequency-ranked; verify).

    Returns
    -------
    pandas.DataFrame
        Rows with non-null comments, plus 'words' (filtered tokens) and
        'sent' (fixed-length id sequences, post-padded/truncated).
    """
    comment = pd.read_excel(data_path)
    cw = lambda x: list(jieba.cut(str(x)))  # tokenizer (jieba segmentation)
    maxlen = params.get('maxlen', 50)       # honour config, default 50
    filter_word = lambda toks: [t for t in toks if t in dict_w.index]  # drop OOV tokens
    get_sent = lambda toks: list(dict_w['id'][toks])                   # token -> integer id
    comment = comment[comment[target_column].notnull()]  # keep non-empty comments only
    comment['words'] = comment[target_column].apply(cw)
    comment['words'] = comment['words'].apply(filter_word)
    comment['sent'] = comment['words'].apply(get_sent)
    comment['sent'] = list(sequence.pad_sequences(comment['sent'], maxlen=maxlen, padding='post', truncating='post'))
    return comment
a = load_predict_data('/home/jeffmxh/37_5data.xlsx', 'content', params, dict_w)

In [92]:
len(a['sent'][0])


Out[92]:
50

In [85]:
# Scratch check of the token -> id lookup used by load_predict_data.
get_sent = lambda x: list(dict_w['id'][x])
# Lookup with an OOV token ('hahaha') mixed in.
b = get_sent(['hahaha','我'])
# NOTE(review): the assignment above is dead — b is immediately overwritten.
b = a['words'][0]
b = [x for x in b if x in dict_w.index]  # drop tokens missing from the word table
get_sent(b)


Out[85]:
[140, 172, 2, 26, 12848, 7, 4, 7]

In [59]:
# NOTE(review): the TypeError below ("'str' object is not callable") means the
# builtin `type` was shadowed by a string in an earlier/deleted cell — a
# hidden-state bug; restart the kernel. The nan values in the printed sequence
# also suggest this ran before OOV filtering was added to load_predict_data.
b = a['sent'][0]
print(b)
type(b[7])


[140.0, 172.0, 2.0, 26.0, 12848.0, 7.0, nan, nan, 4.0, 7.0, nan]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-59-fb854a37497f> in <module>()
      1 b = a['sent'][0]
      2 print(b)
----> 3 type(b[7])

TypeError: 'str' object is not callable

In [18]:
def load_word2vec(model_path):
    """Best-effort load of a pretrained gensim word2vec model.

    Returns ``(model, embedding_matrix)`` on success; on ANY failure returns
    ``('', '')`` so callers can truth-test the result (see the ``if a:``
    check in this cell).

    Bug fix: the bare ``except:`` also swallowed KeyboardInterrupt and
    SystemExit; narrowed to ``except Exception`` while keeping the
    empty-string sentinel behaviour.
    """
    try:
        wordvec_model = gensim.models.word2vec.Word2Vec.load(model_path)
        # NOTE(review): .wv.syn0 is deprecated in newer gensim (use
        # .wv.vectors) — TODO confirm the installed version.
        wordvec_weight = wordvec_model.wv.syn0
    except Exception:
        wordvec_model = ''
        wordvec_weight = ''
    return wordvec_model, wordvec_weight

a,b = load_word2vec('/home/jeffmxh/word2vec_wx/word2vec_wx')
# Bare `a` mid-cell produces no output (only a cell's last expression is echoed).
a
# The '' failure sentinel is falsy, so 'hahaha' prints only when the model
# actually loaded — which the output below confirms.
if a:
    print('hahaha')


hahaha

In [19]:
# from __future__ import absolute_import  # Python 2 compatibility shims — unnecessary on Python 3
# from __future__ import print_function
import pandas as pd  # dataframes
import keras
import numpy as np  # numerics
import jieba  # Chinese word segmentation
jieba.enable_parallel(32)  # NOTE(review): fork-based, POSIX-only; 32 workers hard-coded
import h5py
import gensim
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')  # Python 2 leftover; neither needed nor possible on Python 3

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.wrappers import Bidirectional
from keras import backend as K
from keras import metrics
from multiprocessing import Pool
from collections import namedtuple

'''
参数设置
'''
# Training configuration (parameter settings).  Everything a reader might
# tune lives in this one dict.
params = dict(
    num_classes=8,    # emotion classes
    maxlen=50,        # padded sequence length
    batch_size=16,
    epochs=10,
    layer='lstm',     # one of 'lstm' / 'bilstm' / 'gru'
    train_data_path='/home/jeffmxh/ML_learn/emotion_classify/train_data/train_data.xlsx',
    word2vec_path='/home/jeffmxh/ML_learn/emotion_classify/word2vec/word2vec_wx',
    model_path='models/2gru_seven_senti_0509_train.json',
    weight_path='models/2gru_seven_senti_0509_train_weight.h5',
    embedding_train=False,  # keep pretrained embeddings frozen
)

Train_Set = namedtuple('Train_Set', 'x y xt yt xa ya')

# Emotion tag -> class index, built once at definition time (the original
# rebuilt the zip/dict on every call).
_EMO_TO_ID = {
    'none': 0, 'disgust': 1, 'like': 2, 'happiness': 3,
    'sadness': 4, 'surprise': 5, 'anger': 6, 'fear': 7,
}

def trans_emo(emo):
    """Map an emotion tag string to its integer class id (0-7).

    Raises KeyError for unknown tags, matching the original behaviour.
    """
    return _EMO_TO_ID[emo]

def load_word2vec(model_path):
    """Load a pretrained gensim word2vec model and its embedding matrix.

    NOTE(review): a second, try/except-guarded definition of this name exists
    in cell In [18]; whichever cell ran last wins.  This version raises on
    any load failure instead of returning sentinels.
    """
    wordvec_model = gensim.models.word2vec.Word2Vec.load(model_path)
    # NOTE(review): .wv.syn0 is deprecated in newer gensim (use .wv.vectors)
    # — TODO confirm the installed version.
    wordvec_weight = wordvec_model.wv.syn0
    return wordvec_model, wordvec_weight

def preprocess_data(file_path, wordvec_model, params):
    """Load labelled training data and build fixed-length id sequences.

    Parameters
    ----------
    file_path : str
        xlsx with `sentence` and `emotion_1` columns.
    wordvec_model : gensim word2vec model
        Supplies the vocabulary mapping tokens to embedding indices.
    params : dict
        Uses params['maxlen'] as the padded length.

    Returns
    -------
    pandas.DataFrame
        Columns: 'mark' (class id), 'words' (tokens), 'sent_rev'
        (reversed padded id sequence) and 'sent' (forward padded ids
        concatenated with the reversed ids, length 2 * maxlen).
    """
    raw_data = pd.read_excel(file_path)
    print('Data loaded!')
    data = pd.DataFrame({'sent' : raw_data.sentence,
                         'mark' : raw_data.emotion_1 })
    data['mark'] = data['mark'].apply(trans_emo)
    print('emotion_tag transformed!')
    cw = lambda x: list(jieba.cut(str(x)))  # tokenizer (jieba segmentation)
    data['words'] = data['sent'].apply(cw)
    vocab = dict([(k, v.index) for k, v in wordvec_model.wv.vocab.items()])
    # Bug fix: the original `not (vocab.get(w) is None) and vocab.get(w) or 0`
    # looked the word up twice via a fragile and/or chain; dict.get with a
    # default is equivalent (a word at index 0 mapped to 0 either way).
    word_to_id = lambda word: vocab.get(word, 0)
    words_to_ids = lambda words: list(map(word_to_id, words))
    data['sent'] = data['words'].apply(words_to_ids)
    reverse_seq = lambda id_seq: id_seq[::-1]
    concat_seq = lambda a, b: list(np.hstack((a, b)))
    print("Pad sequences (samples x time)")
    # Reversed branch: pre-padded (keras default), then flipped so the
    # padding ends up trailing after the reversal.
    data['sent_rev'] = list(sequence.pad_sequences(data['sent'], maxlen=params['maxlen']))
    data['sent_rev'] = data['sent_rev'].apply(reverse_seq)
    data['sent'] = list(sequence.pad_sequences(data['sent'], maxlen=params['maxlen'], padding='post', truncating='post'))
    data['sent'] = data['sent'].combine(data['sent_rev'], func=concat_seq)
    return data

def split_data(train_data, num_classes=8):
    """Split preprocessed data into train (even rows), test (odd rows), all.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Output of preprocess_data (needs 'sent' and 'mark' columns).
    num_classes : int, optional
        Number of one-hot classes.  Bug fix: the original referenced an
        undefined global `num_classes` (a latent NameError — the call site
        is commented out); it is now a parameter defaulting to 8, matching
        params['num_classes'].

    Returns
    -------
    Train_Set
        Named tuple (x, y, xt, yt, xa, ya) with one-hot y/yt.
    """
    x = np.array(list(train_data['sent']))[::2]   # training set: even-indexed rows
    y = np.array(list(train_data['mark']))[::2]
    y = keras.utils.to_categorical(y, num_classes)
    xt = np.array(list(train_data['sent']))[1::2]  # test set: odd-indexed rows
    yt = np.array(list(train_data['mark']))[1::2]
    yt = keras.utils.to_categorical(yt, num_classes)
    xa = np.array(list(train_data['sent']))  # full set (labels left as ints)
    ya = np.array(list(train_data['mark']))
    return Train_Set(x, y, xt, yt, xa, ya)

def build_model(wordvec_weight, params):
    """Build and compile the sentiment classifier.

    Architecture: pretrained word embedding (frozen unless
    params['embedding_train']) -> dropout -> one recurrent layer selected by
    params['layer'] -> dropout -> dense softmax over params['num_classes'].

    Raises
    ------
    ValueError
        If params['layer'] is not 'lstm', 'bilstm' or 'gru'.  (Bug fix: the
        original's independent `if`s silently built a model with NO recurrent
        layer for an unknown value.)
    """
    word_embedding_layer = Embedding(
        input_dim=wordvec_weight.shape[0],
        output_dim=wordvec_weight.shape[1],
        weights=[wordvec_weight],
        trainable=params['embedding_train'])
    print('Build model...')
    model = Sequential()
    model.add(word_embedding_layer)
    model.add(Dropout(0.1))
    if params['layer'] == 'lstm':
        model.add(LSTM(128, return_sequences=False))
    elif params['layer'] == 'bilstm':
        model.add(Bidirectional(LSTM(128, return_sequences=False)))
    elif params['layer'] == 'gru':
        model.add(GRU(128, return_sequences=False))
    else:
        raise ValueError("unknown layer type: %r (expected 'lstm', 'bilstm' or 'gru')"
                         % (params['layer'],))
    model.add(Dropout(0.5))
    model.add(Dense(params['num_classes']))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=[metrics.mae, metrics.categorical_accuracy])
    return model
    
# Pipeline entry: load embeddings, then preprocess the training data.
# The remaining steps (split, build, train, evaluate, save) are disabled for
# this run — re-enable them to retrain.
print('try loading pretrained word2vec model.')
wordvec_model, wordvec_weight = load_word2vec(params['word2vec_path'])
data_all = preprocess_data(params['train_data_path'], wordvec_model, params)
# train_data = split_data(data_all)
# model = build_model(wordvec_weight, params)

# model.summary()

# model.fit(train_data.x, train_data.y, batch_size=params['batch_size'], epochs=params['epochs'], validation_data=(train_data.xt, train_data.yt))

# scores = model.evaluate(train_data.xt, train_data.yt, verbose=0)
# print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
# print("%s: %.2f%%" % (model.metrics_names[2], scores[2]*100))

# json_string = model.to_json()
# with open(params['model_path'], 'wt') as f:
#     f.write(json_string)
# model.save_weights(params['weight_path'])


try loading pretrained word2vec model.
Data loaded!
emotion_tag transformed!
Pad sequences (samples x time)

In [20]:
data_all


Out[20]:
mark sent words sent_rev
0 0 [44916, 8803, 0, 20, 6245, 0, 18, 0, 0, 0, 0, ... [今儿, 老爸, 逮着, 我, 一顿, 狠念, !] [18, 0, 6245, 20, 0, 8803, 44916, 0, 0, 0, 0, ...
1 1 [0, 20, 356, 8553, 100, 100, 0, 0, 0, 0, 0, 0,... [念得, 我, 各种, 烦躁, …, …] [100, 100, 8553, 356, 20, 0, 0, 0, 0, 0, 0, 0,...
2 1 [0, 167, 897, 109, 839, 1297, 1, 5379, 332, 5,... [我要, 不要, 考虑, 下, 降低, 回家, 的, 频率, 啊, , 回来, 一次, 吵... [18, 18, 18, 36167, 704, 183, 5, 183, 303, 882...
3 0 [9, 897, 451, 31, 0, 0, 239970, 0, 1233, 67, 6... [在, 考虑, 是否, 要, 回刷, 2.3, Rom, ,, 黑, /, 锁屏, 状态, ... [55, 2262, 7258, 4768, 0, 772, 63987, 67, 1233...
4 0 [0, 39643, 453, 100, 0, 0, 0, 0, 0, 0, 0, 0, 0... [发烫, 耗电, 快, …] [100, 453, 39643, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
5 1 [37, 48, 20, 3461, 3364, 18, 0, 0, 0, 0, 0, 0,... [这, 让, 我, 蛋, 疼, !] [18, 3364, 3461, 20, 48, 37, 0, 0, 0, 0, 0, 0,...
6 2 [511, 0, 233, 0, 11296, 1, 7819, 18, 0, 0, 0, ... [不过, ,, 喜欢, 4.0, 原生态, 的, 界面, !] [18, 7819, 1, 11296, 0, 233, 0, 511, 0, 0, 0, ...
7 0 [1867, 1, 335, 20, 84, 33, 7, 707, 21451, 0, 1... [去年, 的, 今天, 我, 去, 上, 了, 政治, 辅导班, ,, 回来, 看, 芒果,... [3, 9115, 7, 25531, 954, 9, 67355, 7, 9193, 37...
8 2 [108, 31, 2817, 20, 1134, 0, 1, 14370, 7, 0, 2... [又, 要, 重复, 我, 每年, 必说, 的, 台词, 了, ,, 2010, 就要, 过... [3, 149, 9113, 64, 20, 0, 7, 710, 918, 2882, 0...
9 2 [1375, 0, 149, 4827, 64, 326, 3, 0, 0, 0, 0, 0... [至少, ,, 它, 看上去, 很, 美, 。] [3, 326, 64, 4827, 149, 0, 1375, 0, 0, 0, 0, 0...
10 3 [676, 15, 0, 2634, 0, 130293, 18, 0, 0, 0, 0, ... [欢迎, 你, ,, 2011, ,, 新年快乐, !] [18, 130293, 0, 2634, 0, 15, 676, 0, 0, 0, 0, ...
11 0 [939, 41, 0, 0, 0, 13, 0, 759, 39946, 3, 0, 0,... [下午, 到, 台里, ,, 吴凤花, 和, 方亚芬, 正在, 换装, 。] [3, 39946, 759, 0, 13, 0, 0, 0, 41, 939, 0, 0,...
12 0 [20, 0, 57125, 0, 23, 82, 62408, 229, 49, 9597... [我, 刚过去, 打个招呼, ,, 就, 被, 德才, 老师, 说, 耽误, 他们, 录像, 。] [3, 13502, 111, 9597, 49, 229, 62408, 82, 23, ...
13 4 [20, 742, 32754, 179, 3, 0, 0, 0, 0, 0, 0, 0, ... [我, 那个, 冤, 呢, 。] [3, 179, 32754, 742, 20, 0, 0, 0, 0, 0, 0, 0, ...
14 0 [20, 49, 20, 193, 2639, 3, 0, 0, 0, 0, 0, 0, 0... [我, 说, 我, 没, 采访, 。] [3, 2639, 193, 20, 49, 20, 0, 0, 0, 0, 0, 0, 0...
15 0 [39, 49, 2553, 22, 2489, 0, 9597, 514, 2462, 3... [他, 说, 聊天, 也, 不行, ,, 耽误, 穿, 服装, 。] [3, 2462, 514, 9597, 0, 2489, 22, 2553, 49, 39...
16 0 [1129, 3830, 49, 1723, 58, 568, 23, 70, 1273, ... [方, 姐姐, 说, 腿, 好, 的话, 就, 来, 段, 对, 花枪, 了, 。] [3, 7, 220999, 34, 1273, 70, 23, 568, 58, 1723...
17 4 [58, 5705, 5, 426, 207, 24, 2614, 15, 5, 426, ... [好, 难过, , 为什么, 每天, 都, 回忆, 你, , 为什么, 每天, 都, 想... [158, 9, 65, 306, 5, 7, 890, 825, 5, 0, 58, 24...
18 4 [426, 20, 1, 2129, 231, 733, 179, 0, 89045, 33... [为什么, 我, 的, 房间, 那么, 干, 呢, ,, 嗓子疼, 啊, ~, !] [18, 135, 332, 89045, 0, 179, 733, 231, 2129, ...
19 4 [0, 311, 62, 1323, 163, 7, 20, 2317, 15786, 1,... [今早, 起来, 更, 体现, 出, 了, 我, 昨天, 排练, 的, 成果, ,, 腰酸背... [18, 18, 18, 21432, 0, 1991, 1, 15786, 2317, 2...
20 0 [511, 5110, 229, 0, 2803, 49, 1, 15230, 20, 22... [不过, 彭, 老师, 今早, 出门, 说, 的, 那句话, 我, 也, 赞同, ......] [0, 11228, 22, 20, 15230, 1, 49, 2803, 0, 229,...
21 2 [3959, 1073, 70087, 38863, 5, 66911, 0, 313430... [这家, 位于, 槟城, George, , Town, ,, Tunes, , Hot... [1152, 15561, 1149, 3, 1196, 64, 24, 9612, 474...
22 4 [335, 6, 89, 1282, 1460, 13, 7077, 1, 1657, 84... [今天, 是, 个, 值得, 开心, 和, 崩溃, 的, 日子, ~] [840, 1657, 1, 7077, 13, 1460, 1282, 89, 6, 33...
23 4 [6555, 1319, 38436, 0, 0, 20, 5187, 24, 163, 7... [大腿, 再次, 拉伤, ,, 痛到, 我, 眼泪, 都, 出, 了, …] [100, 7, 163, 24, 5187, 20, 0, 0, 38436, 1319,...
24 4 [1620, 3691, 1385, 20051, 0, 7077, 100, 0, 0, ... [回到, 宿舍, 连续, 拉肚子, ,, 崩溃, …] [100, 7077, 0, 20051, 1385, 3691, 1620, 0, 0, ...
25 3 [2097, 455, 0, 0, 840, 0, 0, 0, 0, 0, 0, 0, 0,... [接下来, 继续, 红红的, 红歌赛, ~] [840, 0, 0, 455, 2097, 0, 0, 0, 0, 0, 0, 0, 0,...
26 5 [53093, 0, 6, 20, 19, 1848, 6483, 10509, 7, 20... [OMG, ,, 是, 我, 不, 关心, 八卦, 太久, 了, 吗, ?, !] [18, 36, 204, 7, 10509, 6483, 1848, 19, 20, 6,...
27 0 [3372, 65, 9, 279, 22749, 1827, 129270, 16123,... [白天, 还, 在, 跟, 小黑, 讨论, Jensen, 过生日, ,, 跟, 大, J,... [18, 18, 7, 311, 4384, 108, 1045, 228647, 117,...
28 5 [471, 2188, 0, 70, 279, 20, 49, 6483, 8, 15357... [结果, 刚刚, KANA, 来, 跟, 我, 说, 八卦, :, J2, 周末, 一起, ... [18, 18, 18, 7, 2014, 88, 21, 82, 3015, 0, 25,...
29 5 [388, 53, 509, 20, 37, 854, 474, 23665, 23665,... [谁, 能, 告诉, 我, 这, 是不是, 真的, 啊啊啊, 啊啊啊, ,, 我要, 疯狂,... [135, 135, 135, 135, 135, 135, 135, 135, 135, ...
... ... ... ... ...
16676 0 [60, 770, 4062, 3, 4062, 269, 4566, 1, 1216, 0... [3, 学会, 宽容, 。, 宽容, 像, 春天, 的, 阳光, ,, 照耀, 别人, 也,... [3, 42, 1229, 22, 308, 16957, 0, 1216, 1, 4566...
16677 0 [99, 770, 21691, 3, 0, 0, 142, 0, 3, 0, 0, 0, ... [4, 学会, 隐忍, 。, 隐而, 不发, 不是, 不发, 。] [3, 0, 142, 0, 0, 3, 21691, 770, 99, 0, 0, 0, ...
16678 0 [85, 770, 419, 3, 681, 13, 606, 24, 6, 419, 1,... [5, 学会, 简单, 。, 美丽, 和, 幸福, 都, 是, 简单, 的, ,, 把握, ... [3, 419, 19, 76, 419, 7, 1902, 0, 1, 419, 6, 2...
16679 0 [141, 770, 14911, 1246, 3, 0, 1559, 0, 209, 23... [6, 学会, 换位, 思考, 。, 换个, 角度, ,, 世界, 就, 会, 大不相同, ..] [0, 38220, 27, 23, 209, 0, 1559, 0, 3, 1246, 1...
16680 0 [0, 5, 611, 0, 1141, 47977, 0, 0, 0, 0, 0, 0, ... [thx, , @, 王力, 群, Alex] [47977, 1141, 0, 611, 5, 0, 0, 0, 0, 0, 0, 0, ...
16681 0 [8756, 734, 17, 18663, 16830, 7212, 5, 4278, 9... [东南, 网, -, 海峡, 都市报, 讯, , 昨日, 下午, 2, 时许, ,, 送,... [3, 3732, 25259, 0, 37292, 11635, 3277, 82, 0,...
16682 0 [2344, 325, 9, 1111, 55737, 1, 6443, 33, 0, 21... [事故, 发生, 在, 往, 二环路, 的, 车道, 上, ,, 肇事, 小车, 停, 在,... [3, 650, 13293, 82, 235, 19410, 0, 33, 23097, ...
16683 0 [29788, 1083, 1749, 49, 0, 728, 330, 1749, 23,... [目击者, 林, 女士, 说, ,, 当时, 万, 女士, 就, 站, 在, 斑马线, 上,... [3, 7, 79980, 23, 0, 0, 0, 0, 0, 6340, 110, 50...
16684 0 [32255, 35660, 21085, 17, 6300, 154820, 3733, ... [太古, 三里屯, 北区, -, 围, 合式, 分布, 的, 旗舰, 街区, ,, 商铺, ... [3, 272, 92, 1204, 6870, 2488, 6447, 6560, 125...
16685 0 [412, 8, 0, 25011, 4, 44758, 4, 5316, 11868, 4... [品牌, :, 阿一, 鲍鱼, 、, 阿玛尼, 、, 巴黎, 世家, 、, 浪凡, 、, 范... [61333, 50, 1, 20, 4, 2549, 4, 181783, 4, 4593...
16686 0 [20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... [我, ................] [0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
16687 0 [10773, 158484, 0, 10773, 138953, 0, 10773, 20... [轻度, 失忆症, ,, 轻度, 妄想症, ,, 轻度, 自闭症, ,, 轻度, 抑郁症, 。] [3, 9723, 10773, 0, 20241, 10773, 0, 138953, 1...
16688 0 [2211, 1838, 4928, 163498, 0, 20, 24, 23765, 2... [一个月, 配, 三次, 锁匙, ,, 我, 都, 唔, 知, 我, 搞, 乜, !] [18, 90617, 1936, 20, 2042, 23765, 24, 20, 0, ...
16689 0 [0, 1227, 174, 172201, 4681, 2638, 0, 20, 42, ... [我仲, 以为, 前, 几晚, 醉, 左, 翻黎系, 我, 自己, 开门, ,, 原来, 系... [18, 24871, 8164, 21, 480, 3239, 1260, 1115, 0...
16690 0 [13100, 0, 118411, 614, 2638, 37456, 23, 4681,... [唉, ,, 千杯不醉, 变, 左, 三杯, 就, 醉, !] [18, 4681, 23, 37456, 2638, 614, 118411, 0, 13...
16691 0 [726, 42, 4681, 2638, 24, 23765, 2042, 0, 0, 5... [连, 自己, 醉, 左, 都, 唔, 知, ,, 翻时, 好, 清醒, 甘, ,, 训醒,... [1152, 43810, 2638, 1149, 18, 84920, 532, 23, ...
16692 0 [1844, 0, 10642, 638, 200, 593, 0, 26, 3493, 2... [近日, ,, 甘肃省, 政府, 向, 14, 个市, (, 州, ), 、, 省政府, 有... [3, 5631, 317, 157, 3723, 1194, 0, 9835, 1, 10...
16693 2 [0, 20, 40757, 15, 474, 1474, 502, 249, 9902, ... [金厉旭, 我, 只在乎, 你, 真的, 好好, 听, 哦, 崇拜, !] [18, 9902, 249, 502, 1474, 474, 15, 40757, 20,...
16694 2 [939, 2351, 24, 8546, 29879, 0, 199, 1684, 0, ... [下午, 场, 都, 录, 嘞, ,, 现在, 休息, 休惨, 了, 哈哈哈, 哈哈, ,,... [100, 147330, 6, 1, 11737, 78, 20, 48, 335, 87...
16695 0 [9649, 0, 124, 670, 0, 9482, 145, 110, 7825, 5... [玩游戏, ,, 看, 电影, ,, 小姑娘, 吃, 过, 西瓜, 摸, 着, 肚子, 睡着... [100, 100, 7, 8399, 3311, 75, 5085, 7825, 110,...
16696 2 [0, 23, 2436, 0, 0, 6, 929, 1, 3186, 135, 20, ... [一看, 就, 舒服, ,, 这才, 是, 完美, 的, 夏天, ~, 我, 想, 放暑假, !] [18, 95867, 117, 20, 135, 3186, 1, 929, 6, 0, ...
16697 0 [230, 388, 117, 95867, 36, 0, 0, 0, 0, 0, 0, 0... [还有, 谁, 想, 放暑假, ?] [36, 95867, 117, 388, 230, 0, 0, 0, 0, 0, 0, 0...
16698 6 [1723, 33, 1, 5589, 58, 761, 5, 3761, 5, 3761,... [腿, 上, 的, 伤口, 好, 明显, , T, , T, , 这, 又, 不是, ... [36, 204, 142, 7, 777, 0, 2242, 4991, 332, 195...
16699 5 [1955, 7, 35320, 65, 95589, 131, 146, 3919, 6,... [戴, 了, 护膝, 还, 摔成, 这样, 那, 冲击, 是, 有, 多, 大, [, 泪, ]] [1152, 5726, 1149, 68, 46, 16, 6, 3919, 146, 1...
16700 0 [72570, 49, 8, 16, 390, 6, 5411, 392, 1, 0, 16... [麦兜, 说, :, 有, 事情, 是, 要说, 出来, 的, ,, 不要, 等, 着, 对... [3, 1327, 6, 748, 0, 5559, 13, 6063, 6, 768, 3...
16701 0 [35, 74, 5908, 2589, 1, 4600, 13, 20, 49, 72, ... [一个, 做, 二手车, 生意, 的, 哥哥, 和, 我, 说, 时, 我, 还, 不信, ... [3, 7, 474, 6, 2410, 0, 0, 0, 65, 20, 72, 49, ...
16702 6 [1173, 16, 7674, 3093, 23, 27, 16, 2858, 0, 31... [每次, 有, 新政, 出台, 就, 会, 有, 一批, 人富, 起来, !] [18, 311, 0, 2858, 16, 27, 23, 3093, 7674, 16,...
16703 6 [1149, 21945, 1152, 1149, 5475, 1152, 0, 0, 0,... [[, 惊恐, ], [, 愤怒, ]] [1152, 5475, 1149, 1152, 21945, 1149, 0, 0, 0,...
16704 4 [327, 4105, 0, 5201, 8600, 3, 0, 0, 0, 0, 0, 0... [人生, 不止, ,, 寂寞, 不已, 。] [3, 8600, 5201, 0, 4105, 327, 0, 0, 0, 0, 0, 0...
16705 4 [5201, 327, 158, 46989, 0, 5201, 6, 158, 787, ... [寂寞, 人生, 爱, 无休, ,, 寂寞, 是, 爱, 永远, 的, 主题, 、, 我, ... [3, 15, 117, 9, 24, 0, 8089, 1, 20, 13, 20, 11...

16706 rows × 4 columns


In [28]:
from multiprocessing import Pool
# NOTE(review): the loop that built `w` is commented out, so `w` below comes
# from stale kernel state — this cell fails under Restart & Run All.
# w = []  # collect every token into one flat list
# for i in data_all['words']:
#     w.extend(i)
dict_w = pd.DataFrame(pd.Series(w).value_counts())  # word frequency table
dict_w['id']=list(range(1,len(dict_w)+1))  # ids 1..N in descending-frequency order
# Bare `dict_w` mid-cell produces no output (only the last expression is echoed).
dict_w
pool = Pool(16)  # NOTE(review): pool is never closed/joined — leaks worker processes
get_sent = lambda x: list(dict_w['id'][x])
# NOTE(review): `data` is undefined in this cell — probably meant data_all.
data['sent'] = data['words'].apply(get_sent)  # too slow; superseded by pool.map below
# NOTE(review): redefines the lambda above — same name, different object.
def get_sent(x):
    return list(dict_w['id'][x])
data_all['sent'] = pool.map(get_sent, data_all['words'])

In [38]:
dict_w


Out[38]:
0 id
20362 1
13430 2
9686 3
5196 4
5082 5
4327 6
3370 7
3209 8
2567 9
2393 10
1955 11
[ 1938 12
] 1937 13
1558 14
1503 15
1478 16
1418 17
1397 18
1384 19
1329 20
~ 1289 21
1197 22
1190 23
1172 24
1142 25
一个 1073 26
1011 27
965 28
, 941 29
898 30
... ... ...
問題 1 37354
菠萝包 1 37355
开门见山 1 37356
柿子 1 37357
要个 1 37358
静思 1 37359
孔文燕 1 37360
川柯南 1 37361
人敢 1 37362
收听 1 37363
吐出来 1 37364
装箱 1 37365
再陷 1 37366
开卷 1 37367
装机容量 1 37368
攀缘 1 37369
LGBT 1 37370
受得住 1 37371
日次 1 37372
戴维 1 37373
哈喇子 1 37374
中轴线 1 37375
挑灯 1 37376
阎王爷 1 37377
天一 1 37378
本色 1 37379
人超 1 37380
不提 1 37381
逃命 1 37382
相接 1 37383

37383 rows × 2 columns


In [31]:
import errno
import os
print(FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), 'foobar'))


[Errno 2] No such file or directory: 'foobar'

In [14]:
train_data.y


Out[14]:
array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])