In [5]:
from collections import namedtuple
class A():
    """Tiny scratch class comparing instance attributes with a static defaults dict."""

    def __init__(self):
        # Instance attributes mirror the values returned by default_params().
        self.a = 1
        self.b = 2

    @staticmethod
    def default_params():
        """Return the default parameter mapping as a plain dict."""
        return dict(a=1, b=2)
# Smoke test: construct an instance and fetch the static defaults.
try1 = A()
# NOTE(review): `a` is rebound by several later cells (numpy array in In [16],
# DataFrame in In [89]) — reusing one-letter names across cells invites
# hidden-state bugs.
a = A.default_params()

class Params():
    """Hyper-parameter container for the sentiment RNN training script.

    Parameters
    ----------
    rnn_type : str
        One of 'lstm', 'bilstm' or 'gru'; selects the recurrent layer.

    Raises
    ------
    KeyError
        If ``rnn_type`` is not a supported layer name.  (KeyError is kept
        for backward compatibility; it now carries a descriptive message
        instead of being raised bare.)
    """
    def __init__(self, rnn_type):
        if rnn_type not in ['lstm', 'bilstm', 'gru']:
            raise KeyError(
                "unsupported rnn_type: %r (expected 'lstm', 'bilstm' or 'gru')"
                % (rnn_type,))
        self.num_classes = 8   # number of emotion classes
        self.maxlen = 50       # padded sequence length
        self.batch_size = 16
        self.epochs = 10
        self.layer = rnn_type  # which recurrent layer to build
        self.train_data_path = 'train_data/train_data.xlsx'
        self.word2vec_path = 'word2vec/word2vec_wx'
        self.model_name = 'lstm_seven_senti'
        self.embedding_train = False  # keep pretrained embeddings frozen

# Instantiate with an LSTM layer.  The bare expressions below rely on
# Jupyter's rich display; only the LAST one (params.model_name) is echoed
# as Out[5] — params.num_classes produces no visible output here.
params = Params('lstm')
params.num_classes
# 'log/' + params.model_name + "_train.log"
params.model_name


Out[5]:
'lstm_seven_senti'

In [16]:
import numpy as np

# An empty float array of shape (0, 1) — zero rows, one column.
a = np.zeros(shape=(0, 1))
# np.concatenate((a, [1,1,2]))
a


Out[16]:
array([], shape=(0, 1), dtype=float64)

In [35]:
import keras.backend as K

# Shape experiment: batch-dot a (None, 256, 80) tensor against a per-sample
# vector expanded to (None, 80, 1); contracting axis 2 of x with axis 1 of y
# should give (None, 256, 1), as Out[35] confirms.
x_batch = K.placeholder(shape=(None, 256, 80))
y_batch = K.expand_dims(K.placeholder(shape=(None, 80)), axis=-1)
xy_batch_dot = K.batch_dot(x_batch, y_batch, axes=[2, 1])
K.int_shape(xy_batch_dot)

# def get_R(X):
#     Y, alpha = X[0], X[1]
#     ans = K.batch_dot(Y, alpha, axes=[2,1])
#     return ans


Out[35]:
(None, 256, 1)

In [40]:
# K.dot of a 3-D (32, 28, 3) tensor with a 2-D (3, 4) matrix contracts the
# last axis of x with the first axis of y -> shape (32, 28, 4), per Out[40].
x = K.placeholder(shape=(32, 28, 3))
y = K.placeholder(shape=(3, 4))
xy = K.dot(x, y)
xy


Out[40]:
<tf.Tensor 'Reshape_2:0' shape=(32, 28, 4) dtype=float32>

In [89]:
# Prediction-time paths: pretrained word2vec, model architecture (json),
# and trained weights (h5).
# NOTE(review): 'wordvec/' here vs 'word2vec/' elsewhere — confirm which
# directory actually exists.
params = dict(
    word2vec_path='wordvec/word2vec_wx',
    model_path='models/lstm_seven_senti_0519.json',
    weight_path='models/lstm_seven_senti_0519_weight.h5',
)

def load_predict_data(data_path, target_column, params, dict_w):
    """Read comments from an xlsx file, tokenize, map tokens to ids, and pad.

    Parameters
    ----------
    data_path : str
        Path to the Excel file to score.
    target_column : str
        Column holding the raw comment text.
    params : dict
        Configuration; ``params['maxlen']`` (padded length) is honoured,
        defaulting to 50.  (Bug fix: the original accepted ``params`` but
        ignored it and hard-coded maxlen = 50.)
    dict_w : pandas.DataFrame
        Word table indexed by token with an integer 'id' column
        (built in a later cell — presumably frequency-ranked; verify).

    Returns
    -------
    pandas.DataFrame
        Rows with non-null comments, plus 'words' (filtered tokens) and
        'sent' (fixed-length id sequences, post-padded/truncated).
    """
    comment = pd.read_excel(data_path)
    cw = lambda x: list(jieba.cut(str(x)))  # tokenizer (jieba segmentation)
    maxlen = params.get('maxlen', 50)       # honour config, default 50
    filter_word = lambda toks: [t for t in toks if t in dict_w.index]  # drop OOV tokens
    get_sent = lambda toks: list(dict_w['id'][toks])                   # token -> integer id
    comment = comment[comment[target_column].notnull()]  # keep non-empty comments only
    comment['words'] = comment[target_column].apply(cw)
    comment['words'] = comment['words'].apply(filter_word)
    comment['sent'] = comment['words'].apply(get_sent)
    comment['sent'] = list(sequence.pad_sequences(comment['sent'], maxlen=maxlen, padding='post', truncating='post'))
    return comment
a = load_predict_data('/home/jeffmxh/37_5data.xlsx', 'content', params, dict_w)

In [92]:
len(a['sent'][0])


Out[92]:
50

In [85]:
# Scratch check of the token -> id lookup used by load_predict_data.
get_sent = lambda x: list(dict_w['id'][x])
# Lookup with an OOV token ('hahaha') mixed in.
b = get_sent(['hahaha','我'])
# NOTE(review): the assignment above is dead — b is immediately overwritten.
b = a['words'][0]
b = [x for x in b if x in dict_w.index]  # drop tokens missing from the word table
get_sent(b)


Out[85]:
[140, 172, 2, 26, 12848, 7, 4, 7]

In [59]:
# NOTE(review): the TypeError below ("'str' object is not callable") means the
# builtin `type` was shadowed by a string in an earlier/deleted cell — a
# hidden-state bug; restart the kernel. The nan values in the printed sequence
# also suggest this ran before OOV filtering was added to load_predict_data.
b = a['sent'][0]
print(b)
type(b[7])


[140.0, 172.0, 2.0, 26.0, 12848.0, 7.0, nan, nan, 4.0, 7.0, nan]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-59-fb854a37497f> in <module>()
      1 b = a['sent'][0]
      2 print(b)
----> 3 type(b[7])

TypeError: 'str' object is not callable

In [18]:
def load_word2vec(model_path):
    """Best-effort load of a pretrained gensim word2vec model.

    Returns ``(model, embedding_matrix)`` on success; on ANY failure returns
    ``('', '')`` so callers can truth-test the result (see the ``if a:``
    check in this cell).

    Bug fix: the bare ``except:`` also swallowed KeyboardInterrupt and
    SystemExit; narrowed to ``except Exception`` while keeping the
    empty-string sentinel behaviour.
    """
    try:
        wordvec_model = gensim.models.word2vec.Word2Vec.load(model_path)
        # NOTE(review): .wv.syn0 is deprecated in newer gensim (use
        # .wv.vectors) — TODO confirm the installed version.
        wordvec_weight = wordvec_model.wv.syn0
    except Exception:
        wordvec_model = ''
        wordvec_weight = ''
    return wordvec_model, wordvec_weight

a,b = load_word2vec('/home/jeffmxh/word2vec_wx/word2vec_wx')
# Bare `a` mid-cell produces no output (only a cell's last expression is echoed).
a
# The '' failure sentinel is falsy, so 'hahaha' prints only when the model
# actually loaded — which the output below confirms.
if a:
    print('hahaha')


hahaha

In [19]:
# from __future__ import absolute_import  # Python 2 compatibility shims — unnecessary on Python 3
# from __future__ import print_function
import pandas as pd  # dataframes
import keras
import numpy as np  # numerics
import jieba  # Chinese word segmentation
jieba.enable_parallel(32)  # NOTE(review): fork-based, POSIX-only; 32 workers hard-coded
import h5py
import gensim
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')  # Python 2 leftover; neither needed nor possible on Python 3

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.wrappers import Bidirectional
from keras import backend as K
from keras import metrics
from multiprocessing import Pool
from collections import namedtuple

'''
参数设置
'''
# Training configuration (parameter settings).  Everything a reader might
# tune lives in this one dict.
params = dict(
    num_classes=8,    # emotion classes
    maxlen=50,        # padded sequence length
    batch_size=16,
    epochs=10,
    layer='lstm',     # one of 'lstm' / 'bilstm' / 'gru'
    train_data_path='/home/jeffmxh/ML_learn/emotion_classify/train_data/train_data.xlsx',
    word2vec_path='/home/jeffmxh/ML_learn/emotion_classify/word2vec/word2vec_wx',
    model_path='models/2gru_seven_senti_0509_train.json',
    weight_path='models/2gru_seven_senti_0509_train_weight.h5',
    embedding_train=False,  # keep pretrained embeddings frozen
)

Train_Set = namedtuple('Train_Set', 'x y xt yt xa ya')

# Emotion tag -> class index, built once at definition time (the original
# rebuilt the zip/dict on every call).
_EMO_TO_ID = {
    'none': 0, 'disgust': 1, 'like': 2, 'happiness': 3,
    'sadness': 4, 'surprise': 5, 'anger': 6, 'fear': 7,
}

def trans_emo(emo):
    """Map an emotion tag string to its integer class id (0-7).

    Raises KeyError for unknown tags, matching the original behaviour.
    """
    return _EMO_TO_ID[emo]

def load_word2vec(model_path):
    """Load a pretrained gensim word2vec model and its embedding matrix.

    NOTE(review): a second, try/except-guarded definition of this name exists
    in cell In [18]; whichever cell ran last wins.  This version raises on
    any load failure instead of returning sentinels.
    """
    wordvec_model = gensim.models.word2vec.Word2Vec.load(model_path)
    # NOTE(review): .wv.syn0 is deprecated in newer gensim (use .wv.vectors)
    # — TODO confirm the installed version.
    wordvec_weight = wordvec_model.wv.syn0
    return wordvec_model, wordvec_weight

def preprocess_data(file_path, wordvec_model, params):
    """Load labelled training data and build fixed-length id sequences.

    Parameters
    ----------
    file_path : str
        xlsx with `sentence` and `emotion_1` columns.
    wordvec_model : gensim word2vec model
        Supplies the vocabulary mapping tokens to embedding indices.
    params : dict
        Uses params['maxlen'] as the padded length.

    Returns
    -------
    pandas.DataFrame
        Columns: 'mark' (class id), 'words' (tokens), 'sent_rev'
        (reversed padded id sequence) and 'sent' (forward padded ids
        concatenated with the reversed ids, length 2 * maxlen).
    """
    raw_data = pd.read_excel(file_path)
    print('Data loaded!')
    data = pd.DataFrame({'sent' : raw_data.sentence,
                         'mark' : raw_data.emotion_1 })
    data['mark'] = data['mark'].apply(trans_emo)
    print('emotion_tag transformed!')
    cw = lambda x: list(jieba.cut(str(x)))  # tokenizer (jieba segmentation)
    data['words'] = data['sent'].apply(cw)
    vocab = dict([(k, v.index) for k, v in wordvec_model.wv.vocab.items()])
    # Bug fix: the original `not (vocab.get(w) is None) and vocab.get(w) or 0`
    # looked the word up twice via a fragile and/or chain; dict.get with a
    # default is equivalent (a word at index 0 mapped to 0 either way).
    word_to_id = lambda word: vocab.get(word, 0)
    words_to_ids = lambda words: list(map(word_to_id, words))
    data['sent'] = data['words'].apply(words_to_ids)
    reverse_seq = lambda id_seq: id_seq[::-1]
    concat_seq = lambda a, b: list(np.hstack((a, b)))
    print("Pad sequences (samples x time)")
    # Reversed branch: pre-padded (keras default), then flipped so the
    # padding ends up trailing after the reversal.
    data['sent_rev'] = list(sequence.pad_sequences(data['sent'], maxlen=params['maxlen']))
    data['sent_rev'] = data['sent_rev'].apply(reverse_seq)
    data['sent'] = list(sequence.pad_sequences(data['sent'], maxlen=params['maxlen'], padding='post', truncating='post'))
    data['sent'] = data['sent'].combine(data['sent_rev'], func=concat_seq)
    return data

def split_data(train_data, num_classes=8):
    """Split preprocessed data into train (even rows), test (odd rows), all.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Output of preprocess_data (needs 'sent' and 'mark' columns).
    num_classes : int, optional
        Number of one-hot classes.  Bug fix: the original referenced an
        undefined global `num_classes` (a latent NameError — the call site
        is commented out); it is now a parameter defaulting to 8, matching
        params['num_classes'].

    Returns
    -------
    Train_Set
        Named tuple (x, y, xt, yt, xa, ya) with one-hot y/yt.
    """
    x = np.array(list(train_data['sent']))[::2]   # training set: even-indexed rows
    y = np.array(list(train_data['mark']))[::2]
    y = keras.utils.to_categorical(y, num_classes)
    xt = np.array(list(train_data['sent']))[1::2]  # test set: odd-indexed rows
    yt = np.array(list(train_data['mark']))[1::2]
    yt = keras.utils.to_categorical(yt, num_classes)
    xa = np.array(list(train_data['sent']))  # full set (labels left as ints)
    ya = np.array(list(train_data['mark']))
    return Train_Set(x, y, xt, yt, xa, ya)

def build_model(wordvec_weight, params):
    """Build and compile the sentiment classifier.

    Architecture: pretrained word embedding (frozen unless
    params['embedding_train']) -> dropout -> one recurrent layer selected by
    params['layer'] -> dropout -> dense softmax over params['num_classes'].

    Raises
    ------
    ValueError
        If params['layer'] is not 'lstm', 'bilstm' or 'gru'.  (Bug fix: the
        original's independent `if`s silently built a model with NO recurrent
        layer for an unknown value.)
    """
    word_embedding_layer = Embedding(
        input_dim=wordvec_weight.shape[0],
        output_dim=wordvec_weight.shape[1],
        weights=[wordvec_weight],
        trainable=params['embedding_train'])
    print('Build model...')
    model = Sequential()
    model.add(word_embedding_layer)
    model.add(Dropout(0.1))
    if params['layer'] == 'lstm':
        model.add(LSTM(128, return_sequences=False))
    elif params['layer'] == 'bilstm':
        model.add(Bidirectional(LSTM(128, return_sequences=False)))
    elif params['layer'] == 'gru':
        model.add(GRU(128, return_sequences=False))
    else:
        raise ValueError("unknown layer type: %r (expected 'lstm', 'bilstm' or 'gru')"
                         % (params['layer'],))
    model.add(Dropout(0.5))
    model.add(Dense(params['num_classes']))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=[metrics.mae, metrics.categorical_accuracy])
    return model
    
# Pipeline entry: load embeddings, then preprocess the training data.
# The remaining steps (split, build, train, evaluate, save) are disabled for
# this run — re-enable them to retrain.
print('try loading pretrained word2vec model.')
wordvec_model, wordvec_weight = load_word2vec(params['word2vec_path'])
data_all = preprocess_data(params['train_data_path'], wordvec_model, params)
# train_data = split_data(data_all)
# model = build_model(wordvec_weight, params)

# model.summary()

# model.fit(train_data.x, train_data.y, batch_size=params['batch_size'], epochs=params['epochs'], validation_data=(train_data.xt, train_data.yt))

# scores = model.evaluate(train_data.xt, train_data.yt, verbose=0)
# print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
# print("%s: %.2f%%" % (model.metrics_names[2], scores[2]*100))

# json_string = model.to_json()
# with open(params['model_path'], 'wt') as f:
#     f.write(json_string)
# model.save_weights(params['weight_path'])


try loading pretrained word2vec model.
Data loaded!
emotion_tag transformed!
Pad sequences (samples x time)

In [20]:
data_all


Out[20]:
mark sent words sent_rev
0 0 [44916, 8803, 0, 20, 6245, 0, 18, 0, 0, 0, 0, ... [今儿, 老爸, 逮着, 我, 一顿, 狠念, !] [18, 0, 6245, 20, 0, 8803, 44916, 0, 0, 0, 0, ...
1 1 [0, 20, 356, 8553, 100, 100, 0, 0, 0, 0, 0, 0,... [念得, 我, 各种, 烦躁, …, …] [100, 100, 8553, 356, 20, 0, 0, 0, 0, 0, 0, 0,...
2 1 [0, 167, 897, 109, 839, 1297, 1, 5379, 332, 5,... [我要, 不要, 考虑, 下, 降低, 回家, 的, 频率, 啊, , 回来, 一次, 吵... [18, 18, 18, 36167, 704, 183, 5, 183, 303, 882...
3 0 [9, 897, 451, 31, 0, 0, 239970, 0, 1233, 67, 6... [在, 考虑, 是否, 要, 回刷, 2.3, Rom, ,, 黑, /, 锁屏, 状态, ... [55, 2262, 7258, 4768, 0, 772, 63987, 67, 1233...
4 0 [0, 39643, 453, 100, 0, 0, 0, 0, 0, 0, 0, 0, 0... [发烫, 耗电, 快, …] [100, 453, 39643, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
5 1 [37, 48, 20, 3461, 3364, 18, 0, 0, 0, 0, 0, 0,... [这, 让, 我, 蛋, 疼, !] [18, 3364, 3461, 20, 48, 37, 0, 0, 0, 0, 0, 0,...
6 2 [511, 0, 233, 0, 11296, 1, 7819, 18, 0, 0, 0, ... [不过, ,, 喜欢, 4.0, 原生态, 的, 界面, !] [18, 7819, 1, 11296, 0, 233, 0, 511, 0, 0, 0, ...
7 0 [1867, 1, 335, 20, 84, 33, 7, 707, 21451, 0, 1... [去年, 的, 今天, 我, 去, 上, 了, 政治, 辅导班, ,, 回来, 看, 芒果,... [3, 9115, 7, 25531, 954, 9, 67355, 7, 9193, 37...
8 2 [108, 31, 2817, 20, 1134, 0, 1, 14370, 7, 0, 2... [又, 要, 重复, 我, 每年, 必说, 的, 台词, 了, ,, 2010, 就要, 过... [3, 149, 9113, 64, 20, 0, 7, 710, 918, 2882, 0...
9 2 [1375, 0, 149, 4827, 64, 326, 3, 0, 0, 0, 0, 0... [至少, ,, 它, 看上去, 很, 美, 。] [3, 326, 64, 4827, 149, 0, 1375, 0, 0, 0, 0, 0...
10 3 [676, 15, 0, 2634, 0, 130293, 18, 0, 0, 0, 0, ... [欢迎, 你, ,, 2011, ,, 新年快乐, !] [18, 130293, 0, 2634, 0, 15, 676, 0, 0, 0, 0, ...
11 0 [939, 41, 0, 0, 0, 13, 0, 759, 39946, 3, 0, 0,... [下午, 到, 台里, ,, 吴凤花, 和, 方亚芬, 正在, 换装, 。] [3, 39946, 759, 0, 13, 0, 0, 0, 41, 939, 0, 0,...
12 0 [20, 0, 57125, 0, 23, 82, 62408, 229, 49, 9597... [我, 刚过去, 打个招呼, ,, 就, 被, 德才, 老师, 说, 耽误, 他们, 录像, 。] [3, 13502, 111, 9597, 49, 229, 62408, 82, 23, ...
13 4 [20, 742, 32754, 179, 3, 0, 0, 0, 0, 0, 0, 0, ... [我, 那个, 冤, 呢, 。] [3, 179, 32754, 742, 20, 0, 0, 0, 0, 0, 0, 0, ...
14 0 [20, 49, 20, 193, 2639, 3, 0, 0, 0, 0, 0, 0, 0... [我, 说, 我, 没, 采访, 。] [3, 2639, 193, 20, 49, 20, 0, 0, 0, 0, 0, 0, 0...
15 0 [39, 49, 2553, 22, 2489, 0, 9597, 514, 2462, 3... [他, 说, 聊天, 也, 不行, ,, 耽误, 穿, 服装, 。] [3, 2462, 514, 9597, 0, 2489, 22, 2553, 49, 39...
16 0 [1129, 3830, 49, 1723, 58, 568, 23, 70, 1273, ... [方, 姐姐, 说, 腿, 好, 的话, 就, 来, 段, 对, 花枪, 了, 。] [3, 7, 220999, 34, 1273, 70, 23, 568, 58, 1723...
17 4 [58, 5705, 5, 426, 207, 24, 2614, 15, 5, 426, ... [好, 难过, , 为什么, 每天, 都, 回忆, 你, , 为什么, 每天, 都, 想... [158, 9, 65, 306, 5, 7, 890, 825, 5, 0, 58, 24...
18 4 [426, 20, 1, 2129, 231, 733, 179, 0, 89045, 33... [为什么, 我, 的, 房间, 那么, 干, 呢, ,, 嗓子疼, 啊, ~, !] [18, 135, 332, 89045, 0, 179, 733, 231, 2129, ...
19 4 [0, 311, 62, 1323, 163, 7, 20, 2317, 15786, 1,... [今早, 起来, 更, 体现, 出, 了, 我, 昨天, 排练, 的, 成果, ,, 腰酸背... [18, 18, 18, 21432, 0, 1991, 1, 15786, 2317, 2...
20 0 [511, 5110, 229, 0, 2803, 49, 1, 15230, 20, 22... [不过, 彭, 老师, 今早, 出门, 说, 的, 那句话, 我, 也, 赞同, ......] [0, 11228, 22, 20, 15230, 1, 49, 2803, 0, 229,...
21 2 [3959, 1073, 70087, 38863, 5, 66911, 0, 313430... [这家, 位于, 槟城, George, , Town, ,, Tunes, , Hot... [1152, 15561, 1149, 3, 1196, 64, 24, 9612, 474...
22 4 [335, 6, 89, 1282, 1460, 13, 7077, 1, 1657, 84... [今天, 是, 个, 值得, 开心, 和, 崩溃, 的, 日子, ~] [840, 1657, 1, 7077, 13, 1460, 1282, 89, 6, 33...
23 4 [6555, 1319, 38436, 0, 0, 20, 5187, 24, 163, 7... [大腿, 再次, 拉伤, ,, 痛到, 我, 眼泪, 都, 出, 了, …] [100, 7, 163, 24, 5187, 20, 0, 0, 38436, 1319,...
24 4 [1620, 3691, 1385, 20051, 0, 7077, 100, 0, 0, ... [回到, 宿舍, 连续, 拉肚子, ,, 崩溃, …] [100, 7077, 0, 20051, 1385, 3691, 1620, 0, 0, ...
25 3 [2097, 455, 0, 0, 840, 0, 0, 0, 0, 0, 0, 0, 0,... [接下来, 继续, 红红的, 红歌赛, ~] [840, 0, 0, 455, 2097, 0, 0, 0, 0, 0, 0, 0, 0,...
26 5 [53093, 0, 6, 20, 19, 1848, 6483, 10509, 7, 20... [OMG, ,, 是, 我, 不, 关心, 八卦, 太久, 了, 吗, ?, !] [18, 36, 204, 7, 10509, 6483, 1848, 19, 20, 6,...
27 0 [3372, 65, 9, 279, 22749, 1827, 129270, 16123,... [白天, 还, 在, 跟, 小黑, 讨论, Jensen, 过生日, ,, 跟, 大, J,... [18, 18, 7, 311, 4384, 108, 1045, 228647, 117,...
28 5 [471, 2188, 0, 70, 279, 20, 49, 6483, 8, 15357... [结果, 刚刚, KANA, 来, 跟, 我, 说, 八卦, :, J2, 周末, 一起, ... [18, 18, 18, 7, 2014, 88, 21, 82, 3015, 0, 25,...
29 5 [388, 53, 509, 20, 37, 854, 474, 23665, 23665,... [谁, 能, 告诉, 我, 这, 是不是, 真的, 啊啊啊, 啊啊啊, ,, 我要, 疯狂,... [135, 135, 135, 135, 135, 135, 135, 135, 135, ...
... ... ... ... ...
16676 0 [60, 770, 4062, 3, 4062, 269, 4566, 1, 1216, 0... [3, 学会, 宽容, 。, 宽容, 像, 春天, 的, 阳光, ,, 照耀, 别人, 也,... [3, 42, 1229, 22, 308, 16957, 0, 1216, 1, 4566...
16677 0 [99, 770, 21691, 3, 0, 0, 142, 0, 3, 0, 0, 0, ... [4, 学会, 隐忍, 。, 隐而, 不发, 不是, 不发, 。] [3, 0, 142, 0, 0, 3, 21691, 770, 99, 0, 0, 0, ...
16678 0 [85, 770, 419, 3, 681, 13, 606, 24, 6, 419, 1,... [5, 学会, 简单, 。, 美丽, 和, 幸福, 都, 是, 简单, 的, ,, 把握, ... [3, 419, 19, 76, 419, 7, 1902, 0, 1, 419, 6, 2...
16679 0 [141, 770, 14911, 1246, 3, 0, 1559, 0, 209, 23... [6, 学会, 换位, 思考, 。, 换个, 角度, ,, 世界, 就, 会, 大不相同, ..] [0, 38220, 27, 23, 209, 0, 1559, 0, 3, 1246, 1...
16680 0 [0, 5, 611, 0, 1141, 47977, 0, 0, 0, 0, 0, 0, ... [thx, , @, 王力, 群, Alex] [47977, 1141, 0, 611, 5, 0, 0, 0, 0, 0, 0, 0, ...
16681 0 [8756, 734, 17, 18663, 16830, 7212, 5, 4278, 9... [东南, 网, -, 海峡, 都市报, 讯, , 昨日, 下午, 2, 时许, ,, 送,... [3, 3732, 25259, 0, 37292, 11635, 3277, 82, 0,...
16682 0 [2344, 325, 9, 1111, 55737, 1, 6443, 33, 0, 21... [事故, 发生, 在, 往, 二环路, 的, 车道, 上, ,, 肇事, 小车, 停, 在,... [3, 650, 13293, 82, 235, 19410, 0, 33, 23097, ...
16683 0 [29788, 1083, 1749, 49, 0, 728, 330, 1749, 23,... [目击者, 林, 女士, 说, ,, 当时, 万, 女士, 就, 站, 在, 斑马线, 上,... [3, 7, 79980, 23, 0, 0, 0, 0, 0, 6340, 110, 50...
16684 0 [32255, 35660, 21085, 17, 6300, 154820, 3733, ... [太古, 三里屯, 北区, -, 围, 合式, 分布, 的, 旗舰, 街区, ,, 商铺, ... [3, 272, 92, 1204, 6870, 2488, 6447, 6560, 125...
16685 0 [412, 8, 0, 25011, 4, 44758, 4, 5316, 11868, 4... [品牌, :, 阿一, 鲍鱼, 、, 阿玛尼, 、, 巴黎, 世家, 、, 浪凡, 、, 范... [61333, 50, 1, 20, 4, 2549, 4, 181783, 4, 4593...
16686 0 [20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... [我, ................] [0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
16687 0 [10773, 158484, 0, 10773, 138953, 0, 10773, 20... [轻度, 失忆症, ,, 轻度, 妄想症, ,, 轻度, 自闭症, ,, 轻度, 抑郁症, 。] [3, 9723, 10773, 0, 20241, 10773, 0, 138953, 1...
16688 0 [2211, 1838, 4928, 163498, 0, 20, 24, 23765, 2... [一个月, 配, 三次, 锁匙, ,, 我, 都, 唔, 知, 我, 搞, 乜, !] [18, 90617, 1936, 20, 2042, 23765, 24, 20, 0, ...
16689 0 [0, 1227, 174, 172201, 4681, 2638, 0, 20, 42, ... [我仲, 以为, 前, 几晚, 醉, 左, 翻黎系, 我, 自己, 开门, ,, 原来, 系... [18, 24871, 8164, 21, 480, 3239, 1260, 1115, 0...
16690 0 [13100, 0, 118411, 614, 2638, 37456, 23, 4681,... [唉, ,, 千杯不醉, 变, 左, 三杯, 就, 醉, !] [18, 4681, 23, 37456, 2638, 614, 118411, 0, 13...
16691 0 [726, 42, 4681, 2638, 24, 23765, 2042, 0, 0, 5... [连, 自己, 醉, 左, 都, 唔, 知, ,, 翻时, 好, 清醒, 甘, ,, 训醒,... [1152, 43810, 2638, 1149, 18, 84920, 532, 23, ...
16692 0 [1844, 0, 10642, 638, 200, 593, 0, 26, 3493, 2... [近日, ,, 甘肃省, 政府, 向, 14, 个市, (, 州, ), 、, 省政府, 有... [3, 5631, 317, 157, 3723, 1194, 0, 9835, 1, 10...
16693 2 [0, 20, 40757, 15, 474, 1474, 502, 249, 9902, ... [金厉旭, 我, 只在乎, 你, 真的, 好好, 听, 哦, 崇拜, !] [18, 9902, 249, 502, 1474, 474, 15, 40757, 20,...
16694 2 [939, 2351, 24, 8546, 29879, 0, 199, 1684, 0, ... [下午, 场, 都, 录, 嘞, ,, 现在, 休息, 休惨, 了, 哈哈哈, 哈哈, ,,... [100, 147330, 6, 1, 11737, 78, 20, 48, 335, 87...
16695 0 [9649, 0, 124, 670, 0, 9482, 145, 110, 7825, 5... [玩游戏, ,, 看, 电影, ,, 小姑娘, 吃, 过, 西瓜, 摸, 着, 肚子, 睡着... [100, 100, 7, 8399, 3311, 75, 5085, 7825, 110,...
16696 2 [0, 23, 2436, 0, 0, 6, 929, 1, 3186, 135, 20, ... [一看, 就, 舒服, ,, 这才, 是, 完美, 的, 夏天, ~, 我, 想, 放暑假, !] [18, 95867, 117, 20, 135, 3186, 1, 929, 6, 0, ...
16697 0 [230, 388, 117, 95867, 36, 0, 0, 0, 0, 0, 0, 0... [还有, 谁, 想, 放暑假, ?] [36, 95867, 117, 388, 230, 0, 0, 0, 0, 0, 0, 0...
16698 6 [1723, 33, 1, 5589, 58, 761, 5, 3761, 5, 3761,... [腿, 上, 的, 伤口, 好, 明显, , T, , T, , 这, 又, 不是, ... [36, 204, 142, 7, 777, 0, 2242, 4991, 332, 195...
16699 5 [1955, 7, 35320, 65, 95589, 131, 146, 3919, 6,... [戴, 了, 护膝, 还, 摔成, 这样, 那, 冲击, 是, 有, 多, 大, [, 泪, ]] [1152, 5726, 1149, 68, 46, 16, 6, 3919, 146, 1...
16700 0 [72570, 49, 8, 16, 390, 6, 5411, 392, 1, 0, 16... [麦兜, 说, :, 有, 事情, 是, 要说, 出来, 的, ,, 不要, 等, 着, 对... [3, 1327, 6, 748, 0, 5559, 13, 6063, 6, 768, 3...
16701 0 [35, 74, 5908, 2589, 1, 4600, 13, 20, 49, 72, ... [一个, 做, 二手车, 生意, 的, 哥哥, 和, 我, 说, 时, 我, 还, 不信, ... [3, 7, 474, 6, 2410, 0, 0, 0, 65, 20, 72, 49, ...
16702 6 [1173, 16, 7674, 3093, 23, 27, 16, 2858, 0, 31... [每次, 有, 新政, 出台, 就, 会, 有, 一批, 人富, 起来, !] [18, 311, 0, 2858, 16, 27, 23, 3093, 7674, 16,...
16703 6 [1149, 21945, 1152, 1149, 5475, 1152, 0, 0, 0,... [[, 惊恐, ], [, 愤怒, ]] [1152, 5475, 1149, 1152, 21945, 1149, 0, 0, 0,...
16704 4 [327, 4105, 0, 5201, 8600, 3, 0, 0, 0, 0, 0, 0... [人生, 不止, ,, 寂寞, 不已, 。] [3, 8600, 5201, 0, 4105, 327, 0, 0, 0, 0, 0, 0...
16705 4 [5201, 327, 158, 46989, 0, 5201, 6, 158, 787, ... [寂寞, 人生, 爱, 无休, ,, 寂寞, 是, 爱, 永远, 的, 主题, 、, 我, ... [3, 15, 117, 9, 24, 0, 8089, 1, 20, 13, 20, 11...

16706 rows × 4 columns


In [28]:
from multiprocessing import Pool
# NOTE(review): the loop that built `w` is commented out, so `w` below comes
# from stale kernel state — this cell fails under Restart & Run All.
# w = []  # collect every token into one flat list
# for i in data_all['words']:
#     w.extend(i)
dict_w = pd.DataFrame(pd.Series(w).value_counts())  # word frequency table
dict_w['id']=list(range(1,len(dict_w)+1))  # ids 1..N in descending-frequency order
# Bare `dict_w` mid-cell produces no output (only the last expression is echoed).
dict_w
pool = Pool(16)  # NOTE(review): pool is never closed/joined — leaks worker processes
get_sent = lambda x: list(dict_w['id'][x])
# NOTE(review): `data` is undefined in this cell — probably meant data_all.
data['sent'] = data['words'].apply(get_sent)  # too slow; superseded by pool.map below
# NOTE(review): redefines the lambda above — same name, different object.
def get_sent(x):
    return list(dict_w['id'][x])
data_all['sent'] = pool.map(get_sent, data_all['words'])

In [38]:
dict_w


Out[38]:
0 id
20362 1
13430 2
9686 3
5196 4
5082 5
4327 6
3370 7
3209 8
2567 9
2393 10
1955 11
[ 1938 12
] 1937 13
1558 14
1503 15
1478 16
1418 17
1397 18
1384 19
1329 20
~ 1289 21
1197 22
1190 23
1172 24
1142 25
一个 1073 26
1011 27
965 28
, 941 29
898 30
... ... ...
問題 1 37354
菠萝包 1 37355
开门见山 1 37356
柿子 1 37357
要个 1 37358
静思 1 37359
孔文燕 1 37360
川柯南 1 37361
人敢 1 37362
收听 1 37363
吐出来 1 37364
装箱 1 37365
再陷 1 37366
开卷 1 37367
装机容量 1 37368
攀缘 1 37369
LGBT 1 37370
受得住 1 37371
日次 1 37372
戴维 1 37373
哈喇子 1 37374
中轴线 1 37375
挑灯 1 37376
阎王爷 1 37377
天一 1 37378
本色 1 37379
人超 1 37380
不提 1 37381
逃命 1 37382
相接 1 37383

37383 rows × 2 columns


In [31]:
import errno
import os
print(FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), 'foobar'))


[Errno 2] No such file or directory: 'foobar'

In [14]:
train_data.y


Out[14]:
array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])