Table of Contents

    
    
    In [1]:
    import pandas as pd
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    import numpy as np
    import time, os, json
    # Customized tools
    from lib import *
    from jieba import cut_for_search, add_word, suggest_freq, del_word
    
    from IPython.display import clear_output
    
    
    
    
    Model loaded succeed
    >> Synonyms on loading ...
    
    
    
    In [2]:
    # Load the dump of classifier results to be reviewed below.
    result_dump = 'result_dump/test.csv'
    result = pd.read_csv(result_dump)

    def _load_json(path):
        """Read one JSON corpus file and return the parsed object.

        Uses a context manager so the file handle is closed deterministically
        (the previous `json.load(open(...))` form leaked the handle).
        """
        with open(path, 'r') as f:
            return json.load(f)

    corpus_2_list = _load_json('Classifier/corpus2.json')
    corpus_3_list = _load_json('Classifier/corpus3.json')
    corpus_add_2 = _load_json('Classifier/corpus_add2.json')
    corpus_add_3 = _load_json('Classifier/corpus_add3.json')

    # Transformer is star-imported from the project-local `lib` module above.
    trans = Transformer(corpus_2_list, corpus_3_list,
            corpus_add_2, corpus_add_3)
    
    
    
    
    Building prefix dict from the default dictionary ...
    Loading model from cache /tmp/jieba.cache
    Loading model cost 1.777 seconds.
    Prefix dict has been built succesfully.
    
    
    
    In [3]:
    # Manually inspect each classified title: show its label, token set and
    # feature vector, then wait for Enter.  Typing "stop" ends the session.
    # (The former `if True:` wrapper was a dead conditional and is removed;
    # the loop body is unchanged.)
    n_total = result.shape[0]
    for i, row in result.iterrows():
        # Earlier filter, kept for reference — restricts review to rows
        # labelled 100 whose second feature is positive:
        #     if (row.label == 100) and (trans.get_feature(row.title)[1] > 0):
        clear_output()
        print('TITLE: {}'.format(row.title))
        print('LABEL: {}'.format(row.label))
        print('TOKEN:\n{}'.format(trans.tokenize(row.title)))
        print('FEATURE: {}'.format(trans.get_feature(row.title)))
        # i is the DataFrame index; assumes a default RangeIndex so i+1 is
        # the 1-based position — TODO confirm against result_dump.
        stdin = input('=== {}/{}. Press Enter to Continue ==='.format(i+1, n_total))
        if stdin == 'stop':
            break
    
    
    
    
    TITLE: 意外的恋爱时光_电影_高清1080P在线观看_腾讯视频
    LABEL: 100.0
    TOKEN:
    {'恋爱', '电影', '腾讯', '时光', '观看', '_', '1080p', '意外', '在线', '高清', '视频', '的'}
    FEATURE: [0 0 4 0]
    === 33/100. Press Enter to Continue ===stop
    
    
    
    In [97]:
    # Demonstrate jieba's user-dictionary controls: after del_word, the
    # search-mode tokenizer splits '台中' into single characters; after
    # add_word it is kept as one token.  The sample sentence is hoisted
    # into a variable so the two calls provably use identical input.
    sample = '「台中」正确应该不会被切开'
    del_word('台中')
    print('/'.join(cut_for_search(sample)))
    add_word('台中')
    print('/'.join(cut_for_search(sample)))
    
    
    
    
    「/台/中/」/正确/应该/不会/被/切开
    「/台中/」/正确/应该/不会/被/切开
    
    
    
    In [8]:
    # Re-load the level-2 additional corpus on its own for ad-hoc inspection
    # (same file the In[2] cell reads into corpus_add_2).
    import json

    with open('Classifier/corpus_add2.json') as corpus_file:
        corp_add2 = json.load(corpus_file)
    
    
    
    In [ ]:
    new_list = ['内幕', '不可告人', '嘿咻', '嘿嘿', '']
    
    
    
    In [9]:
    r.append()
    
    
    
    
    Out[9]:
    ['pa']