In [1]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np
import time, os, json
# Customized tools
from lib import *
from jieba import cut_for_search, add_word, suggest_freq, del_word
from IPython.display import clear_output
In [2]:
# Load the labelled classification results and the corpus word lists,
# then build the Transformer used for tokenization / feature extraction
# in the review loop below.
result_dump = 'result_dump/test.csv'
result = pd.read_csv(result_dump)

def _load_json(path):
    """Read one JSON corpus file, closing the file handle deterministically.

    The original `json.load(open(...))` pattern leaked the handle until GC.
    """
    with open(path, 'r') as f:
        return json.load(f)

corpus_2_list = _load_json('Classifier/corpus2.json')
corpus_3_list = _load_json('Classifier/corpus3.json')
corpus_add_2 = _load_json('Classifier/corpus_add2.json')
corpus_add_3 = _load_json('Classifier/corpus_add3.json')
trans = Transformer(corpus_2_list, corpus_3_list,
                    corpus_add_2, corpus_add_3)
In [3]:
# Manually review each classified title: show its label, tokenization and
# feature vector, then wait for operator input before moving on.
# Type 'stop' at the prompt to abort the pass early.
n_total = result.shape[0]
for i, row in result.iterrows():
    # Optional filter for a targeted pass, e.g. only label-100 rows whose
    # second feature is positive:
    # if (row.label == 100) and (trans.get_feature(row.title)[1] > 0):
    clear_output()
    print('TITLE: {}'.format(row.title))
    print('LABEL: {}'.format(row.label))
    print('TOKEN:\n{}'.format(trans.tokenize(row.title)))
    print('FEATURE: {}'.format(trans.get_feature(row.title)))
    # NOTE(review): `i` is the DataFrame index label; `i+1` is a correct
    # progress counter only for the default RangeIndex from read_csv.
    stdin = input('=== {}/{}. Press Enter to Continue ==='.format(i+1, n_total))
    if stdin == 'stop':
        break
In [97]:
# Sanity-check jieba's user-dictionary controls: with the word removed the
# sentence may be segmented through it; after re-adding it, '台中' should be
# kept as a single token. (Original comment says: "台中" should correctly
# not be split apart.)
sentence = '「台中」正确应该不会被切开'
del_word('台中')
print('/'.join(cut_for_search(sentence)))
add_word('台中')
print('/'.join(cut_for_search(sentence)))
In [8]:
# Inspect the 2-gram additional corpus by itself.
# `json` is already imported in the first cell; the redundant mid-notebook
# re-import has been removed (imports belong in one top cell).
with open('Classifier/corpus_add2.json', 'r') as f:
    corp_add2 = json.load(f)
In [ ]:
new_list = ['内幕', '不可告人', '嘿咻', '嘿嘿', '']
In [9]:
r.append()
Out[9]: