notebook.community

Edit and run



In [ ]:

    
from bs4 import BeautifulSoup
import jieba ,util
import csv ,json
# Load the custom user dictionary into jieba before any segmentation happens.
jieba.load_userdict("./new.dict_all")
# Stop-word collection from the project-local `util` helper; below it is only
# used for membership tests (`item in stop_words`).
# NOTE(review): presumably a set/list of unicode tokens -- confirm in util.
stop_words = util.load_stop_words('stopword.txt')
# Running row index: written as the prefix of each output line and
# incremented once per CSV row processed.
count = 0

import csv
# Convert every row of the non-spam CSV into one line of space-separated
# jieba tokens, prefixed with the running row index `count`.
#
# NOTE(review): this cell appears to target Python 2 -- the CSV is opened in
# binary mode for the csv module and the encoded token string is concatenated
# with plain string literals before being written.
with open('./data/201505201506_non_spam.csv', 'rb') as csvfile:
    reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONE)
    # Open the output through a context manager so it is flushed and closed
    # when the loop finishes, and also if processing a row raises.
    with open('./data/new_parsed_no_spam.txt', 'w') as fw:
        for row in reader:
            # Extract the plain text of the 'content' column.
            # NOTE(review): no parser is passed to BeautifulSoup, so bs4 picks
            # whichever one is installed -- consider pinning one explicitly.
            soup = BeautifulSoup(row['content'])

            # Line prefix: row index followed by fixed-width padding.
            fw.write(str(count) + ':              ')

            # Keep tokens of at least two characters that are not stop words.
            # Newlines inside a token become spaces, carriage returns are
            # dropped, and every kept token is followed by a single space.
            items = ''.join(
                item.replace('\n', ' ').replace('\r', '') + ' '
                for item in jieba.cut(soup.getText())
                if len(item) >= 2 and item not in stop_words
            )

            fw.write(items.encode('utf-8') + ' ')
            fw.write('\r\n')
            count = count + 1
#         print count ,



In [ ]:



In [ ]: