In [ ]:
import csv
import json

import jieba
from bs4 import BeautifulSoup

import util

# Script: segment the HTML "content" column of the non-spam CSV with jieba
# and write one numbered line of filtered tokens per CSV row.
#
# NOTE(review): this is Python 2 code (csv input opened in binary mode,
# tokens encoded to a UTF-8 byte string before writing). Porting to
# Python 3 requires text-mode files with an explicit encoding.

jieba.load_userdict("./new.dict_all")
stop_words = util.load_stop_words('stopword.txt')

# Running row index; also used as the line prefix in the output file.
count = 0

# Both files are managed by the same `with` so the output file is flushed
# and closed even if a row fails part-way through.
with open('./data/201505201506_non_spam.csv', 'rb') as csvfile, \
        open('./data/new_parsed_no_spam.txt', 'w') as fw:
    reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONE)
    for row in reader:
        # DictReader fills fields missing from a short row with None;
        # fall back to an empty document instead of crashing on it.
        # NOTE(review): no parser is named here, so bs4 picks whichever
        # one is installed -- consider pinning one for reproducible output.
        soup = BeautifulSoup(row['content'] or '')

        # Keep tokens of at least two characters that are not stop words;
        # line breaks inside a token are flattened so each row stays on
        # one output line. Every kept token is followed by a space.
        tokens = [
            item.replace('\n', ' ').replace('\r', '') + ' '
            for item in jieba.cut(soup.getText())
            if len(item) >= 2 and item not in stop_words
        ]
        items = ''.join(tokens)

        fw.write(str(count) + ': ')
        fw.write(items.encode('utf-8') + ' ')
        fw.write('\r\n')
        count = count + 1
In [ ]:
In [ ]: