In [1]:
from pythainlp.rank import rank
from pythainlp.segment import segment
import json
import codecs
import os
from pathlib import Path
import re
# Punctuation characters treated as token separators when splitting page text.
deli = ",!?&()[]“”-@'><#;\""
In [5]:
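# Disabled: optional stop-word loading (Thai and English), kept for reference.
# NOTE: iterating over a file yields lines that still end with "\n"; strip
# each line before storing it, otherwise membership tests against segmented
# words will never match.
# file_stopword_th = open('./stopwords/th.txt', encoding='utf8')
# stopword_th = []
# for line in file_stopword_th:
#     stopword_th += [line];
# file_stopword_en = open('./stopwords/en.txt', encoding='utf8')
# stopword_en = []
# for line in file_stopword_en:
#     stopword_en += [line];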
In [2]:
# Build a word-frequency JSON file for every page that does not have one yet.
#
# For each ``./pages/<name>.txt`` the text is split on the characters in
# ``deli`` and on whitespace, every resulting chunk is passed to
# ``segment``, and ``rank`` of the collected words is written to
# ``./pages_json/<name>.json``.  Pages whose JSON file already exists are
# skipped, so the cell can be re-run to resume an interrupted job.

# Compile the delimiter character class once instead of rebuilding it for
# every line; re.escape quotes every character of ``deli`` that could be
# read as regex syntax inside the brackets.
delimiter_re = re.compile("[" + re.escape(deli) + "]")

for filename in os.listdir("./pages"):
    if not filename.endswith(".txt"):
        continue
    # print(filename)

    # NOTE: only the text before the FIRST dot is kept, so "a.b.txt" and
    # "a.c.txt" both map to "a.json".  Kept as-is so that already generated
    # output files are still recognised.
    json_path = './pages_json/' + filename.split('.')[0] + '.json'
    if Path(json_path).is_file():
        continue

    words = []
    # ``with`` guarantees the page file is closed even if reading fails.
    with open('./pages/' + filename, encoding='utf8') as page_file:
        for line in page_file:
            # Replacing delimiters by spaces and splitting on whitespace
            # yields the same chunks as split-filter-join-split.
            for sentence in delimiter_re.sub(" ", line).split():
                try:
                    words.extend(segment(sentence))
                except Exception:
                    # Best effort: skip chunks the segmenter cannot handle,
                    # but let KeyboardInterrupt / SystemExit propagate.
                    continue

    # no_stopword = []
    # for word in words:
    #     if word not in stopword_th and word not in stopword_en:
    #         no_stopword += [word]
    # print(rank(no_stopword))

    # Rank and serialise BEFORE opening the output file: a failure here must
    # not leave an empty or truncated .json behind, because the is_file()
    # check above would then skip this page on every later run.
    payload = json.dumps(rank(words), ensure_ascii=False, sort_keys=True)

    # Create the output directory on demand so a fresh checkout works.
    os.makedirs('./pages_json', exist_ok=True)
    with open(json_path, 'w', encoding='utf8') as json_file:
        json_file.write(payload)
In [ ]: