notebook.community

Edit and run



In [1]:

    
from pythainlp.rank import rank
from pythainlp.segment import segment
import json
import codecs
import os
from pathlib import Path
import re
# Punctuation characters treated as token separators when cleaning page text
# (includes both ASCII quote marks and the curly double quotes).
deli = ',!?&()[]“”-@\'><#;"'



In [5]:

    
# file_stopword_th = open('./stopwords/th.txt', encoding='utf8')
# stopword_th = []
# for line in file_stopword_th:
#     stopword_th += [line];
# file_stopword_en = open('./stopwords/en.txt', encoding='utf8')
# stopword_en = []
# for line in file_stopword_en:
#     stopword_en += [line];



In [2]:

    
# Tokenize every ./pages/*.txt file and write the result of rank() over its
# tokens to ./pages_json/<name>.json. Pages that already have an output file
# are skipped, so the cell can be re-run to resume an interrupted batch.

# Character class matching any single delimiter; compiled once instead of
# being rebuilt for every line. re.escape keeps '-', '[' and ']' literal.
delimiter_pattern = re.compile("[" + re.escape(deli) + "]")

for filename in os.listdir("./pages"):
    if not filename.endswith(".txt"):
        continue

    # NOTE(review): the output name keeps only the text before the FIRST dot,
    # so "a.b.txt" and "a.c.txt" both map to "a.json" and the second one is
    # skipped. Kept as-is for compatibility with already generated outputs.
    json_path = Path('./pages_json/' + filename.split('.')[0] + '.json')
    if json_path.is_file():
        continue

    words = []
    # `with` guarantees the page file is closed even if tokenizing fails.
    with open('./pages/' + filename, encoding='utf8') as page_file:
        for line in page_file:
            # Replace delimiters by spaces, then split on whitespace.
            for sentence in delimiter_pattern.sub(' ', line).split():
                try:
                    words += segment(sentence)
                except Exception:
                    # Best effort: drop fragments the tokenizer cannot handle,
                    # but let KeyboardInterrupt / SystemExit propagate.
                    continue

    # no_stopword = []
    # for word in words:
    #     if word not in stopword_th and word not in stopword_en:
    #         no_stopword += [word]
    # print(rank(no_stopword))

    # Rank and serialize BEFORE creating the output file: a failure here must
    # not leave an empty .json behind, because the is_file() guard above would
    # then skip this page on every later run.
    payload = json.dumps(rank(words), ensure_ascii=False, sort_keys=True)
    with open(json_path, 'w', encoding='utf8') as json_file:
        json_file.write(payload)



In [ ]: