In [4]:
from gensim.models import Word2Vec
from pythainlp.segment import segment
from pythainlp.postaggers import tag
from pythainlp.rank import rank
In [5]:
# Load the pre-trained Thai-Wikipedia word2vec model and print the
# nearest neighbours of an example word as a sanity check.
model = Word2Vec.load("./wiki_model/wiki.th.text.model")
query_word = "แมว"  # example query word ("cat")
print(model.most_similar(query_word))
In [6]:
# Quick smoke test of pythainlp: tokenize and POS-tag one sample sentence.
sample_sentence = "เรียนภาษาญี่ปุ่น"
print(segment(sample_sentence))
print(tag(sample_sentence))
In [7]:
import re

# Delimiter characters to strip from page text before word segmentation.
deli = ',!?&()-@\'><#;"'
# NOTE(review): stopword filtering below was disabled by the original author
# and is kept only for reference.
# file_stopword_th = open('./stopwords/th.txt', encoding='utf8')
# stopword_th = []
# for line in file_stopword_th:
# stopword_th += [line];
# file_stopword_en = open('./stopwords/en.txt', encoding='utf8')
# stopword_en = []
# for line in file_stopword_en:
# stopword_en += [line];
In [8]:
import json
import codecs
import os
from pathlib import Path

# Resolve both segmenters ONCE, instead of re-importing inside the innermost
# loop on every sentence (the original retried a possibly-failing import per
# sentence; imports in a hot loop are wasted work after the first call).
try:
    from pythainlp.segment.dict import segment as segment_dict
except ImportError:
    segment_dict = None
from pythainlp.segment import segment as segment_default

# Character class matching any delimiter. re.escape escapes every delimiter
# safely (including '-', which must not form a range inside [...]); this is
# equivalent to the original hand-rolled '\\'.join escaping.
delim_pattern = re.compile("[" + re.escape(deli) + "]")

# Rank the words of every ./pages/*.txt file and dump the result as JSON
# into ./pages_csv/. Files whose output already exists are skipped, so an
# interrupted run is resumable.
for filename in os.listdir("./pages"):
    if not filename.endswith(".txt"):
        continue
    out_path = Path("./pages_csv/" + filename.split(".")[0] + ".json")
    if out_path.is_file():
        continue  # already processed
    words = []
    # Context managers guarantee the handles are closed even on error
    # (the original leaked the input handle if json.dump raised).
    with open("./pages/" + filename, encoding="utf8") as file:
        for line in file:
            # Replace delimiter characters with spaces, then segment each
            # whitespace-separated chunk into Thai words.
            line = " ".join(w for w in delim_pattern.split(line) if w)
            for sentence in line.split():
                # Prefer the dictionary-based segmenter, falling back to the
                # default one; a chunk neither can handle is skipped. Kept
                # best-effort, but no longer a bare `except:` that would
                # also swallow KeyboardInterrupt/SystemExit.
                try:
                    if segment_dict is not None:
                        words += segment_dict(sentence)
                    else:
                        words += segment_default(sentence)
                except Exception:
                    try:
                        words += segment_default(sentence)
                    except Exception:
                        pass
    with open(out_path, "w", encoding="utf8") as json_file:
        json.dump(rank(words), json_file, ensure_ascii=False, sort_keys=True)