In [4]:
# gensim word vectors plus legacy pythainlp helpers: word segmentation, POS tagging and word-frequency ranking
from gensim.models import Word2Vec
from pythainlp.segment import segment
from pythainlp.postaggers import tag
from pythainlp.rank import rank

In [5]:
# load the pre-trained Thai Wikipedia word2vec model and list the nearest neighbours of "แมว" ("cat")
model = Word2Vec.load("./wiki_model/wiki.th.text.model")
word = "แมว"
print(model.most_similar(word))


[('หอย', 0.6816743016242981), ('กระรอก', 0.6639907956123352), ('แรด', 0.6569611430168152), ('ปลา', 0.6538748741149902), ('แพะ', 0.6373416185379028), ('ควาย', 0.6290926933288574), ('กวาง', 0.6115422248840332), ('ไก', 0.6060198545455933), ('แกะ', 0.6005170345306396), ('หนอน', 0.5792226791381836)]
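
Beyond listing nearest neighbours, the loaded model can also score an explicit word pair or run a query limited to a few results. A minimal follow-up sketch; note that on gensim 4.x these calls move to model.wv.similarity / model.wv.most_similar:

# sketch only: both words appear in the neighbour list above
print(model.similarity("แมว", "ปลา"))                # cosine similarity between "cat" and "fish"
print(model.most_similar(positive=["แมว"], topn=3))  # same neighbour query, limited to the top 3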

In [6]:
# tokenize and POS-tag a short Thai sentence ("study the Japanese language")
sentence = "เรียนภาษาญี่ปุ่น"
print(segment(sentence))
print(tag(sentence))


['เรียน', 'ภาษา', 'ญี่ปุ่น']
[('เรียน', 'VACT'), ('ภาษา', 'NCMN'), ('ญี่ปุ่น', 'NPRP')]
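
The rank helper imported earlier tallies how often each token occurs in a list; the page-processing loop below serialises that frequency map to JSON. A small sketch using the sentence tokenized above (the expected output assumes rank returns a Counter-style mapping):

# sketch only: count token frequencies in the tokenized sentence
tokens = segment(sentence)
print(rank(tokens))  # e.g. Counter({'เรียน': 1, 'ภาษา': 1, 'ญี่ปุ่น': 1})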

In [7]:
import re
# characters treated as delimiters when cleaning each line of text
deli = ",!?&()-@'><#;"+'"'
# file_stopword_th = open('./stopwords/th.txt', encoding='utf8')
# stopword_th = []
# for line in file_stopword_th:
#     stopword_th += [line.strip()]
# file_stopword_en = open('./stopwords/en.txt', encoding='utf8')
# stopword_en = []
# for line in file_stopword_en:
#     stopword_en += [line.strip()]
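
If the stop-word filtering is re-enabled, loading both lists into a single set keeps the membership test in the page loop cheap. A minimal sketch, assuming the same ./stopwords/th.txt and ./stopwords/en.txt files referenced in the commented code:

# sketch only: merge both stop-word files into one set, stripping the newline from each entry
stopwords = set()
for path in ('./stopwords/th.txt', './stopwords/en.txt'):
    with open(path, encoding='utf8') as f:
        stopwords.update(line.strip() for line in f if line.strip())
# inside the loop, the filter then becomes: words = [w for w in words if w not in stopwords]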

In [8]:
import json
import codecs
import os
from pathlib import Path
for filename in os.listdir("./pages"):
    if filename.endswith(".txt"):
        print(filename)  # the file names shown in the output below
        # skip pages that already have a frequency file
        if not Path('./pages_csv/'+filename.split('.')[0]+'.json').is_file():
            words = []
            with open('./pages/'+filename, encoding='utf8') as file:
                for line in file:
                    # strip the delimiter characters and collapse the remaining pieces
                    line = ' '.join(w for w in re.split("["+re.escape(deli)+"]", line) if w)
                    for sentence in line.split():
                        # prefer the dictionary-based segmenter, fall back to the default one
                        try:
                            from pythainlp.segment.dict import segment
                            words += segment(sentence)
                        except Exception:
                            try:
                                from pythainlp.segment import segment
                                words += segment(sentence)
                            except Exception:
                                pass  # skip fragments that neither segmenter can handle
            # no_stopword = []
            # for word in words:
            #     if word not in stopword_th and word not in stopword_en:
            #         no_stopword += [word]
            # print(rank(no_stopword))
            with open('./pages_csv/'+filename.split('.')[0]+'.json', 'w', encoding='utf8') as json_file:
                json.dump(rank(words), json_file, ensure_ascii=False, sort_keys=True)


101855326672940.txt
104163463021826.txt
1055598184511954.txt
109100059125975.txt
110602052308408.txt
1111925358883046.txt
1123184674393171.txt
1159270450790969.txt
119243644772011.txt
124510714381439.txt
129643877242538.txt
1377648855890489.txt
1394743067512240.txt
140345789312504.txt
1485623941733575.txt
1488261941413456.txt
1500140420240098.txt
150108621732715.txt
1512534755682844.txt
1519938018257271.txt
155676674461979.txt
1558453351063626.txt
1564062570484273.txt
1572694813007656.txt
1574455939492691.txt
1626748197601736.txt
1629626160626907.txt
1641393576100366.txt
1653252078230326.txt
165436840538238.txt
167090343625658.txt
168969886771344.txt
170194439665786.txt
170736923096528.txt
170972093097920.txt
1725871517670091.txt
180797605341122.txt
188069131228489.txt
191338370463.txt
202094046803662.txt
208631422498510.txt
218225131704700.txt
232910783956.txt
281176591946550.txt
285482004900192.txt
286548828134144.txt
299503200237984.txt
305512749626201.txt
309225189372.txt
309681779207248.txt
319921861479845.txt
348892151974913.txt
358839384230016.txt
394418884053947.txt
412674492211861.txt
419002571536737.txt
421859467977721.txt
423078901209982.txt
427071670643918.txt
431383406951825.txt
437882833038511.txt
453272418150729.txt
465374246921091.txt
468983709913391.txt
477063055791396.txt
482368755113431.txt
486704024831288.txt
491452930867938.txt
516461878390039.txt
534452399953827.txt
538338459619512.txt
538624313005886.txt
543209025719472.txt
552580508224925.txt
611205092299538.txt
639844446082078.txt
662688547224470.txt
670232653088455.txt
685611318201690.txt
686264231422297.txt
703910556335910.txt
710234179063682.txt
728845667250612.txt
731632196883480.txt
738735629503410.txt
745150592237114.txt
765542590141017.txt
786715381395749.txt
820044538105541.txt
834742146604404.txt
840893449254902.txt
842759152469328.txt
849051415125241.txt
858015854257493.txt
859381394090978.txt
869639369745978.txt
885080744852208.txt
898665433580000.txt
913197215405395.txt
948186481887159.txt
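
Each run leaves one JSON frequency file per page under ./pages_csv. To spot-check a result, the map can be read back and sorted by count; the file name below is derived from the first page file listed above:

# read one generated frequency file back and show its ten most frequent tokens
with open('./pages_csv/101855326672940.json', encoding='utf8') as f:
    freq = json.load(f)
print(sorted(freq.items(), key=lambda kv: kv[1], reverse=True)[:10])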