The following is a qualitative test of results returned by word2vec models generated by means of the CLTK and gensim's implementation of the algorithm. There are four models for Latin and Greek each, with the following parameters:
All the models were built with the `Word2Vec()` arguments `size=100`, `window=5`, and `min_count=5`. The code used to generate these is in the notebook </word2vec_build_model_phi5_tlg_test>. It was run on a remote server with this setup code.
In [1]:
from collections import defaultdict
from gensim.models import Word2Vec
import os
from termcolor import colored
In [2]:
# Directory passed to write_model_vectors() as the output location.
save_dir = '~/cltk_data/user_data/word2vec/'

# Saved Latin word2vec models, keyed by preprocessing variant.
# Paths still contain '~'; they are expanded where the models are loaded.
models_paths = dict(
    lemmatized_no_stops='~/cltk_data/user_data/word2vec/w2v_latin_lemmatizer_stops.model',
    lemmatized_yes_stops='~/cltk_data/user_data/word2vec/w2v_latin_lemmatizer.model',
    unlemmatized_no_stops='~/cltk_data/user_data/word2vec/w2v_latin_stops.model',
    unlemmatized_yes_stops='~/cltk_data/user_data/word2vec/w2v_latin.model',
)
In [3]:
def get_sims(model_name, word_1, word_2=None, w2v_model=None):
    """Return a headword together with its nearest neighbours in a model.

    Some headwords are queried under a different key in the lemmatized
    models (the notebook passes e.g. ``('carus', 'carus1')``), so an
    alternative key can be given as ``word_2``. It is used only when
    ``model_name`` is one of the two lemmatized variants.

    Args:
        model_name: Name of the model variant being queried
            (e.g. ``'lemmatized_no_stops'``).
        word_1: Headword. Always returned as the label; also the query
            unless ``word_2`` applies.
        word_2: Optional alternative query key for the lemmatized models.
        w2v_model: Object exposing ``most_similar(word)``. When omitted,
            the notebook-global ``model`` is used, which keeps existing
            three-argument calls working unchanged.

    Returns:
        A ``(word_1, neighbours)`` tuple, where ``neighbours`` is whatever
        ``most_similar`` returns for the chosen query key.

    Raises:
        NameError: If ``w2v_model`` is omitted and no global ``model`` exists.
    """
    lemmatized_models = ('lemmatized_no_stops', 'lemmatized_yes_stops')

    if w2v_model is None:
        # Backward-compatible fallback to the model loaded by the calling cell.
        w2v_model = model

    query = word_2 if (word_2 and model_name in lemmatized_models) else word_1
    return word_1, w2v_model.most_similar(query)
# Print every headword followed by, for each model, the model name and the
# items of its neighbour list, one per line. Headwords are passed through
# colored(..., 'red') and model names through colored(..., 'blue').
#
# word_all_models_list: despite the name, this must be a mapping (it is
#   iterated with .items()) from headword to a sequence of pairs whose first
#   element is the model name and whose second element is an iterable of
#   items to print.
# Returns nothing; all output goes to stdout via print().
#
# NOTE(review): indentation was lost in this copy of the notebook, so the
# nesting level of the trailing bare print() (a blank separator line) cannot
# be determined from here - confirm against the original cell.
def print_sims(word_all_models_list):
for headword, sims in word_all_models_list.items():
print(colored(headword, 'red'))
for sims_pair in sims:
# sims_pair[0] is the model name, sims_pair[1] its neighbour list
print(colored(sims_pair[0], 'blue'))
for sim in sims_pair[1]:
print(sim)
print()
In [4]:
# Headwords to look up in every Latin model. A plain string is used as-is for
# all models; a tuple is (headword, key to query in the lemmatized models).
words = ['amicitia', ('carus', 'carus1'), 'dignitas', 'amo', 'amor', ('industria', 'industria1'),
         'facio', 'laus', 'scribo', 'cano', 'pudor']

# headword -> list of (model_name, neighbours), one entry per model
word_all_models_list = defaultdict(list)

for model_name, model_path in models_paths.items():
    model_path = os.path.expanduser(model_path)
    # get_sims() reads the notebook-global `model`, so rebind it for each variant.
    model = Word2Vec.load(model_path)

    for word in words:
        if isinstance(word, str):
            headword, sims = get_sims(model_name, word)
        elif isinstance(word, tuple):
            headword, sims = get_sims(model_name, word[0], word[1])
        else:
            # Previously an unexpected entry silently re-used the prior result.
            raise TypeError('Expected str or tuple in `words`, got %r' % (word,))
        word_all_models_list[headword].append((model_name, sims))
In [5]:
# Show the Latin results: each headword, then each model's neighbour list.
print_sims(word_all_models_list)
laus
unlemmatized_yes_stops
('dignitas', 0.7702658176422119)
('commendatio', 0.7273139357566833)
('uirtus', 0.7205942869186401)
('prudentia', 0.7164344787597656)
('gloria', 0.7120715379714966)
('opinio', 0.709394097328186)
('auctoritas', 0.7079125642776489)
('perturbatio', 0.7022783160209656)
('disciplina', 0.7003540992736816)
('aequabilitas', 0.6998453140258789)
unlemmatized_no_stops
('grauitas', 0.7767825126647949)
('exspectatio', 0.7622722387313843)
('amplitudo', 0.7578041553497314)
('dignitas', 0.7558329105377197)
('ingeni', 0.7504878044128418)
('memoria', 0.7502184510231018)
('commendatio', 0.7450073957443237)
('perturbatio', 0.7413634061813354)
('contentio', 0.737472653388977)
('uoluntas', 0.7345441579818726)
lemmatized_yes_stops
('glorior', 0.7771285772323608)
('uirtus', 0.738848090171814)
('honor', 0.7327480316162109)
('industria1', 0.7065572738647461)
('dignitas', 0.7001360654830933)
('gloria', 0.6975167989730835)
('honesto', 0.663180947303772)
('eloquentia', 0.6575205326080322)
('studium', 0.6395516991615295)
('praestantia', 0.6373633146286011)
lemmatized_no_stops
('gloria', 0.771682858467102)
('glorior', 0.7458114624023438)
('honor', 0.7177746295928955)
('laudo', 0.6704500913619995)
('uirtus', 0.6648865342140198)
('studium', 0.6610381603240967)
('amplifico', 0.6564853191375732)
('industria1', 0.6444457769393921)
('continentia1', 0.6377541422843933)
('admirabilis', 0.6353957056999207)
carus
unlemmatized_yes_stops
('inimicus', 0.739293098449707)
('fidelis', 0.714043378829956)
('sollicitus', 0.7064939141273499)
('expertus', 0.6952502727508545)
('adflicto', 0.6927560567855835)
('dignior', 0.6921035051345825)
('cognitus', 0.682728111743927)
('carissimus', 0.6823166608810425)
('iucundus', 0.6813641786575317)
('paéne', 0.6810204982757568)
unlemmatized_no_stops
('carissimus', 0.7806285619735718)
('inimicus', 0.7400111556053162)
('unicus', 0.7288075685501099)
('uostra', 0.7263939380645752)
('disertus', 0.7260253429412842)
('superstes', 0.7259462475776672)
('maxumo', 0.7224482297897339)
('fuisti', 0.7199738621711731)
('pessume', 0.7066649794578552)
('curares', 0.7026439905166626)
lemmatized_yes_stops
('unicus', 0.665686309337616)
('iucundus', 0.6264019012451172)
('erga', 0.6138663291931152)
('beneuolens', 0.6121461391448975)
('gratus', 0.601753830909729)
('fidelitas', 0.5952837467193604)
('fidelis', 0.5939009189605713)
('tulliae', 0.5762821435928345)
('morigeror', 0.5726955533027649)
('illíus', 0.5708804130554199)
lemmatized_no_stops
('beneuolens', 0.6690277457237244)
('unicus', 0.6656185388565063)
('amicus', 0.6061148643493652)
('chrysalus', 0.605015754699707)
('erga', 0.6041693687438965)
('impertio', 0.5876685976982117)
('fidelitas', 0.5869717001914978)
('mei', 0.585202693939209)
('meum', 0.5791078209877014)
('maleuolens', 0.5764033794403076)
pudor
unlemmatized_yes_stops
('timor', 0.732406497001648)
('metus', 0.7207092046737671)
('amor', 0.6609115600585938)
('furor', 0.6386967897415161)
('dolor', 0.6124858260154724)
('ambitio', 0.6066770553588867)
('famae', 0.6056605577468872)
('pietas', 0.5939763784408569)
('honos', 0.5921438336372375)
('honor', 0.5903384685516357)
unlemmatized_no_stops
('furor', 0.6761273145675659)
('amor', 0.6753295063972473)
('tantus', 0.6703683137893677)
('dolor', 0.6549620032310486)
('metus', 0.6549262404441833)
('timor', 0.6540335416793823)
('constans', 0.6394664645195007)
('pietas', 0.6202282905578613)
('cupiditas', 0.6168038845062256)
('pudoris', 0.6138365864753723)
lemmatized_yes_stops
('timor', 0.6885049343109131)
('modestia', 0.6727569103240967)
('ignominia', 0.657335102558136)
('sollicitudo', 0.6540583968162537)
('audacia', 0.6486248970031738)
('metus', 0.6347575187683105)
('innocentia', 0.6325951814651489)
('laetitia', 0.6312640905380249)
('superbia', 0.6284981966018677)
('pertinacia', 0.6241970658302307)
lemmatized_no_stops
('uerecundia', 0.7423039674758911)
('misericordia', 0.6638351082801819)
('integritas', 0.6627805233001709)
('acerbitas', 0.6565795540809631)
('innocentia', 0.6536427736282349)
('recordatio', 0.6526570320129395)
('existimatio', 0.6291780471801758)
('dedecus', 0.6274505853652954)
('continentia1', 0.6274327039718628)
('angor', 0.6252697706222534)
cano
unlemmatized_yes_stops
('phoebique', 0.8140095472335815)
('maeret', 0.8085440993309021)
('erroresque', 0.8063466548919678)
('daunia', 0.8059276342391968)
('iugali', 0.804838240146637)
('infernae', 0.8045039772987366)
('uulcania', 0.8004024028778076)
('hennaea', 0.7992514371871948)
('bellona', 0.7981433868408203)
('diuae', 0.7978883981704712)
unlemmatized_no_stops
('cerno', 0.8293197154998779)
('mauors', 0.8207893371582031)
('gubernas', 0.8063743114471436)
('saeuo', 0.8040675520896912)
('marsa', 0.8025188446044922)
('manis', 0.7998283505439758)
('aeacides', 0.7950549125671387)
('dominamque', 0.7944770455360413)
('caneret', 0.7943785190582275)
('dumosa', 0.789914608001709)
lemmatized_yes_stops
('saepiae', 0.5535014271736145)
('mordeo', 0.5460636615753174)
('cacoëthe', 0.538771390914917)
('paralyticus', 0.5361864566802979)
('tibia', 0.5284138917922974)
('creber', 0.5214587450027466)
('tuba', 0.5202218294143677)
('potae', 0.5200856328010559)
('melleus', 0.5194711685180664)
('aspidas', 0.5125600099563599)
lemmatized_no_stops
('disseco', 0.5972294807434082)
('ursinus', 0.590910017490387)
('aspidas', 0.5873568058013916)
('spasticus', 0.5741567611694336)
('iocur', 0.5634351968765259)
('albico', 0.5632373690605164)
('saepiae', 0.5574640035629272)
('orthopnoea', 0.5570926070213318)
('uomitio', 0.5564547777175903)
('plumo', 0.5556727647781372)
scribo
unlemmatized_yes_stops
('scribam', 0.8057434558868408)
('scriberem', 0.7992566227912903)
('lubenter', 0.7968277335166931)
('exspectabam', 0.7848657369613647)
('gaudeam', 0.7732451558113098)
('adsequar', 0.7726513147354126)
('intellegis', 0.7708001136779785)
('facio', 0.7674623131752014)
('brute', 0.765916645526886)
('persuade', 0.7644809484481812)
unlemmatized_no_stops
('respondeo', 0.8407977819442749)
('lubenter', 0.8213809132575989)
('adiuuas', 0.813746452331543)
('postulo', 0.8095426559448242)
('brute', 0.8057581186294556)
('nuntias', 0.8032202124595642)
('consulerem', 0.79737788438797)
('credebam', 0.7953127026557922)
('committam', 0.7945393323898315)
('scripseram', 0.7916288375854492)
lemmatized_yes_stops
('philotimum', 0.6637271642684937)
('liber4', 0.6368970274925232)
('epistula', 0.5827207565307617)
('littera', 0.576386570930481)
('philotimo', 0.5750860571861267)
('philogenes', 0.5607917308807373)
('philotimi', 0.5528125762939453)
('coniectanea', 0.5488808155059814)
('oppium', 0.5451489090919495)
('loquor', 0.5436742305755615)
lemmatized_no_stops
('liber4', 0.6220735907554626)
('fadio', 0.568870484828949)
('libra', 0.567691445350647)
('littera', 0.5584113597869873)
('magniloquentia', 0.5575917959213257)
('balbus1', 0.5565413236618042)
('loquor', 0.5523605346679688)
('buthroto', 0.550958514213562)
('turranio', 0.5499093532562256)
('epistula', 0.5437573790550232)
amor
unlemmatized_yes_stops
('pudor', 0.660911500453949)
('error', 0.6432291269302368)
('furor', 0.6004754304885864)
('timor', 0.5997322797775269)
('expers', 0.5990047454833984)
('amicior', 0.5953745245933533)
('salutaris', 0.5937305688858032)
('dolor', 0.5898163318634033)
('mortalis', 0.5780104398727417)
('gemitus', 0.5755825042724609)
unlemmatized_no_stops
('pudor', 0.6753295063972473)
('pudoris', 0.6614177227020264)
('tantus', 0.6592569947242737)
('dolor', 0.6481508612632751)
('pietas', 0.622422456741333)
('summus', 0.6167148947715759)
('error', 0.6144319772720337)
('timor', 0.6143274307250977)
('furor', 0.5890376567840576)
('nullus', 0.5875043869018555)
lemmatized_yes_stops
('caritas', 0.6774085760116577)
('fidelitas', 0.6700564622879028)
('liberalitas', 0.6683306694030762)
('constantia1', 0.6632899045944214)
('industria1', 0.6600381135940552)
('beneuolens', 0.6542344093322754)
('cupiditas', 0.6538186073303223)
('beneuolentia', 0.6479369401931763)
('humanitas', 0.6472976207733154)
('continentia1', 0.6454772353172302)
lemmatized_no_stops
('caritas', 0.6510095596313477)
('pietas1', 0.6259500980377197)
('fidelitas', 0.6227724552154541)
('industria1', 0.6189290881156921)
('humanitas', 0.6177335381507874)
('amo', 0.6047547459602356)
('laetitia', 0.604009747505188)
('desiderium', 0.5951753854751587)
('misericordia', 0.5874444246292114)
('obseruantia', 0.5855789184570312)
dignitas
unlemmatized_yes_stops
('auctoritas', 0.800384521484375)
('existimatio', 0.7966275811195374)
('seueritas', 0.7812765836715698)
('exspectatio', 0.7726461887359619)
('laus', 0.7702658176422119)
('uoluntas', 0.7515003681182861)
('humanitas', 0.7514716386795044)
('salus', 0.7391500473022461)
('incredibilis', 0.7361450791358948)
('cupiditas', 0.7345121502876282)
unlemmatized_no_stops
('salus', 0.7925768494606018)
('exspectatio', 0.7666434645652771)
('laus', 0.7558329105377197)
('amplitudo', 0.7550672292709351)
('improbitate', 0.7334187030792236)
('auctoritas', 0.7301638126373291)
('perturbatio', 0.72828608751297)
('grauitas', 0.7250840663909912)
('modestia', 0.7243483066558838)
('existimatio', 0.7238847613334656)
lemmatized_yes_stops
('utilitas', 0.7155566215515137)
('amplitudo', 0.7129768133163452)
('auctoritas', 0.7017154693603516)
('officium', 0.7006381750106812)
('laus', 0.7001360058784485)
('existimatio', 0.6931731700897217)
('humanitas', 0.689584493637085)
('industria1', 0.6870406866073608)
('constantia1', 0.6846284866333008)
('liberalitas', 0.6801716089248657)
lemmatized_no_stops
('obseruantia', 0.7202469706535339)
('amplifico', 0.7043325901031494)
('auctoritas', 0.6831585168838501)
('humanitas', 0.6803106069564819)
('existimatio', 0.6800454258918762)
('utilitas', 0.6683915853500366)
('liberalitas', 0.667622447013855)
('integritas', 0.6672674417495728)
('honoro', 0.6635590195655823)
('honestas', 0.6613738536834717)
facio
unlemmatized_yes_stops
('feci', 0.8097273111343384)
('suscenseo', 0.7848771810531616)
('adsequar', 0.7793256044387817)
('exspectabam', 0.770606517791748)
('scribo', 0.7674623131752014)
('brute', 0.7636207342147827)
('catule', 0.757321298122406)
('faciam', 0.7539860606193542)
('cupio', 0.7537655234336853)
('effecero', 0.753574550151825)
unlemmatized_no_stops
('cures', 0.8110052943229675)
('postulo', 0.8057984113693237)
('facias', 0.7908643484115601)
('facis', 0.7853530645370483)
('tute', 0.7809972763061523)
('dixis', 0.7777658700942993)
('cedo', 0.7767754197120667)
('caue', 0.7738430500030518)
('metuo', 0.7723498940467834)
('catule', 0.7716257572174072)
lemmatized_yes_stops
('dico2', 0.6664252281188965)
('fio', 0.6521978974342346)
('sum1', 0.5840091705322266)
('habeo', 0.573696494102478)
('do', 0.5612642765045166)
('edo1', 0.5520738363265991)
('uideo', 0.5449216365814209)
('ipse', 0.5223643779754639)
('ito', 0.5134822130203247)
('uerus', 0.5085045695304871)
lemmatized_no_stops
('fio', 0.6308690905570984)
('sum1', 0.6158057451248169)
('dico2', 0.5920508503913879)
('habeo', 0.580849289894104)
('edo1', 0.514936089515686)
('uerus', 0.5004220008850098)
('sero1', 0.4947584271430969)
('uideo', 0.4933662712574005)
('uto', 0.48664236068725586)
('ito', 0.4825040400028229)
amo
unlemmatized_yes_stops
('rogem', 0.846319317817688)
('lubens', 0.8452644348144531)
('uah', 0.843443751335144)
('egŏn', 0.8251186609268188)
('mones', 0.8201998472213745)
('flocci', 0.8186986446380615)
('amas', 0.8186036348342896)
('laudo', 0.817556619644165)
('pereo', 0.8142061829566956)
('tĕ', 0.8137155771255493)
unlemmatized_no_stops
('ecastor', 0.8796558380126953)
('amas', 0.8678163886070251)
('mones', 0.8536465167999268)
('egone', 0.8366082906723022)
('hau', 0.8343548774719238)
('praedicas', 0.8326777219772339)
('chreme', 0.8296623229980469)
('pol', 0.8288089036941528)
('érgo', 0.8258687853813171)
('séd', 0.8233115077018738)
lemmatized_yes_stops
('odi', 0.6054990291595459)
('diligo', 0.6039066314697266)
('gaudeo', 0.5906010866165161)
('doleo', 0.5849798917770386)
('inuideo', 0.58259117603302)
('amor', 0.5787540078163147)
('dicaearchum', 0.5659167766571045)
('amicus1', 0.5599449276924133)
('cupidus', 0.5485079288482666)
('amicus', 0.5430886745452881)
lemmatized_no_stops
('diligo', 0.6319265365600586)
('amor', 0.6047547459602356)
('suauis', 0.5838252902030945)
('Hercules', 0.5677302479743958)
('efflictim', 0.5412902235984802)
('carus1', 0.5369195938110352)
('studiosus', 0.5256444215774536)
('odi', 0.5175546407699585)
('dĭ', 0.5165680646896362)
('inuideo', 0.5096327662467957)
amicitia
unlemmatized_yes_stops
('beneuolentia', 0.7057880759239197)
('dignitate', 0.69084632396698)
('eloquentia', 0.6800093650817871)
('disciplina', 0.6777817606925964)
('errato', 0.6772070527076721)
('societate', 0.6766867637634277)
('fide', 0.6733757257461548)
('temperantia', 0.673005998134613)
('calamitate', 0.6682467460632324)
('prudentia', 0.6669106483459473)
unlemmatized_no_stops
('commendatione', 0.743280827999115)
('improbitate', 0.7342466115951538)
('honorificentissime', 0.7253440022468567)
('beneuolentia', 0.7121111154556274)
('dignitate', 0.7069240808486938)
('liberalitas', 0.7062676548957825)
('dignitas', 0.7037380337715149)
('familiaritate', 0.7029399275779724)
('sesti', 0.6999698877334595)
('accurata', 0.6967668533325195)
lemmatized_yes_stops
('societas', 0.7176007628440857)
('liberalitas', 0.694193959236145)
('beneuolentia', 0.6752265095710754)
('fido', 0.6644117832183838)
('beneuolens', 0.6507399082183838)
('familiaritas', 0.6430079340934753)
('humanitas', 0.6386340856552124)
('fides1', 0.6246727705001831)
('dignitas', 0.616410493850708)
('necessitudo', 0.6114685535430908)
lemmatized_no_stops
('liberalitas', 0.6977777481079102)
('beneficium', 0.6852203011512756)
('necessitudo', 0.6782779693603516)
('humanitas', 0.6631476283073425)
('obseruantia', 0.6609437465667725)
('societas', 0.6592288017272949)
('familiaritas', 0.657794713973999)
('beneuolens', 0.6521384716033936)
('beneuolentia', 0.6520812511444092)
('erga', 0.6485866904258728)
industria
unlemmatized_yes_stops
('improbitate', 0.6982171535491943)
('integritate', 0.6909976601600647)
('auaritia', 0.6664460897445679)
('moderatione', 0.66603684425354)
('grauitate', 0.6607422232627869)
('amicitia', 0.644286036491394)
('dignitate', 0.6398682594299316)
('temperantia', 0.6375774145126343)
('audacia', 0.6358821988105774)
('honestate', 0.6348168849945068)
unlemmatized_no_stops
('diligentia', 0.7460492849349976)
('improbitate', 0.7443936467170715)
('prudentia', 0.7122471332550049)
('laetitia', 0.7083171606063843)
('constantia', 0.700203537940979)
('amplitudo', 0.7001898288726807)
('egeremus', 0.6970213055610657)
('accurata', 0.6963273286819458)
('improborum', 0.6960404515266418)
('utilitate', 0.68989497423172)
lemmatized_yes_stops
('constantia1', 0.826846182346344)
('prudentia', 0.7952827215194702)
('continentia1', 0.787614643573761)
('beneuolentia', 0.7848294973373413)
('integritas', 0.7819041013717651)
('diligentia', 0.7639070153236389)
('probitas', 0.7569852471351624)
('amplifico', 0.7569388151168823)
('commendatio', 0.7565748691558838)
('liberalitas', 0.7520667314529419)
lemmatized_no_stops
('fidelitas', 0.8541709184646606)
('continentia1', 0.8309833407402039)
('integritas', 0.8097890019416809)
('constantia1', 0.8040812015533447)
('liberalitas', 0.8011647462844849)
('adiumentum', 0.8009836673736572)
('amplifico', 0.7980403304100037)
('antepono', 0.7889832258224487)
('beneuolentia', 0.7887541055679321)
('beneuolens', 0.7876464128494263)
In [52]:
def write_model_vectors(language, models_paths, save_dir):
    """Dump the nearest neighbours of every vocabulary word to text files.

    For each model in ``models_paths`` one file named
    ``<language>_<model_name>.txt`` is written into ``save_dir``. Each entry
    has the form ``word: [(neighbour, score), ...]`` and entries are
    separated by a blank line.

    Args:
        language: Prefix for the output file names (e.g. ``'latin'``).
        models_paths: Mapping of model name to saved-model path; ``~`` is
            expanded.
        save_dir: Output directory; ``~`` is expanded.

    Returns:
        None. Progress and the written file paths are printed to stdout.
    """
    save_dir = os.path.expanduser(save_dir)

    for model_name, model_path in models_paths.items():
        # setup file paths
        model_path = os.path.expanduser(model_path)
        save_file = os.path.join(save_dir, language + '_' + model_name + '.txt')

        model = Word2Vec.load(model_path)
        # NOTE(review): `model.vocab` is the attribute this notebook's gensim
        # exposes; confirm it still exists in the installed gensim version.
        vocab = model.vocab
        vocab_len = len(vocab)
        print(vocab_len)

        final_list = []
        for counter, word in enumerate(vocab, start=1):
            # One most_similar() query per word is slow; report progress.
            if counter % 10000 == 0:
                print(counter, '/', vocab_len)
            pairs = model.most_similar(word)
            final_list.append(word + ': ' + str(pairs))

        # Explicit UTF-8: the vocabulary holds accented Latin and polytonic
        # Greek, which the platform default encoding may not represent.
        with open(save_file, 'w', encoding='utf-8') as file_open:
            file_open.write('\n\n'.join(final_list))
        print('Wrote file at:', save_file)
In [53]:
# Write one neighbours file per Latin model into save_dir.
# NOTE(review): the traceback stored below this cell shows `print(line)` and
# `input()` inside write_model_vectors, which the definition above does not
# contain - the saved output appears to come from an earlier version of the
# function. Re-run the cell to refresh it.
write_model_vectors('latin', models_paths, save_dir)
87835
striges: [('spumas', 0.8318402767181396), ('lapsumque', 0.8267949819564819), ('bucula', 0.8252154588699341), ('aetnaea', 0.8249616622924805), ('adleuat', 0.820337176322937), ('spumantis', 0.817375898361206), ('anhelo', 0.816428542137146), ('tergoque', 0.8148610591888428), ('pectine', 0.813423752784729), ('telas', 0.8128271102905273)]
fultus: [('hyacintho', 0.7800098657608032), ('examen', 0.7194118499755859), ('bipennibus', 0.7076067924499512), ('uersumque', 0.7016180753707886), ('trahebatur', 0.6994680166244507), ('tegitur', 0.6986311078071594), ('instaurat', 0.6970216035842896), ('cyllenius', 0.695743203163147), ('excuteret', 0.6950575113296509), ('nascebantur', 0.6947343945503235)]
consectentur: [('declinando', 0.725358247756958), ('toleraturos', 0.7214525938034058), ('turbidis', 0.714789867401123), ('sceleratis', 0.699560284614563), ('impares', 0.6862906217575073), ('uariantis', 0.6860278844833374), ('cientur', 0.6838405728340149), ('ignauae', 0.6830829977989197), ('terruere', 0.6810811758041382), ('desolata', 0.6806735992431641)]
detulerat: [('excusauit', 0.74037766456604), ('induxerat', 0.7346738576889038), ('obiecisset', 0.7232600450515747), ('exspectet', 0.7096316814422607), ('extingueretur', 0.7044765949249268), ('recreatum', 0.6984387040138245), ('consobrinum', 0.6984319090843201), ('offerret', 0.6982605457305908), ('induisset', 0.6979273557662964), ('tradendam', 0.6942371726036072)]
lectus: [('dictatorque', 0.6902541518211365), ('aedilis', 0.6459811925888062), ('cooptatus', 0.636102557182312), ('obses', 0.6267216205596924), ('frequentissimus', 0.6263905763626099), ('creatus', 0.6241918802261353), ('proxumus', 0.6239739060401917), ('repertus', 0.6177592873573303), ('unicus', 0.6166237592697144), ('notissimus', 0.6148399710655212)]
rixa: [('sollicitata', 0.7181073427200317), ('obsoleta', 0.6906036138534546), ('nitentibus', 0.6905712485313416), ('morosa', 0.6889892816543579), ('creatae', 0.6873690485954285), ('medendi', 0.6847342252731323), ('instabilis', 0.6833933591842651), ('suaserunt', 0.6771169900894165), ('auaris', 0.6742109656333923), ('tonsura', 0.6733936071395874)]
thynni: [('tundit', 0.8371689319610596), ('rorantes', 0.8153007626533508), ('circumligat', 0.8123067021369934), ('mulcebat', 0.8110316395759583), ('spumat', 0.8085201382637024), ('tereti', 0.808397650718689), ('rosis', 0.8081251382827759), ('spumeus', 0.8054652810096741), ('trementia', 0.804972231388092), ('intorto', 0.8021548986434937)]
arcum: [('micat', 0.8045886158943176), ('ardenti', 0.7999618053436279), ('tellurem', 0.7955467700958252), ('uolucrem', 0.7903378009796143), ('trux', 0.7900840044021606), ('haerentem', 0.7858254909515381), ('fulgente', 0.782285213470459), ('trementi', 0.7807439565658569), ('minantem', 0.7785213589668274), ('respectat', 0.7781765460968018)]
circinum: [('trocleae', 0.8776658177375793), ('diametros', 0.848968505859375), ('orbiculos', 0.8479334115982056), ('libramentum', 0.8451921939849854), ('perpendiculum', 0.8436546325683594), ('prominentia', 0.8400521874427795), ('normam', 0.840046763420105), ('triangulum', 0.8346719741821289), ('orbiculum', 0.8318036794662476), ('spirae', 0.829663097858429)]
papaueri: [('potu', 0.777767539024353), ('suffitu', 0.7508666515350342), ('pilosa', 0.7501565217971802), ('adalligata', 0.7500054836273193), ('myrti', 0.7488418221473694), ('incisuris', 0.7448790073394775), ('renes', 0.7448183298110962), ('caule', 0.7406085729598999), ('uitiato', 0.7404226064682007), ('densatur', 0.7401349544525146)]
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
/Users/kyle/cltk/venv/lib/python3.4/site-packages/IPython/kernel/zmq/kernelbase.py in _input_request(self, prompt, ident, parent, password)
675 try:
--> 676 ident, reply = self.session.recv(self.stdin_socket, 0)
677 except Exception:
/Users/kyle/cltk/venv/lib/python3.4/site-packages/IPython/kernel/zmq/session.py in recv(self, socket, mode, content, copy)
714 try:
--> 715 msg_list = socket.recv_multipart(mode, copy=copy)
716 except zmq.ZMQError as e:
/Users/kyle/cltk/venv/lib/python3.4/site-packages/zmq/sugar/socket.py in recv_multipart(self, flags, copy, track)
304 """
--> 305 parts = [self.recv(flags, copy=copy, track=track)]
306 # have first part already, only loop while more to receive
zmq/backend/cython/socket.pyx in zmq.backend.cython.socket.Socket.recv (zmq/backend/cython/socket.c:5772)()
zmq/backend/cython/socket.pyx in zmq.backend.cython.socket.Socket.recv (zmq/backend/cython/socket.c:5572)()
zmq/backend/cython/socket.pyx in zmq.backend.cython.socket._recv_copy (zmq/backend/cython/socket.c:1725)()
/Users/kyle/cltk/venv/lib/python3.4/site-packages/zmq/backend/cython/checkrc.pxd in zmq.backend.cython.checkrc._check_rc (zmq/backend/cython/socket.c:6022)()
10 cdef int errno = zmq_errno()
---> 11 PyErr_CheckSignals()
12 if rc < 0:
KeyboardInterrupt:
During handling of the above exception, another exception occurred:
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-53-5fbf237fd934> in <module>()
----> 1 write_model_vectors('latin', models_paths, save_dir)
<ipython-input-52-36e975e7f7ce> in write_model_vectors(language, models_paths, save_dir)
25 final_list.append(line)
26 print(line)
---> 27 input()
28
29 final_str = '\n\n'.join(final_list)
/Users/kyle/cltk/venv/lib/python3.4/site-packages/IPython/kernel/zmq/kernelbase.py in raw_input(self, prompt)
649 self._parent_ident,
650 self._parent_header,
--> 651 password=False,
652 )
653
/Users/kyle/cltk/venv/lib/python3.4/site-packages/IPython/kernel/zmq/kernelbase.py in _input_request(self, prompt, ident, parent, password)
679 except KeyboardInterrupt:
680 # re-raise KeyboardInterrupt, to truncate traceback
--> 681 raise KeyboardInterrupt
682 else:
683 break
KeyboardInterrupt:
In [55]:
# Directory passed to write_model_vectors() as the output location.
save_dir = '~/cltk_data/user_data/word2vec/'

# Saved Greek word2vec models, keyed by preprocessing variant. This rebinds
# the same `models_paths` name that earlier held the Latin models.
models_paths = dict(
    lemmatized_no_stops='~/cltk_data/user_data/word2vec/w2v_greek_lemmatizer_stops.model',
    lemmatized_yes_stops='~/cltk_data/user_data/word2vec/w2v_greek_lemmatizer.model',
    unlemmatized_no_stops='~/cltk_data/user_data/word2vec/w2v_greek_stops.model',
    unlemmatized_yes_stops='~/cltk_data/user_data/word2vec/w2v_greek.model',
)
In [21]:
# Headwords to look up in every Greek model. A plain string is used as-is for
# all models; a tuple would be (headword, key to query in the lemmatized models).
words = ['ἄγγελος', 'εἶπον', 'λόγος', 'ἵππος', 'κύων', 'ὄνος', 'οἶδα']

# headword -> list of (model_name, neighbours), one entry per model
word_all_models_list = defaultdict(list)

for model_name, model_path in models_paths.items():
    model_path = os.path.expanduser(model_path)
    # get_sims() reads the notebook-global `model`, so rebind it for each variant.
    model = Word2Vec.load(model_path)

    for word in words:
        if isinstance(word, str):
            headword, sims = get_sims(model_name, word)
        elif isinstance(word, tuple):
            headword, sims = get_sims(model_name, word[0], word[1])
        else:
            # Previously an unexpected entry silently re-used the prior result.
            raise TypeError('Expected str or tuple in `words`, got %r' % (word,))
        word_all_models_list[headword].append((model_name, sims))
In [22]:
# Show the Greek results: each headword, then each model's neighbour list.
print_sims(word_all_models_list)
οἶδα
unlemmatized_yes_stops
('οἶσθα', 0.7605229616165161)
('οἶδας', 0.7389824390411377)
('ἔγνων', 0.726704478263855)
('ἀγνοῶ', 0.7231594920158386)
('ἐπίσταμαι', 0.719325065612793)
('ἀγνοεῖς', 0.7121965289115906)
('εἶπον', 0.6860429048538208)
('πυροῦμαι', 0.683976411819458)
('ἔχω', 0.6616619825363159)
('ᾔδεις', 0.658226728439331)
lemmatized_yes_stops
('λέγω', 0.5777326822280884)
('ὁράω', 0.5736971497535706)
('φησὶν', 0.5720843076705933)
('ἀγνοέω', 0.567689836025238)
('δείκνυμι', 0.5646995306015015)
('εἶπον', 0.5587722063064575)
('εἰδὼς', 0.5557109713554382)
('εἰδότες', 0.552374005317688)
('ἐκεῖνος', 0.5251285433769226)
('δύναμαι', 0.5176582336425781)
lemmatized_no_stops
('λέγω', 0.5961875915527344)
('ἀγνοέω', 0.5912941694259644)
('εἰδὼς', 0.5896536111831665)
('εἰδότες', 0.5775903463363647)
('φησὶν', 0.5709648132324219)
('δείκνυμι', 0.5685352087020874)
('ὁράω', 0.5333505868911743)
('εἶπον', 0.5292441844940186)
('ἐκεῖνος', 0.5261932611465454)
('ἀκριβόω', 0.5231100916862488)
unlemmatized_no_stops
('ἐπίσταμαι', 0.7703808546066284)
('ἀγνοεῖς', 0.7517766356468201)
('οἶδας', 0.7468469738960266)
('ἔγνων', 0.7239306569099426)
('οἶσθα', 0.7151070237159729)
('ᾔδειν', 0.7119576334953308)
('ἀγνοῶ', 0.6988214254379272)
('πυροῦμαι', 0.6881002187728882)
('εἶπον', 0.6832518577575684)
('ἠλλοίωμαι', 0.672066330909729)
ἵππος
unlemmatized_yes_stops
('κύων', 0.8258955478668213)
('ὄνος', 0.7668443918228149)
('λύκος', 0.7265030145645142)
('θηλυμανὴς', 0.7227203845977783)
('βοῦς', 0.6930720806121826)
('ἐλέφας', 0.6789306402206421)
('κόραξ', 0.6616312265396118)
('ἀλώπηξ', 0.6601890325546265)
('ἀετὸς', 0.6590684652328491)
('ζῷον', 0.6446401476860046)
lemmatized_yes_stops
('ὄνος', 0.7201282978057861)
('βοῦς', 0.6780303716659546)
('ἀναβάτην', 0.6360746622085571)
('ἅρμα', 0.6193000078201294)
('χρυσοχαλίνων', 0.6130537986755371)
('καρδιηνῶν', 0.6100515127182007)
('βραδυσκελής', 0.6097468137741089)
('ἐλεφάντων', 0.6084158420562744)
('ἡμιόνων', 0.5958727598190308)
('ἀχαρνικοὶ', 0.5895185470581055)
lemmatized_no_stops
('ὄνος', 0.6850861310958862)
('ἅρμα', 0.6567941308021545)
('ἀναβάτην', 0.6535123586654663)
('βραδυσκελής', 0.6439781785011292)
('κέλητες', 0.6338183283805847)
('βοῦς', 0.6210334300994873)
('στροφούμενος', 0.6121916770935059)
('ἐπίβητε', 0.6012579202651978)
('θηλυμανὴς', 0.5995272397994995)
('ἡμίονοι', 0.5805424451828003)
unlemmatized_no_stops
('κύων', 0.7926816940307617)
('ὄνος', 0.7631309032440186)
('θηλυμανὴς', 0.739660918712616)
('λύκος', 0.6976257562637329)
('κύκνος', 0.682440996170044)
('χρεμετίζει', 0.6823428869247437)
('κόραξ', 0.6755474805831909)
('ἀετὸς', 0.6742944717407227)
('ὀρεὺς', 0.6727577447891235)
('πίθηκος', 0.6727405786514282)
ὄνος
unlemmatized_yes_stops
('κύων', 0.808297336101532)
('ἵππος', 0.7668445110321045)
('λύκος', 0.7638193368911743)
('ἐλέφας', 0.7350620031356812)
('βοῦς', 0.7217689156532288)
('ἀλώπηξ', 0.7106828093528748)
('τράγος', 0.6965163946151733)
('ἀετὸς', 0.6928150653839111)
('πατεῖ', 0.6818752288818359)
('ὗς', 0.6817245483398438)
lemmatized_yes_stops
('βοῦς', 0.7826263308525085)
('ἵππος', 0.7201282978057861)
('ὗς', 0.7026360034942627)
('πῶλος', 0.6973097324371338)
('κύων', 0.6898355484008789)
('ἀχαρνικοὶ', 0.6494538187980652)
('λύκος', 0.6418706178665161)
('ἔλαφος', 0.6322773694992065)
('βραδυσκελής', 0.625903844833374)
('κυνὶ', 0.6246753931045532)
lemmatized_no_stops
('βοῦς', 0.761025071144104)
('κύων', 0.6956329941749573)
('ὗς', 0.6888629198074341)
('ἵππος', 0.6850861310958862)
('ἔλαφος', 0.6785522699356079)
('πῶλος', 0.6772491931915283)
('βραδυσκελής', 0.6464596390724182)
('φάτνης', 0.6306556463241577)
('πρόβατον', 0.6189024448394775)
('λύκον', 0.6161036491394043)
unlemmatized_no_stops
('κύων', 0.8212473392486572)
('λύκος', 0.7772945165634155)
('ἵππος', 0.7631309032440186)
('ἀλώπηξ', 0.7415755987167358)
('ἀετὸς', 0.734527587890625)
('ἐλέφας', 0.7171768546104431)
('ὀρεὺς', 0.7041741013526917)
('βοῦς', 0.7024620771408081)
('πίθηκος', 0.7003055810928345)
('κροκόδειλος', 0.6847397685050964)
ἄγγελος
unlemmatized_yes_stops
('ἀρχιστράτηγος', 0.7051958441734314)
('“εἶπεν', 0.6776818633079529)
('μανωε', 0.6767134070396423)
('μεσίας', 0.6749094724655151)
('〈ὁ', 0.6727726459503174)
('ναζαρηνός', 0.6670150756835938)
('ἔκραζε', 0.6664544343948364)
('ἀρχάγγελος', 0.6657418012619019)
('ιερεμιας', 0.665309727191925)
('ῥαφαὴλ', 0.6632931232452393)
lemmatized_yes_stops
('διηκόνουν', 0.655651330947876)
('ἀρχάγγελοι', 0.621593713760376)
('κράζοντες', 0.610907793045044)
('ἀρχιστράτηγος', 0.6032100915908813)
('προσκυνησάτωσαν', 0.6030158996582031)
('κράζω', 0.6001020073890686)
('ἀπόκ', 0.5985332727432251)
('ἐνισχυσάτωσαν', 0.5969451665878296)
('γαβριὴλ', 0.5915992856025696)
('δαίμονες', 0.5868834853172302)
lemmatized_no_stops
('ἀρχάγγελοι', 0.6500431299209595)
('διηκόνουν', 0.6127337217330933)
('δαίμονες', 0.5984272956848145)
('ἐνισχυσάτωσαν', 0.5983909368515015)
('προσκυνησάτωσαν', 0.597992479801178)
('μανωε', 0.596545934677124)
('ἀπόστολοι', 0.5932490825653076)
('οὐρανοί', 0.5895529985427856)
('ἀπόκ', 0.5893698930740356)
('κράζω', 0.5775561332702637)
unlemmatized_no_stops
('μανωε', 0.7197017669677734)
('ἀρχάγγελος', 0.6879190802574158)
('ιερεμιας', 0.6853560209274292)
('ησαιας', 0.6822640299797058)
('ἐνέβλεψεν', 0.6793824434280396)
('ναζαρηνός', 0.6774595379829407)
('γαβριὴλ', 0.6712100505828857)
('ησαϊας', 0.6705929636955261)
('〉ὁ', 0.6677460670471191)
('μαλαχιας', 0.6654934883117676)
κύων
unlemmatized_yes_stops
('ἵππος', 0.8258956074714661)
('λύκος', 0.8193689584732056)
('ὄνος', 0.808297336101532)
('ἀετὸς', 0.7877009510993958)
('ἐλέφας', 0.7427119016647339)
('ποτάμιος', 0.7309486269950867)
('ἰχθὺς', 0.7272216081619263)
('πίθηκος', 0.7217716574668884)
('κροκόδειλος', 0.7197512984275818)
('βοῦς', 0.7154557704925537)
lemmatized_yes_stops
('λύκος', 0.7883812189102173)
('ἐλέφας', 0.7388116121292114)
('ἀετὸς', 0.738429069519043)
('πίθηκος', 0.7309204339981079)
('κόραξ', 0.7291678786277771)
('λαγωὸς', 0.7222265005111694)
('ἀετός', 0.7183389067649841)
('ὀρεὺς', 0.7084843516349792)
('ἀλεκτρυὼν', 0.7049040794372559)
('λαίθαργος', 0.7030189037322998)
lemmatized_no_stops
('λύκος', 0.7718775272369385)
('πίθηκος', 0.7702018022537231)
('ἀλώπηξ', 0.7383097410202026)
('ἀετὸς', 0.7264108657836914)
('κυνός', 0.724856972694397)
('λαγώς', 0.7191282510757446)
('λαγωὸς', 0.7165886163711548)
('θαλάττιος', 0.7149292230606079)
('ὗς', 0.7101212739944458)
('πάρδαλις', 0.708196759223938)
unlemmatized_no_stops
('ὄνος', 0.8212472200393677)
('ἵππος', 0.7926816344261169)
('ἀετὸς', 0.7820320129394531)
('λύκος', 0.7805715799331665)
('ἀλώπηξ', 0.7603527307510376)
('ἐλέφας', 0.7545673847198486)
('ἰχθὺς', 0.7447283864021301)
('πίθηκος', 0.7422963380813599)
('θαλάττιος', 0.730558454990387)
('λαγωὸς', 0.7255116701126099)
εἶπον
unlemmatized_yes_stops
('ἔλεγον', 0.7879046201705933)
('γινώσκετε', 0.707395613193512)
('ἀγνοεῖτε', 0.6947411298751831)
('οἶδα', 0.6860429048538208)
('ὑμεῖς', 0.6814215183258057)
('εἶπεν', 0.6742738485336304)
('ὀλιγόπιστοι', 0.6693212985992432)
('ἐπιστεύσατε', 0.6664608716964722)
('ἔλεγεν', 0.6637281179428101)
('ἔμαθον', 0.6613069772720337)
lemmatized_yes_stops
('ἐρῶ', 0.6896899938583374)
('λέγει', 0.6443853378295898)
('λέγειν', 0.635010838508606)
('λέγων', 0.6294112205505371)
('λέγω1', 0.5963457226753235)
('φησὶν', 0.5803765058517456)
('φησὶ', 0.5794665813446045)
('ἐκεῖνος', 0.5788511037826538)
('οἶδα', 0.5587722063064575)
('φημί', 0.5504785776138306)
lemmatized_no_stops
('ἐρῶ', 0.7306913137435913)
('λέγει', 0.6844544410705566)
('λέγειν', 0.6553261876106262)
('λέγων', 0.625251054763794)
('λέγω1', 0.6098009347915649)
('ἐκεῖνος', 0.6041553020477295)
('εἰπὼν', 0.5839270949363708)
('φησὶν', 0.5683282613754272)
('φησίν', 0.564296543598175)
('ποιέω', 0.5479371547698975)
unlemmatized_no_stops
('ἔλεγον', 0.7815759181976318)
('γινώσκετε', 0.7048661708831787)
('ἐπιστεύσατε', 0.7042254209518433)
('ὀλιγόπιστοι', 0.6989482045173645)
('ἀπολλῶ', 0.6937798261642456)
('εἶπεν', 0.6835825443267822)
('οἶδα', 0.6832518577575684)
('ἀγνοεῖτε', 0.6810635328292847)
('ἠσθενήσαμεν', 0.673568606376648)
('μαρτυρεῖτε', 0.6714804172515869)
λόγος
unlemmatized_yes_stops
('συντετμημένος', 0.6487975120544434)
('διττὸς', 0.6375629901885986)
('ὅρος', 0.6338368058204651)
('σκοπὸς', 0.6299418807029724)
('ἐγκωμιαστικὸς', 0.6253401637077332)
('ἀναγκαῖος', 0.619880735874176)
('διττός', 0.6190832853317261)
('σύντομος', 0.6170375347137451)
('ἀκριβέστερος', 0.6138684749603271)
('κατασκευαστικὸς', 0.6087167859077454)
lemmatized_yes_stops
('διττὸς', 0.6258721351623535)
('σύντομος', 0.5810777544975281)
('μυθώδης', 0.5795762538909912)
('ἐπιστημονικὸς', 0.5696719884872437)
('διδασκαλικὸς', 0.5643340349197388)
('ἐνεστηκὼς', 0.5588155388832092)
('ὑπερβολικὸς', 0.5585340261459351)
('ἐγκωμιαστικὸς', 0.5583504438400269)
('ἀσαφὴς', 0.5580397248268127)
('διωρισμένος', 0.5559536218643188)
lemmatized_no_stops
('σύντομος', 0.6178199052810669)
('διττὸς', 0.6090394258499146)
('κατασκευαστικὸς', 0.5776509046554565)
('αὐξητικὸς', 0.555382490158081)
('ἀμφίβολος', 0.5535582304000854)
('ἀληθὴς', 0.5530151128768921)
('μείζων', 0.5439461469650269)
('παροιμιώδης', 0.5436768531799316)
('ἀσαφὴς', 0.542860746383667)
('ἐπιστημονικὸς', 0.5393462777137756)
unlemmatized_no_stops
('διττὸς', 0.6785733699798584)
('συντετμημένος', 0.6784866452217102)
('σύντομος', 0.6503594517707825)
('ὅρος', 0.6378790140151978)
('ἐπιστημονικὸς', 0.6186484694480896)
('τρόπος', 0.6184679269790649)
('κατασκευαστικὸς', 0.6173063516616821)
('διττός', 0.6121178865432739)
('προκείμενος', 0.6116311550140381)
('ἀναγκαῖος', 0.6101460456848145)
In [ ]:
# Call write_model_vectors (defined outside this cell) for the 'greek' language,
# passing the current models_paths and save_dir.
# NOTE(review): assumes models_paths and save_dir were rebound to the Greek
# models in an earlier cell (the notebook's first cells set Latin paths) — confirm
# before re-running on a fresh kernel.
# NOTE(review): presumably this writes per-word similarity data under save_dir
# and prints progress — verify against the function's definition.
write_model_vectors('greek', models_paths, save_dir)
407239
διαχειρίζω: [('ταμιεύω', 0.8252047300338745), ('δαμάσω', 0.8073007464408875), ('ἰχανῶ', 0.8062002062797546), ('ὑβρίζω', 0.8042540550231934), ('προΐξω', 0.8021806478500366), ('ἐνδιατρίβω', 0.8002261519432068), ('ἔταον', 0.8000962734222412), ('νοήμη', 0.7943591475486755), ('διοικῶ', 0.7935482859611511), ('διαβάλλω', 0.7934131026268005)]
ἐνεγέννησεν: [('ἀποφέρουσα', 0.6958976984024048), ('ἐπαινετὴ', 0.6571913957595825), ('ἐπιστημονικὴ', 0.6513071060180664), ('ἀπηλλαγμένη', 0.6497482061386108), ('ἀντιποιουμένη', 0.6440267562866211), ('ἀναπόβλητος', 0.6410380601882935), ('ἀργία', 0.6388857364654541), ('ἀναγκαστικὴ', 0.6372959613800049), ('ἔφεσις', 0.6267553567886353), ('ὁδηγία', 0.6204506158828735)]
παλινδρομήσαντας: [('καθυβρίσαντες', 0.7391155958175659), ('παρεδήλουν', 0.7340777516365051), ('μετακαλέσασθαι', 0.7245435118675232), ('ἡγεμονεύοντα', 0.7109376192092896), ('εἰσκαλεσάμενος', 0.7104780673980713), ('ἐπικαλεσαμένης', 0.7087868452072144), ('ἀνέμενον', 0.7076782584190369), ('ἐνδημίαν', 0.7047619223594666), ('μετεκαλοῦντο', 0.7035496234893799), ('διαβήσεσθαι', 0.7033134698867798)]
πλωτῖνός: [('ἰάμβλιχός', 0.8086708784103394), ('πορφύριός', 0.8055563569068909), ('συμπεραινόμενός', 0.8012433052062988), ('ἀμμώνιός', 0.7799567580223083), ('πλούταρχός', 0.776447057723999), ('προϊών', 0.7731962203979492), ('ἐπιλυόμενός', 0.7515854835510254), ('διαρθρῶν', 0.7514673471450806), ('συριανός', 0.7435081601142883), ('τουτό', 0.7166218161582947)]
περαινομένην: [('ϲυρρεόντων', 0.8353908658027649), ('ἑτοιμοτέραν', 0.8353792428970337), ('ἀντίϲπαϲιν', 0.8334176540374756), ('ἀφικνεῖϲθαι', 0.8206106424331665), ('φλεγμονήν', 0.8202818632125854), ('ἀνάδοϲιν', 0.8198545575141907), ('ἐξορμώντων', 0.8175874948501587), ('παρακμήν', 0.8172018527984619), ('ἐπικτῶνται', 0.8170984983444214), ('ἀνατομήν', 0.8154889345169067)]
ἀντίσταθμον: [('καταναλίσκοντας', 0.5711706876754761), ('πανάριστον', 0.5636467933654785), ('ἀποπηδήσαντας', 0.5615572929382324), ('ἐπιδεικνυμένους', 0.558631181716919), ('προεμένη', 0.5580577850341797), ('ἀζητήτως', 0.5537200570106506), ('περιεποιεῖτο', 0.5473865270614624), ('ὑμνῳδὸν', 0.5470121502876282), ('ἐζηλωκόσι', 0.5463383793830872), ('εὐπαρακόμιστον', 0.5456492900848389)]
περσοῦ: [('στρατολογήσαντες', 0.7746959328651428), ('κωάδῃ', 0.742178201675415), ('βιταλιανὸς', 0.7225083708763123), ('συνεπομένης', 0.7217811346054077), ('δόρας', 0.7191195487976074), ('ἀπληκευόντων', 0.7178870439529419), ('ἰλλοῦ', 0.7107776403427124), ('ἐπιλαβομένης', 0.7076579332351685), ('μεσάζοντι', 0.7061315178871155), ('κοιτωνίτῃ', 0.7059804201126099)]
πυθιονίκου: [('βούθου', 0.8280751705169678), ('ἰατρείου', 0.7084528207778931), ('παχυφρόνων', 0.7049946188926697), ('πανθήρα', 0.6993095278739929), ('χαλδαίου', 0.6980409622192383), ('ὑποδείξεως', 0.6914699077606201), ('ἐμπόρου', 0.6787152290344238), ('ἐκμαγείου', 0.6710602641105652), ('ἴυγγος', 0.666806697845459), ('οὐρανομήκους', 0.6648931503295898)]
In [ ]:
Content source: kylepjohnson/ipython
Similar notebooks: