In [1]:
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
import logging

# Configure the root logger: timestamped records at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence

import logging
import os
import time

# Same root-logger setup as the first cell; logging.basicConfig() does nothing
# when the root logger already has handlers, so repeating it is harmless and
# keeps this cell usable on its own.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
def gen_docs_lda(corpus, lemmatize, rm_stops, testing):
    """Yield one list of normalized word tokens per author file of a corpus.

    Every file is treated as a single document (no sentence tokenization).
    Processing order per file: corpus-specific plaintext cleanup, word
    tokenization, lower-casing, optional stopword removal, removal of tokens
    of length <= 1, j/v replacement (Latin only), optional lemmatization, and
    finally rewriting the lemma ``'edo1'`` to ``'sum1'``.

    Args:
        corpus: ``'phi5'`` (processed as Latin) or ``'tlg'`` (processed as
            Greek).
        lemmatize: if true, pass the tokens of each file through
            ``LemmaReplacer(language).lemmatize``.
        rm_stops: if true, drop tokens found in the stopword list of the
            corpus language (``latin_stops`` / ``greek_stops``).
        testing: if true, only the first 20 file paths are processed.

    Yields:
        list: the processed tokens of one file, in text order. A file that
        ends up with no tokens still yields an empty list.

    Raises:
        ValueError: if ``corpus`` is neither ``'phi5'`` nor ``'tlg'``. Because
            this is a generator, the error surfaces on the first iteration.
    """
    # Resolve everything corpus-specific up front so an unknown corpus fails
    # immediately with a clear message instead of an UnboundLocalError later.
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        text_cleaner = phi5_plaintext_cleanup
        language_stops = latin_stops
        jv_replacer = JVReplacer()
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        # Greek texts must be filtered with the Greek stopword list.
        language_stops = greek_stops
        jv_replacer = None  # j/v replacement is only applied to Latin
    else:
        raise ValueError("corpus must be 'phi5' or 'tlg', got {!r}".format(corpus))

    # A frozenset gives O(1) membership tests; the list was scanned per token.
    stops = frozenset(language_stops) if rm_stops else None
    lemmatizer = LemmaReplacer(language) if lemmatize else None
    if testing:
        filepaths = filepaths[:20]

    punkt = PunktLanguageVars()

    for filepath in filepaths:
        # Explicit encoding: do not depend on the platform default.
        # NOTE(review): assumes the plaintext corpus files are UTF-8 — confirm.
        with open(filepath, encoding='utf-8') as f:
            text = f.read()

        text = text_cleaner(text, rm_punctuation=True, rm_periods=True)
        words = [w.lower() for w in punkt.word_tokenize(text)]

        # One pass drops empty strings, one-character tokens and stopwords.
        # NOTE(review): stopwords are matched before j/v replacement, as in the
        # original ordering — confirm the stopword list uses the same spelling.
        words = [w for w in words
                 if len(w) > 1 and (not stops or w not in stops)]

        if jv_replacer is not None:
            words = [jv_replacer.replace(w) for w in words]
        if lemmatizer is not None:
            words = lemmatizer.lemmatize(words)

        # Workaround: the lemma 'edo1' is treated as an incorrect result and
        # rewritten to 'sum1'.
        yield ['sum1' if w == 'edo1' else w for w in words]

In [4]:
# First pass over the full PHI5 corpus (lemmatized, stopwords removed):
# stream the tokenized documents straight into the gensim Dictionary.
documents = gen_docs_lda(
    corpus='phi5',
    lemmatize=True,
    rm_stops=True,
    testing=False,
)
dictionary = Dictionary(documents=documents)

In [6]:
# Second pass over the corpus: a generator can only be consumed once, so a
# fresh one is created here rather than reusing the one fed to Dictionary.
documents = gen_docs_lda(
    corpus='phi5',
    lemmatize=True,
    rm_stops=True,
    testing=False,
)

# Convert every document to its bag-of-words vector and keep them in a list.
corpus = list(map(dictionary.doc2bow, documents))

# Train the LDA topic model on the bag-of-words corpus.
model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    workers=2,
    chunksize=2000,
    passes=5,
)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-6-8ffc960abe44> in <module>()
      1 documents = gen_docs_lda('phi5', lemmatize=True, rm_stops=True, testing=False)
----> 2 corpus = [dictionary.doc2bow(doc) for doc in documents]
      3 model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=20, workers=2, chunksize=2000, passes=5)

<ipython-input-6-8ffc960abe44> in <listcomp>(.0)
      1 documents = gen_docs_lda('phi5', lemmatize=True, rm_stops=True, testing=False)
----> 2 corpus = [dictionary.doc2bow(doc) for doc in documents]
      3 model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=20, workers=2, chunksize=2000, passes=5)

<ipython-input-3-194f92e2cc50> in gen_docs_lda(corpus, lemmatize, rm_stops, testing)
     51             if word == 'edo1':
     52                 word = 'sum1'
---> 53             new_words.append(word)
     54 
     55         yield new_words

KeyboardInterrupt: 

In [72]:
# Persist the trained model to disk.
# NOTE(review): hardcoded absolute path under /tmp — presumably scratch output;
# confirm whether it should live in a configurable output directory instead.
model.save('/tmp/lda_gensim_latin_multi.model')

In [67]:
# Display the model's `num_terms` attribute (presumably the vocabulary size — confirm against the gensim docs).
model.num_terms


Out[67]:
128818

In [73]:
# Display the number of topics of the model currently bound to `model`.
# NOTE(review): the recorded output for this cell is 100, while the training
# cell in this notebook passes num_topics=20 — the output presumably comes
# from an earlier run; re-run after training to refresh it.
model.num_topics


Out[73]:
100

In [77]:
# Display the topics as "weight*term + ..." strings; the argument 100
# presumably requests up to 100 topics — confirm against the installed
# gensim's show_topics signature.
model.show_topics(100)


Out[77]:
['0.002*negumate + 0.001*ningulus + 0.001*nouentium + 0.001*expliciunt + 0.001*paroptus + 0.001*admiscis + 0.001*ascaloniam + 0.001*sapidus + 0.001*turio + 0.001*luxus2',
 '0.002*mens + 0.002*eiero + 0.002*tangomenas + 0.002*quiritum + 0.002*nusquam + 0.001*laus + 0.001*retineo + 0.001*pelvis + 0.001*perimadeia + 0.001*esor',
 '0.001*kato + 0.001*spolieis + 0.001*atur + 0.001*tueis + 0.001*uisce + 0.001*deiuitiora + 0.001*lycori + 0.001*tyria + 0.001*post + 0.001*historia',
 '0.030*sum1 + 0.028*tu + 0.028*ego + 0.021*hic + 0.015*qui1 + 0.014*quis1 + 0.013*facio + 0.010*meus + 0.009*dico2 + 0.009*ille',
 '0.003*cluo + 0.002*latonius + 0.002*icadium + 0.002*rediculi + 0.002*iapydem + 0.002*boaulia + 0.001*scaeu + 0.001*eanum + 0.001*oscillo + 0.001*talla',
 '0.036*sum1 + 0.034*facio + 0.015*is + 0.015*eo1 + 0.013*oportet + 0.011*uto + 0.010*qui1 + 0.010*vinum + 0.010*sero1 + 0.009*fio',
 '0.002*alpinaque + 0.002*crystallus + 0.001*aridulus + 0.001*anquina + 0.001*genumana + 0.001*arateis + 0.001*saeculum + 0.001*psyllus + 0.001*smyrna1 + 0.001*eous',
 '0.001*largitas + 0.001*auertentiumque + 0.001*acrifolium + 0.001*siluaticum + 0.001*pruscum + 0.001*aspergetur + 0.001*aureoue + 0.001*ariesue + 0.001*filix + 0.001*sentis1',
 '0.000*sum1 + 0.000*qui1 + 0.000*is + 0.000*hic + 0.000*dico2 + 0.000*ille + 0.000*ego + 0.000*facio + 0.000*tu + 0.000*omne',
 '0.003*extinguunt + 0.002*ignis + 0.002*vivo + 0.002*unda + 0.002*verus + 0.001*orior + 0.001*exuro + 0.001*condo + 0.001*communio1 + 0.001*impero',
 '0.001*sectator + 0.001*lenocinium + 0.001*tributim + 0.001*prandium + 0.001*delenio + 0.001*sector2 + 0.001*delicatus + 0.001*gubernaculum + 0.000*foveo + 0.000*grex',
 '0.002*sestertius + 0.002*stymphali + 0.001*arcadiae + 0.001*ipse + 0.001*tricies + 0.001*paeana + 0.001*sexagies + 0.001*poenico + 0.001*cenito + 0.001*fucosus',
 '0.033*dico2 + 0.020*hic + 0.010*sum1 + 0.007*recingo + 0.006*pluralis + 0.005*scribo + 0.005*facio + 0.004*adverbium + 0.004*duo + 0.004*ille',
 '0.002*furenter + 0.002*noxa + 0.002*spondeo + 0.001*praesto2 + 0.001*sano + 0.001*prodiguae + 0.001*pecus1 + 0.001*illic + 0.001*febris + 0.001*conicio',
 '0.001*fabrici + 0.000*respicio + 0.000*varius1 + 0.000*senectus2 + 0.000*dubius + 0.000*fortuno + 0.000*judico + 0.000*cado + 0.000*homo + 0.000*sum1',
 '0.002*lycori + 0.002*scrupeda + 0.002*faueatis + 0.001*testatim + 0.001*†ita + 0.001*istic2 + 0.001*aestuo + 0.001*vehemens + 0.001*trittiles + 0.001*tradedeq',
 '0.000*sum1 + 0.000*qui1 + 0.000*is + 0.000*hic + 0.000*facio + 0.000*sui + 0.000*dico2 + 0.000*ille + 0.000*alius2 + 0.000*possum',
 '0.002*reparator + 0.002*pistum + 0.002*muries + 0.001*uestalem + 0.001*obaratorem + 0.001*subruncinatorem + 0.001*picumno + 0.001*ueruactorem + 0.001*puilia + 0.001*promitorem',
 '0.001*pulcher1 + 0.001*murrina + 0.001*φορβή + 0.000*is + 0.000*qui1 + 0.000*sum1 + 0.000*hic + 0.000*dico2 + 0.000*facio + 0.000*mitto',
 '0.024*sum1 + 0.021*do + 0.015*facio + 0.010*drachm + 0.009*hic + 0.007*venter + 0.006*verus + 0.006*fortis + 0.006*qui1 + 0.006*inicio',
 '0.028*sum1 + 0.015*is + 0.012*sui + 0.012*qui1 + 0.008*hostis + 0.008*bellus + 0.007*eo1 + 0.006*urbs + 0.006*magnus + 0.006*omne',
 '0.000*qui1 + 0.000*sum1 + 0.000*is + 0.000*hic + 0.000*dico2 + 0.000*possum + 0.000*ille + 0.000*sui + 0.000*tu + 0.000*video',
 '0.008*bellus + 0.007*fero + 0.006*hic + 0.006*qui1 + 0.005*magnus + 0.005*armo + 0.004*vir + 0.004*do + 0.004*campus1 + 0.004*dexter',
 '0.002*calceolus + 0.001*syriaci + 0.001*cliua + 0.001*gallograeciam + 0.001*†ne + 0.001*terentio + 0.001*confingo + 0.001*ementior + 0.001*africanus + 0.001*oppidum',
 '0.002*dubitatim + 0.001*indulgitas + 0.001*maurusii + 0.001*bellosus + 0.001*paucies + 0.001*amilcar + 0.001*oportuerint + 0.001*barcha + 0.001*nucerum + 0.001*exfundatus',
 '0.002*marcus1 + 0.001*priuernati + 0.001*albano + 0.001*tiburti + 0.001*ad-sedeo + 0.001*evenio + 0.000*fortis + 0.000*filius + 0.000*sum1 + 0.000*qui1',
 '0.031*pipo + 0.025*liquamen + 0.022*coquo + 0.018*oleum + 0.017*tero + 0.015*mel + 0.015*mitto + 0.013*modicus + 0.012*vinum + 0.012*jus',
 '0.005*praetor + 0.003*auspicium + 0.003*censor + 0.003*contio + 0.003*comitio + 0.003*avoco + 0.002*consul + 0.002*magistratus + 0.002*magistro + 0.002*collega',
 '0.004*praeverbium + 0.003*furax + 0.002*padusam + 0.002*memoriosus + 0.002*apiscuntur + 0.002*elinguo + 0.002*adsecula + 0.002*involo + 0.002*largitio + 0.001*ingemescerem',
 '0.024*sum1 + 0.024*qui1 + 0.018*hic + 0.015*ille + 0.009*possum + 0.008*facio + 0.008*tu + 0.007*sui + 0.007*dico2 + 0.007*quis1',
 '0.039*sum1 + 0.028*qui1 + 0.021*is + 0.018*hic + 0.012*dico2 + 0.012*tu + 0.011*ego + 0.010*ille + 0.010*possum + 0.008*video',
 '0.008*aeneus + 0.006*turnus + 0.003*molucrum + 0.002*narro + 0.002*sonticus + 0.002*sancum + 0.002*manuos + 0.002*pescia + 0.002*iunonis + 0.002*camillum2',
 '0.026*is + 0.025*qui1 + 0.020*hic + 0.013*sui + 0.013*sum1 + 0.011*caesar + 0.011*pars + 0.010*facio + 0.009*oppidum + 0.009*reliquus',
 '0.003*huc + 0.003*quirito + 0.003*voco + 0.002*illicium + 0.002*conventio + 0.002*nusquam + 0.002*vocito + 0.001*lustrum1 + 0.001*praeco + 0.001*fortuno',
 '0.044*sum1 + 0.030*is + 0.025*qui1 + 0.015*hic + 0.014*possum + 0.014*jus + 0.011*heres + 0.011*eo1 + 0.010*filius + 0.010*do',
 '0.002*sesquisenex + 0.002*pusa + 0.002*potoni + 0.001*pusus + 0.001*cascus + 0.001*mulus + 0.001*rusum + 0.001*mutuo + 0.000*ridiculus + 0.000*amicus1',
 '0.009*tu + 0.008*hic + 0.007*qui1 + 0.005*magnus + 0.005*for + 0.005*video + 0.005*ego + 0.005*bellus + 0.005*manus1 + 0.004*fero',
 '0.002*putus1 + 0.002*celerissimus + 0.002*purus + 0.002*auster1 + 0.001*conditrix + 0.001*hyperion + 0.001*carthaginienses + 0.001*proruo + 0.001*bacchor + 0.001*lectulus',
 '0.040*sum1 + 0.037*is + 0.020*qui1 + 0.020*possum + 0.016*libro + 0.013*facio + 0.012*heres + 0.010*debeo + 0.010*hic + 0.009*do',
 '0.001*uulgu + 0.001*rubii + 0.001*nux + 0.001*urbanus + 0.001*citreum + 0.001*genu + 0.001*castanea + 0.001*rusticus + 0.001*corpus + 0.001*paro1',
 '0.015*is + 0.011*hic + 0.009*sum1 + 0.009*alius2 + 0.007*voco + 0.006*dico2 + 0.006*qui1 + 0.006*verus + 0.006*folium + 0.005*idem',
 '0.004*juvo + 0.004*enos + 0.004*anulus1 + 0.003*triumphus + 0.003*lases + 0.002*salio1 + 0.002*satur + 0.002*fu + 0.002*sto + 0.002*for',
 '0.001*ructuosus + 0.001*parricidatum + 0.001*coam + 0.001*soporus + 0.001*excito + 0.001*clytaemestram + 0.001*ariona + 0.001*noscito + 0.001*sponda + 0.001*quadrantarius',
 '0.036*dico2 + 0.025*qui1 + 0.023*sum1 + 0.018*hic + 0.015*is + 0.007*alius2 + 0.006*unde + 0.006*sui + 0.005*facio + 0.005*vel',
 '0.003*fritinnio + 0.002*mollusca + 0.002*ic + 0.001*fanni + 0.001*–⚕–⚕– + 0.001*trittiles + 0.001*frunde + 0.001*persica + 0.001*pullo + 0.001*deligo1',
 '0.003*arbor1 + 0.002*ficana + 0.002*prox + 0.002*septimontio + 0.002*prop + 0.002*telinum + 0.002*arca + 0.002*fagutali + 0.002*nautea + 0.002*mons',
 '0.004*numero1 + 0.002*nimius + 0.002*afranius + 0.001*celeriter + 0.001*extinguerem + 0.001*oenomao + 0.001*nequicquam + 0.001*aethione + 0.001*ratitus + 0.001*caecilius',
 '0.002*apenninum + 0.002*pater + 0.002*mdccccxcu + 0.001*assyriorum + 0.001*archon + 0.001*nini + 0.001*assyrii + 0.001*condo + 0.001*dedico + 0.001*dives',
 '0.024*sum1 + 0.023*is + 0.019*qui1 + 0.017*hic + 0.009*eo1 + 0.008*quoque + 0.007*aqua + 0.006*possum + 0.006*facio + 0.006*idem',
 '0.033*longus + 0.031*brevis + 0.025*duo + 0.024*syllaba + 0.020*unus + 0.019*hic + 0.017*qui1 + 0.015*primus + 0.014*verro + 0.013*trochaeus',
 '0.003*opulesco + 0.003*noctesco + 0.003*lutesco + 0.002*viresco + 0.001*purpuro + 0.001*crudus + 0.001*manduco1 + 0.001*priamum + 0.001*priamique + 0.001*fulica',
 '0.002*amineas + 0.002*uua + 0.001*falerna + 0.001*oronti + 0.001*frutex + 0.001*eoae + 0.001*jocus + 0.001*flagello + 0.001*pergo + 0.000*quando',
 '0.001*catonis + 0.001*mucro + 0.001*grammaticus + 0.001*linquo + 0.001*en + 0.001*poeta + 0.001*tartaream + 0.001*criminator + 0.001*sequor + 0.001*novo',
 '0.001*a-mico + 0.001*uexabunt + 0.001*informo + 0.000*temperies + 0.000*sexus + 0.000*germino + 0.000*per-misceo + 0.000*femina + 0.000*virtus + 0.000*fingo',
 '0.003*exquaeras + 0.002*aduoces + 0.002*cannat + 0.002*cannatum + 0.002*trogi + 0.002*†orande + 0.002*adsitet + 0.002*†commeatum + 0.001*classicus + 0.001*auspicium',
 '0.005*aedes + 0.004*mons + 0.004*quinticeps + 0.004*sexticeps + 0.003*cis + 0.003*lucus1 + 0.003*collis + 0.003*terticeps + 0.003*quarticeps + 0.003*cespius',
 '0.001*do + 0.001*strideo + 0.001*patella1 + 0.001*cyma + 0.000*sum1 + 0.000*qui1 + 0.000*is + 0.000*hic + 0.000*dico2 + 0.000*possum',
 '0.002*bimaritus + 0.002*uoltiniensis + 0.001*uoltinia + 0.001*lacrimula + 0.001*teretinam + 0.001*cispio + 0.001*plotio + 0.001*bithynia + 0.001*tribulum + 0.001*rhodi',
 '0.001*tabularia + 0.001*horreae + 0.001*fumus + 0.001*navale + 0.001*sufes + 0.001*murus + 0.000*prosequor + 0.000*jaceo + 0.000*curia + 0.000*nonne',
 '0.001*litterator + 0.001*faliscum + 0.001*sabina + 0.001*causor + 0.001*exter + 0.000*spatior + 0.000*liber1 + 0.000*paulatim + 0.000*ludo + 0.000*converto',
 '0.020*aput + 0.004*sylla + 0.003*sanates + 0.003*serus + 0.002*subidus + 0.002*mespilum + 0.002*ludia + 0.002*cotonius + 0.002*praecox + 0.002*album',
 '0.002*μῦσαι + 0.001*conprimere + 0.001*musso + 0.000*graeco + 0.000*oculus + 0.000*graeci + 0.000*dico2 + 0.000*sum1 + 0.000*qui1 + 0.000*hic',
 '0.002*uepre + 0.002*callaici + 0.001*ibes + 0.001*volens + 0.001*sabratha + 0.001*trasimenna + 0.001*libycam + 0.001*asbytes + 0.001*suboles + 0.001*horridus',
 '0.001*†liberis + 0.001*dignosco + 0.001*uiolamus + 0.001*aponus + 0.000*extorris + 0.000*navigium + 0.000*obstruo + 0.000*prensis + 0.000*pugnax + 0.000*ualuit',
 '0.001*harunce + 0.001*hermundulis + 0.001*hermundulo + 0.001*raudus + 0.001*neque + 0.001*hostis + 0.001*quot + 0.001*ala + 0.001*servio + 0.001*aurum',
 '0.000*sum1 + 0.000*qui1 + 0.000*is + 0.000*hic + 0.000*tu + 0.000*magnus + 0.000*sui + 0.000*eo1 + 0.000*possum + 0.000*do',
 '0.024*qui1 + 0.019*sum1 + 0.013*is + 0.010*hic + 0.006*magnus + 0.005*alius2 + 0.005*eo1 + 0.004*habeo + 0.004*dico2 + 0.004*possum',
 '0.001*nampii + 0.001*desediato + 0.001*isicc + 0.001*ib· + 0.001*pasi + 0.001*icc + 0.001*sintprae + 0.001*abderae + 0.001*itrp + 0.001*cmse',
 '0.000*sum1 + 0.000*qui1 + 0.000*is + 0.000*hic + 0.000*sui + 0.000*tu + 0.000*facio + 0.000*possum + 0.000*habeo + 0.000*do',
 '0.001*ille + 0.001*aer + 0.001*taurus1 + 0.001*ferio + 0.001*cornus1 + 0.001*sicco + 0.001*cornu + 0.001*vernus + 0.001*aries + 0.001*praecido',
 '0.001*percandidus + 0.001*iaspius + 0.001*coxo + 0.001*tapetae + 0.001*debilis + 0.001*quatio + 0.001*beryllus + 0.001*vito + 0.001*smaragdus + 0.001*gibber1',
 '0.003*aegonis + 0.001*damoeta + 0.001*tityre + 0.001*fagus + 0.001*tegimen + 0.001*calumniatur + 0.001*atherbal + 0.001*pudicabus + 0.001*oleabus + 0.001*raptabus',
 '0.001*foedo + 0.001*sopitionibus + 0.001*ipsimae + 0.001*illati + 0.001*lapidauimus + 0.001*gratiasenataui + 0.001*erogatorius + 0.001*reporrigo + 0.001*donabo” + 0.001*curarem”',
 '0.002*caelitus + 0.001*mutuus + 0.001*gratulor + 0.001*densantur + 0.001*larissa + 0.001*intuo + 0.001*arpinati + 0.001*quousque + 0.001*lorico + 0.001*procreo',
 '0.007*topper + 0.002*inserinuntur + 0.001*circae + 0.001*maturus + 0.001*magmentum + 0.001*didus + 0.001*homones + 0.001*pacui + 0.001*erumnam + 0.001*odyssia',
 '0.008*sum1 + 0.008*qui1 + 0.008*tu + 0.007*fero + 0.007*hic + 0.006*do + 0.006*ille + 0.005*venio + 0.005*magnus + 0.004*ego',
 '0.023*sum1 + 0.020*qui1 + 0.018*corpus + 0.016*possum + 0.014*omne + 0.013*res + 0.012*video + 0.009*sui + 0.009*natura + 0.009*terra',
 '0.062*hic + 0.054*littera + 0.027*scribo + 0.016*ablativus + 0.016*habeo + 0.015*dico2 + 0.015*sum1 + 0.014*genetivus + 0.013*puto + 0.011*syllaba',
 '0.000*sum1 + 0.000*qui1 + 0.000*hic + 0.000*is + 0.000*dico2 + 0.000*omne + 0.000*tu + 0.000*sui + 0.000*possum + 0.000*facio',
 '0.002*aestifero + 0.002*cretaeae + 0.002*ecfundet + 0.001*uolsonem + 0.001*squamigerae + 0.001*rhodanumue + 0.001*†superfluit + 0.001*⌊oli + 0.001*abdi + 0.001*tricastinis',
 '0.002*euthia + 0.002*gausapa + 0.001*modulus + 0.001*gladiola + 0.001*caesareumque + 0.001*agamemnoniis + 0.001*armenii + 0.001*fretus + 0.001*angustia + 0.001*felicitas1',
 '0.037*sum1 + 0.022*qui1 + 0.021*is + 0.010*hic + 0.010*habeo + 0.007*dico2 + 0.007*uto + 0.007*verbum + 0.007*scribo + 0.006*facio',
 '0.002*perpusillus + 0.001*iubilatuicinus + 0.001*buccoquis + 0.001*circumvenio + 0.000*accuso + 0.000*antiquus + 0.000*rogo + 0.000*io1 + 0.000*liceo1 + 0.000*ego',
 '0.002*tudites + 0.002*valentia1 + 0.002*nefrendes + 0.002*romen + 0.001*ocris + 0.001*struppus + 0.001*nuscitiosum + 0.001*uacerram + 0.001*naucum + 0.001*orchita',
 '0.002*cascus + 0.002*†delia + 0.002*titano + 0.001*mirabile + 0.001*castus1 + 0.001*gemini + 0.001*latona + 0.001*nuptiae + 0.001*deus + 0.001*pario2',
 '0.000*sum1 + 0.000*alpinaque + 0.000*crystallus + 0.000*qui1 + 0.000*hic + 0.000*anquina + 0.000*aridulus + 0.000*genumana + 0.000*arateis + 0.000*tu',
 '0.035*sum1 + 0.026*qui1 + 0.017*ager + 0.015*hic + 0.011*quidam + 0.011*is + 0.010*finis + 0.010*debeo + 0.010*linea + 0.009*dico2',
 '0.038*sum1 + 0.024*is + 0.019*qui1 + 0.016*libro + 0.013*hic + 0.011*do + 0.010*possum + 0.009*debeo + 0.009*heres + 0.009*habeo',
 '0.002*ecquando + 0.001*argumentor + 0.001*perpudesco + 0.001*assero2 + 0.000*genius + 0.000*nomino + 0.000*gigno + 0.000*pauso + 0.000*vindex + 0.000*dejero',
 '0.002*deus + 0.002*scruppedam + 0.002*flauisas + 0.001*adagium + 0.001*hectora + 0.001*progenitor + 0.001*accius + 0.001*hectorem + 0.001*scipio1 + 0.001*vetus',
 '0.000*qui1 + 0.000*is + 0.000*sum1 + 0.000*hic + 0.000*facio + 0.000*possum + 0.000*sui + 0.000*dico2 + 0.000*magnus + 0.000*eo1',
 '0.002*adorio + 0.001*nidor + 0.001*primitiae + 0.000*emico + 0.000*ardor + 0.000*sponte + 0.000*nubo + 0.000*sterno + 0.000*sanguis + 0.000*manipulus',
 '0.010*qui1 + 0.008*princeps + 0.008*is + 0.005*augustus1 + 0.005*senatus + 0.005*miles + 0.005*sui + 0.004*imperium + 0.004*legio + 0.004*imperator',
 '0.008*fiscus + 0.007*qu + 0.005*um + 0.004*no1 + 0.004*sa + 0.003*defero + 0.003*ta + 0.003*quadrantal + 0.002*cu + 0.002*em',
 '0.000*sum1 + 0.000*qui1 + 0.000*hic + 0.000*is + 0.000*ego + 0.000*facio + 0.000*habeo + 0.000*tu + 0.000*ille + 0.000*magnus',
 '0.002*macerrimum + 0.002*litorosus + 0.001*devenio + 0.001*rhyndacus + 0.001*aeger + 0.001*amitto + 0.001*aeneus + 0.001*gauri + 0.001*apisci + 0.001*garadum',
 '0.009*facio + 0.007*ago + 0.006*aedilis + 0.005*curulis + 0.005*uxor + 0.004*hic + 0.004*ludo + 0.004*do + 0.004*flaccus1 + 0.004*tero',
 '0.008*senex + 0.006*sui + 0.006*is + 0.005*uxor + 0.005*capio + 0.005*miles + 0.004*emo + 0.004*leno2 + 0.004*suo + 0.004*servus1',
 '0.003*persulto + 0.003*chalybis + 0.003*phlegethontis + 0.002*marmarides + 0.002*imperterritus + 0.001*seminecum + 0.001*cymes + 0.001*battiadas + 0.001*scyllaei + 0.001*uuiferis',
 '0.001*syrma + 0.000*eludo + 0.000*dolus + 0.000*fluctus + 0.000*malus + 0.000*tragicus + 0.000*quis1 + 0.000*qui1 + 0.000*facio + 0.000*verro']

In [ ]: