In [8]:
import datetime as dt
from cltk.corpus.utils.importer import CorpusImporter
In [2]:
# Build the importer for Greek; the cells below call it to list and import corpora.
corpus_importer = CorpusImporter('greek')
In [3]:
# Display the corpora this importer knows about (attribute access, not a call).
corpus_importer.list_corpora
Out[3]:
In [4]:
# Import the 'tlg' corpus from a local directory.
# NOTE: the path is specific to the author's machine; point it at your own TLG_E copy.
corpus_importer.import_corpus('tlg', '/root/classics_corpora/TLG_E')
See the CLTK documentation on converting TLG texts with TLGU: http://docs.cltk.org/en/latest/greek.html#converting-tlg-texts-with-tlgu
In [5]:
from cltk.corpus.greek.tlgu import TLGU
In [6]:
# Import the 'greek_software_tlgu' corpus; unlike the 'tlg' import, no local path is passed.
# Presumably this provides the converter that the TLGU class drives -- confirm in the CLTK docs.
corpus_importer.import_corpus('greek_software_tlgu')
In [10]:
# Instantiate the TLGU wrapper; `t` is used in the next cell to convert the corpus.
t = TLGU()
In [11]:
# Convert the 'tlg' corpus with TLGU and report the elapsed wall-clock time.
# A timezone-aware "now" is used because datetime.utcnow() is deprecated since
# Python 3.12; the printed timedelta has the same format as before.
t0 = dt.datetime.now(dt.timezone.utc)
t.convert_corpus(corpus='tlg')
print('... finished in {}'.format(dt.datetime.now(dt.timezone.utc) - t0))
The TLG corpus is now ready to use in Unicode. Some preprocessing is likely still required, as the text still contains the formatting and line breaks of the original printed text.
In [12]:
# Preview the start of one converted file to sanity-check the conversion output.
# The encoding is given explicitly so the read does not depend on the platform's
# locale default (assumes the converted files are UTF-8 -- TODO confirm).
with open('/root/cltk_data/greek/text/tlg/plaintext/TLG0007.TXT', encoding='utf-8') as file_open:
    # read(1500) returns at most the first 1500 characters without loading the whole file.
    text_snippet = file_open.read(1500)
print(text_snippet)
In [ ]: