In [1]:
from cltk.corpus.greek.tlg.parse_tlg_indices import select_id_by_name
In [2]:
# Query the TLG author index for 'Aeschylus'. The trailing bare name makes
# the notebook echo the lookup result as this cell's output.
# NOTE(review): presumably returns the matching TLG author ID(s) — the
# recorded output is not visible here, confirm against the CLTK docs.
aeschylus_matches = select_id_by_name('Aeschylus')
aeschylus_matches
Out[2]:
In [3]:
import os

# Build the path to the author's plaintext file inside the user's CLTK data
# directory; expanduser() replaces the leading '~' with the home directory.
# NOTE(review): 'TLG0085' is presumably the ID reported for Aeschylus by the
# author lookup earlier in the notebook — confirm.
tlg_plaintext_dir = '~/cltk_data/greek/text/tlg/plaintext/'
author_fp = os.path.expanduser(tlg_plaintext_dir + 'TLG0085.TXT')
In [4]:
# Echo the expanded path as the cell output so the resolved location of the
# author file can be checked before it is opened.
author_fp
Out[4]:
In [5]:
# Read the entire author file into memory as a single string.
# The encoding is passed explicitly so that decoding the Greek text does not
# depend on the platform's default locale encoding.
# NOTE(review): assumes the plaintext corpus is stored as UTF-8 (the output
# format of the TLG converter used by CLTK) — confirm for your install.
with open(author_fp, encoding='utf-8') as file_open:
    aes_raw = file_open.read()
In [6]:
# Peek at the first 1000 characters of the unprocessed text.
raw_preview = aes_raw[:1000]
print(raw_preview)
In [7]:
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
In [8]:
# Run the CLTK plaintext cleanup over the raw text with both removal flags
# (punctuation and periods) switched on.
cleanup_options = {'rm_punctuation': True, 'rm_periods': True}
aes_clean = tlg_plaintext_cleanup(aes_raw, **cleanup_options)
In [9]:
# Peek at the first 1000 characters of the cleaned text.
clean_preview = aes_clean[:1000]
print(clean_preview)
In [10]:
# Tokenize the cleaned text on runs of whitespace, then show the first
# 30 tokens as a sanity check.
aes_unigrams = aes_clean.split()
first_tokens = aes_unigrams[:30]
print(first_tokens)
In [11]:
# Token count: every whitespace-separated word, repeats included.
total_words = len(aes_unigrams)
print('Total words: {}'.format(total_words))
In [12]:
# Type count: collapse the token list to its distinct word forms.
# Comparison is by exact string, so differently cased forms stay distinct.
unique_words = {token for token in aes_unigrams}
unique_count = len(unique_words)
print('Total unique words: {}'.format(unique_count))
In [13]:
# Lexical diversity (type-token ratio): distinct words divided by all words.
# An empty token list yields 0.0 instead of raising ZeroDivisionError.
aes_ratio = unique_count / total_words if total_words else 0.0
In [14]:
# Report the lexical-diversity ratio computed for the author.
print('Lexical diversity of Aeschylus', aes_ratio)