In [2]:
import datetime as dt
import os
import sys

from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from nltk.tokenize.punkt import PunktLanguageVars
import pandas

In [3]:
p = PunktLanguageVars()

Lexical diveristy by author

Original text with diacritics

Get status about corpus


In [4]:
t0 = dt.datetime.utcnow()

cleaned_dir = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext_clean')
dir_contents = os.listdir(cleaned_dir)

corpus_stats = {}

all_tokens_list = []
for doc_count, file in enumerate(dir_contents):
    file_path = os.path.join(cleaned_dir, file)
    with open(file_path) as fo:
        text = fo.read().lower()
    text = ''.join([char for char in text if char not in ['.']])
    tokens = p.word_tokenize(text)
    all_tokens_list += tokens

print('Total author files:', doc_count)
print('Total words:', len(all_tokens_list))
all_tokens_unique = set(all_tokens_list)
print('Total unique words:', len(all_tokens_unique))

corpus_stats = {'doc_count': doc_count, 
               'total_words': len(all_tokens_list),
               'total_unique_words': len(all_tokens_unique)}

print('... finished in {}'.format(dt.datetime.utcnow() - t0))


Total author files: 1822
Total words: 72057716
Total unique words: 1515193
... finished in 0:03:38.839884

In [5]:
df_corpus = pandas.DataFrame(corpus_stats, index=[0])
print(df_corpus)


   doc_count  total_unique_words  total_words
0       1822             1515193     72057716

Get stats per author


In [6]:
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author

In [7]:
map_id_author = get_id_author()

In [8]:
t0 = dt.datetime.utcnow()

# Words and unique words per author
map_id_word_counts = {}
for file in dir_contents:
    map_word_counts = {}
    file_path = os.path.join(cleaned_dir, file)
    author_id = file[3:-4]
    author = map_id_author[author_id]
    with open(file_path) as fo:
        text = fo.read().lower()
    text = ''.join([char for char in text if char not in ['.']])
    tokens = p.word_tokenize(text)
    map_word_counts['name'] = author
    map_word_counts['epithet'] = get_epithet_of_author(author_id)
    map_word_counts['word_count_all'] = len(tokens)
    map_word_counts['word_count_unique'] = len(set(tokens))
    try:
        lexical_diversity = len(set(tokens)) / len(tokens)
    except ZeroDivisionError:
        lexical_diversity = 0
    map_word_counts['lexical_diversity'] = lexical_diversity
    
    map_id_word_counts[author_id] = map_word_counts
#     print(author)
#     print('    ', 'Total words:', len(tokens))
#     print('    ', 'Total unique words:', len(set(tokens)))
#     print('    ', 'Lexical diversity:', lexical_diversity)

print('... finished in {}'.format(dt.datetime.utcnow() - t0))


... finished in 0:03:44.275525

In [9]:
df_text_counts = pandas.DataFrame(map_id_word_counts).T

In [10]:
df_text_counts


Out[10]:
epithet lexical_diversity name word_count_all word_count_unique
0001 Epici/-ae 0.34366 Apollonius Rhodius Epic. 39155 13456
0002 Elegiaci 0.398142 Theognis Eleg. 9798 3901
0003 Historici/-ae 0.150405 Thucydides Hist. 150427 22625
0004 Biographi 0.222551 Diogenes Laertius Biogr. 110977 24698
0005 Bucolici 0.439063 Theocritus Bucol. 21719 9536
0006 Tragici 0.19995 Euripides Trag. 184076 36806
0007 Biographi 0.110099 Plutarchus Biogr. et Phil. 1034650 113914
0008 Sophistae 0.151358 Athenaeus Soph. 394588 59724
0009 Lyrici/-ae 0.674388 Sappho Lyr. 3756 2533
0010 Oratores 0.139739 Isocrates Orat. 120603 16853
0011 Tragici 0.288234 Sophocles Trag. 73423 21163
0012 Epici/-ae 0.155017 Homerus Epic., Homer 199617 30944
0013 None 0.393614 Hymni Homerici, Homeric Hymns 16036 6312
0014 Oratores 0.112017 Demosthenes Orat. 297535 33329
0015 Historici/-ae 0.255029 Herodianus Hist. 46928 11968
0016 Historici/-ae 0.158366 Herodotus Hist. 185779 29421
0017 Oratores 0.193321 Isaeus Orat. 32878 6356
0018 Philosophici/-ae 0.139545 Philo Judaeus Phil. 449869 62777
0019 Comici 0.244177 Aristophanes Comic. 112787 27540
0020 Epici/-ae 0.390019 Hesiodus Epic. 25168 9816
0022 Epici/-ae 0.517786 Nicander Epic. 11976 6201
0023 Epici/-ae 0.398901 Oppianus Epic. 22755 9077
0024 Epici/-ae 0.491281 Oppianus Epic. 13477 6621
0026 Oratores 0.232688 Aeschines Orat. 48984 11398
0027 Oratores 0.27413 Andocides Orat. 17623 4831
0028 Oratores 0.254137 Antiphon Orat. 21390 5436
0029 Oratores 0.302597 Dinarchus Orat. 17710 5359
0030 Oratores 0.330946 Hyperides Orat. 16894 5591
0031 None 0.134781 Novum Testamentum, New Testament 138907 18722
0032 Historici/-ae 0.127116 Xenophon Hist. 317174 40318
... ... ... ... ... ...
5031 None 0.321937 Scholia In Nicandrum 51100 16451
5032 None 0.364637 Scholia In Oppianum 70108 25564
5033 None 0.57196 Scholia In Pausaniam 806 461
5034 None 0.143688 Scholia In Pindarum 281729 40481
5035 None 0.250535 Scholia In Platonem 64482 16155
5037 None 0.186222 Scholia In Sophoclem 160631 29913
5038 None 0.291173 Scholia In Theocritum 49565 14432
5039 None 0.219468 Scholia In Thucydidem 93649 20553
5040 None 0.474512 Scholia In Xenophontem 4198 1992
5045 Rhetorici 0.310473 Anonymi In Aphthonium Rhet. 4316 1340
5046 None 0.467742 Scholia In Theonem Rhetorem 1178 551
5048 None 0.444405 Scholia In Clementem Alexandrinum 11233 4992
5052 None 0.238008 Scholia in Maximum Confessorem 20701 4927
7000 None 0.311176 Anthologia Graeca, AG 133198 41448
7051 None 0.181684 Doctrina Patrum 63682 11570
7052 None 0.355126 Anthologiae Graecae Appendix 77367 27475
9003 Lexicographi 0.952381 Anonymus Lexicographus Lexicogr. 42 40
9004 None 0.134826 Anonymi In Aristotelis Librum Alterum Analytic... 17111 2307
9006 Paroemiographi 0.281114 Gregorius Paroemiogr. 10839 3047
9007 None 0.466233 Appendix Proverbiorum 7848 3659
9008 Paroemiographi 0.452399 Macarius Chrysocephalus Paroemiogr. 7794 3526
9009 Paroemiographi 0.32256 Michael Apostolius Paroemiogr. 52970 17086
9010 None 0.189172 Suda, Suidas 617130 116744
9012 Poetae 0.381793 Ignatius Biogr. et Poeta 20223 7721
9018 Paroemiographi 0.398801 Arsenius Paroemiogr. 22693 9050
9019 Philosophici/-ae 0.130852 Stephanus Phil. 25823 3379
9020 Grammatici 0.287102 Stephanus Gramm. 19871 5705
9021 Alchemistae 0.287719 Stephanus Alchem. 16815 4838
9022 Poetae 0.385695 Joannes Tzetzes Gramm. et Poeta 6767 2610
9023 Philologi 0.280107 Thomas Magister Philol. 45704 12802

1823 rows × 5 columns


In [11]:
df_text_counts.to_csv(os.path.expanduser('~/cltk_data/user_data/stats_text_counts.csv'))

Stats by author


In [12]:
from statistics import mean
from statistics import stdev

In [13]:
author_stats = {}
corpus_word_count_all = []
corpus_word_count_unique = []
corpus_word_lexical_diversity = []
for author_id, map_counts in map_id_word_counts.items():
    corpus_word_count_all.append(map_counts['word_count_all'])
    corpus_word_count_unique.append(map_counts['word_count_unique'])
    corpus_word_lexical_diversity.append(map_counts['lexical_diversity'])

author_stats['mean_words_per_author'] = mean(corpus_word_count_all)
author_stats['standard_deviation_of_words_per_author:'] = stdev(corpus_word_count_all)
author_stats['mean_unique_words_per_author'] = mean(corpus_word_count_unique)
author_stats['standard_deviation_of_unique_words_per_author'] = stdev(corpus_word_count_unique)
author_stats['lexical_diversity_per_author'] = mean(corpus_word_lexical_diversity)
author_stats['standard_deviation_of_lexical_diversity_per_author:'] = stdev(corpus_word_lexical_diversity)

print('Mean words per author:', mean(corpus_word_count_all))
print('Standard deviation of words per author:', stdev(corpus_word_count_all))

print('Mean unique words per author:', mean(corpus_word_count_unique))
print('Standard deviation of unique words per author:', stdev(corpus_word_count_unique))

print('Lexical diversity per author:', mean(corpus_word_lexical_diversity))
print('Standard deviation of lexical diversity per author:', stdev(corpus_word_lexical_diversity))


Mean words per author: 39526.99725726824
Standard deviation of words per author: 174923.28976653758
Mean unique words per author: 5435.820076796489
Standard deviation of unique words per author: 14195.290142159112
Lexical diversity per author: 0.5171187962883808
Standard deviation of lexical diversity per author: 0.2732410961564417

In [14]:
df_authors = pandas.DataFrame(author_stats, index=[0])
df_authors


Out[14]:
lexical_diversity_per_author mean_unique_words_per_author mean_words_per_author standard_deviation_of_lexical_diversity_per_author: standard_deviation_of_unique_words_per_author standard_deviation_of_words_per_author:
0 0.517119 5435.820077 39526.997257 0.273241 14195.290142 174923.289767

In [15]:
df_authors.to_csv(os.path.expanduser('~/cltk_data/user_data/stats_authors.csv'))

Get stats about epithets


In [16]:
from collections import defaultdict
import datetime as dt
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets

In [17]:
list_epithets = get_epithets()

In [18]:
t0 = dt.datetime.utcnow()

map_epithet_counts_all = defaultdict(list)
map_epithet_counts_unique = defaultdict(list)
map_epithet_lexical_diversity = defaultdict(list)
for file in dir_contents:
    map_word_counts = defaultdict(list)
    file_path = os.path.join(cleaned_dir, file)
    author_id = file[3:-4]
    author = map_id_author[author_id]
    with open(file_path) as fo:
        text = fo.read().lower()
    text = ''.join([char for char in text if char not in ['.']])
    tokens = p.word_tokenize(text)
    try:
        lexical_diversity = len(set(tokens)) / len(tokens)
    except ZeroDivisionError:
        lexical_diversity = 0
    epithet = get_epithet_of_author(author_id)

    map_epithet_counts_all[epithet].append(len(tokens))
    map_epithet_counts_unique[epithet].append(len(set(tokens)))
    map_epithet_lexical_diversity[epithet].append(lexical_diversity)

print('... finished in {}'.format(dt.datetime.utcnow() - t0))


... finished in 0:03:44.608805

In [24]:
from statistics import StatisticsError

epithet_lexical_diversity_tuples = []
epithet_scores = {}
for epithet, counts in map_epithet_counts_all.items():
    print(epithet)
    print('    Mean of word counts per author:', mean(counts))
    try:
        wc_standard_deviation = stdev(counts)
    except StatisticsError:
        wc_standard_deviation = 0
    print('    Standard deviation of word counts per author:', wc_standard_deviation)
    
    uniques_list = map_epithet_counts_unique[epithet]
    print('    Mean of unique word counts per author:', mean(uniques_list))
    try:
        uniques_standard_deviation = stdev(uniques_list)
    except StatisticsError:
        uniques_standard_deviation = 0
    print('    Standard deviation of unique word counts per author:', uniques_standard_deviation)

    lexical_diversity_list = map_epithet_lexical_diversity[epithet]
    print('    Mean of lexical diversity per author:', mean(lexical_diversity_list))
    try:
        ld_standard_deviation = stdev(lexical_diversity_list)
    except StatisticsError:
        ld_standard_deviation = 0
    print('    Standard deviation of lexical diversity:', ld_standard_deviation)
    
    epithet_lexical_diversity_tuples.append((epithet, mean(lexical_diversity_list)))

    tmp_scores = {}
    tmp_scores['mean_of_word_counts_ per_author'] = mean(counts)
    tmp_scores['standard_deviation_of_word_counts_per_author'] = wc_standard_deviation
    tmp_scores['mean_of_unique_word_counts_per_author'] = mean(uniques_list)
    tmp_scores['standard_deviation_of_unique_word_counts_per_author'] = uniques_standard_deviation
    tmp_scores['mean_of_lexical_diversity_per_author'] = mean(lexical_diversity_list)
    tmp_scores['standard_deviation_of_lexical_diversity'] = ld_standard_deviation
    epithet_scores[epithet] = tmp_scores


Mechanici
    Mean of word counts per author: 27850.714285714286
    Standard deviation of word counts per author: 58272.68047926142
    Mean of unique word counts per author: 3668.4285714285716
    Standard deviation of unique word counts per author: 4857.279000192832
    Mean of lexical diversity per author: 0.30923266180627396
    Standard deviation of lexical diversity: 0.1190888649052873
Mimographi
    Mean of word counts per author: 2752
    Standard deviation of word counts per author: 3008.032247167573
    Mean of unique word counts per author: 1479
    Standard deviation of unique word counts per author: 1438.2551929334377
    Mean of lexical diversity per author: 0.6253901209264193
    Standard deviation of lexical diversity: 0.16095147451789965
Hymnographi
    Mean of word counts per author: 126901
    Standard deviation of word counts per author: 0
    Mean of unique word counts per author: 24999
    Standard deviation of unique word counts per author: 0
    Mean of lexical diversity per author: 0.19699608356120124
    Standard deviation of lexical diversity: 0
Philologi
    Mean of word counts per author: 260571
    Standard deviation of word counts per author: 554261.013120849
    Mean of unique word counts per author: 29787.11111111111
    Standard deviation of unique word counts per author: 48661.146589051015
    Mean of lexical diversity per author: 0.2882825748650545
    Standard deviation of lexical diversity: 0.1304755800246082
Iambici
    Mean of word counts per author: 610.5
    Standard deviation of word counts per author: 966.6762046390635
    Mean of unique word counts per author: 423.5
    Standard deviation of unique word counts per author: 658.1576558849711
    Mean of lexical diversity per author: 0.7662045014476172
    Standard deviation of lexical diversity: 0.2688812502534106
Poetae
    Mean of word counts per author: 8495.23076923077
    Standard deviation of word counts per author: 23588.759891838694
    Mean of unique word counts per author: 2542.923076923077
    Standard deviation of unique word counts per author: 5088.694100829982
    Mean of lexical diversity per author: 0.5715269592682862
    Standard deviation of lexical diversity: 0.350695968788658
Medici
    Mean of word counts per author: 132242.07894736843
    Standard deviation of word counts per author: 414574.0207429061
    Mean of unique word counts per author: 14082.105263157895
    Standard deviation of unique word counts per author: 26032.439765622858
    Mean of lexical diversity per author: 0.33949473725041757
    Standard deviation of lexical diversity: 0.234759640350198
Scriptores Ecclesiastici
    Mean of word counts per author: 203310.41379310345
    Standard deviation of word counts per author: 587654.3703291191
    Mean of unique word counts per author: 19739.91379310345
    Standard deviation of unique word counts per author: 29989.26318118424
    Mean of lexical diversity per author: 0.3207979204601886
    Standard deviation of lexical diversity: 0.19043146387928753
Periegetae
    Mean of word counts per author: 28730
    Standard deviation of word counts per author: 71070.64200673862
    Mean of unique word counts per author: 5329.444444444444
    Standard deviation of unique word counts per author: 10078.459953176267
    Mean of lexical diversity per author: 0.42366081412770284
    Standard deviation of lexical diversity: 0.14377276157425073
Paradoxographi
    Mean of word counts per author: 3216.4444444444443
    Standard deviation of word counts per author: 4577.134341242102
    Mean of unique word counts per author: 1278
    Standard deviation of unique word counts per author: 1403.2975450701822
    Mean of lexical diversity per author: 0.5526884575900334
    Standard deviation of lexical diversity: 0.16815577249748542
Alchemistae
    Mean of word counts per author: 4077.2272727272725
    Standard deviation of word counts per author: 8089.548341808638
    Mean of unique word counts per author: 1223.3636363636363
    Standard deviation of unique word counts per author: 1752.24526836841
    Mean of lexical diversity per author: 0.4830936632872289
    Standard deviation of lexical diversity: 0.17351700128167993
Tragici
    Mean of word counts per author: 4161.788235294118
    Standard deviation of word counts per author: 22768.237305558832
    Mean of unique word counts per author: 1114.6705882352942
    Standard deviation of unique word counts per author: 5230.739015865812
    Mean of lexical diversity per author: 0.6186934024311059
    Standard deviation of lexical diversity: 0.4210897971414373
Poetae Didactici
    Mean of word counts per author: 310
    Standard deviation of word counts per author: 0
    Mean of unique word counts per author: 228
    Standard deviation of unique word counts per author: 0
    Mean of lexical diversity per author: 0.7354838709677419
    Standard deviation of lexical diversity: 0
Onirocritici
    Mean of word counts per author: 38698.5
    Standard deviation of word counts per author: 36553.88505343857
    Mean of unique word counts per author: 7283.5
    Standard deviation of unique word counts per author: 8166.3762159234375
    Mean of lexical diversity per author: 0.1598640506435638
    Standard deviation of lexical diversity: 0.060021036591121714
Poetae Medici
    Mean of word counts per author: 532
    Standard deviation of word counts per author: 397.98576188937545
    Mean of unique word counts per author: 374.75
    Standard deviation of unique word counts per author: 307.5097559427993
    Mean of lexical diversity per author: 0.7121893719806763
    Standard deviation of lexical diversity: 0.1684456565310668
Epici/-ae
    Mean of word counts per author: 8508.848484848484
    Standard deviation of word counts per author: 31548.93517171376
    Mean of unique word counts per author: 2118.121212121212
    Standard deviation of unique word counts per author: 5666.842950994445
    Mean of lexical diversity per author: 0.5901755381170039
    Standard deviation of lexical diversity: 0.33857453628977313
Elegiaci
    Mean of word counts per author: 601.3703703703703
    Standard deviation of word counts per author: 1892.1529688558105
    Mean of unique word counts per author: 300.48148148148147
    Standard deviation of unique word counts per author: 766.5725100115534
    Mean of lexical diversity per author: 0.7829407154009786
    Standard deviation of lexical diversity: 0.2874134683446021
Scriptores Erotici
    Mean of word counts per author: 24547.375
    Standard deviation of word counts per author: 26112.565305601165
    Mean of unique word counts per author: 6774.875
    Standard deviation of unique word counts per author: 6261.752001237353
    Mean of lexical diversity per author: 0.40319032343438627
    Standard deviation of lexical diversity: 0.1804668508503265
Geometri
    Mean of word counts per author: 113808
    Standard deviation of word counts per author: 97741.43902153273
    Mean of unique word counts per author: 4100.75
    Standard deviation of unique word counts per author: 1907.860472711077
    Mean of lexical diversity per author: 0.046396734394107485
    Standard deviation of lexical diversity: 0.017133951688166348
Philosophici/-ae
    Mean of word counts per author: 51075.875555555554
    Standard deviation of word counts per author: 179693.3744019452
    Mean of unique word counts per author: 5046.333333333333
    Standard deviation of unique word counts per author: 11373.068974634143
    Mean of lexical diversity per author: 0.4755907773618297
    Standard deviation of lexical diversity: 0.24403259165874935
Biographi
    Mean of word counts per author: 139125.22222222222
    Standard deviation of word counts per author: 337874.1998139906
    Mean of unique word counts per author: 18505.444444444445
    Standard deviation of unique word counts per author: 36628.98858865445
    Mean of lexical diversity per author: 0.3489969555756385
    Standard deviation of lexical diversity: 0.16288958090681266
Doxographi
    Mean of word counts per author: 20857
    Standard deviation of word counts per author: 10690.040317978226
    Mean of unique word counts per author: 4847
    Standard deviation of unique word counts per author: 1711.198410471445
    Mean of lexical diversity per author: 0.24332716760648826
    Standard deviation of lexical diversity: 0.04267050974259767
Hagiographi
    Mean of word counts per author: 577026
    Standard deviation of word counts per author: 0
    Mean of unique word counts per author: 81563
    Standard deviation of unique word counts per author: 0
    Mean of lexical diversity per author: 0.14135064971075828
    Standard deviation of lexical diversity: 0
Tactici
    Mean of word counts per author: 17298
    Standard deviation of word counts per author: 14257.208254072744
    Mean of unique word counts per author: 4335.166666666667
    Standard deviation of unique word counts per author: 2317.036332616877
    Mean of lexical diversity per author: 0.2811319092652864
    Standard deviation of lexical diversity: 0.05898116623511755
Astrologici
    Mean of word counts per author: 40658.07142857143
    Standard deviation of word counts per author: 49684.96543602623
    Mean of unique word counts per author: 6993.571428571428
    Standard deviation of unique word counts per author: 6760.877613881952
    Mean of lexical diversity per author: 0.2667317612551159
    Standard deviation of lexical diversity: 0.11512053657071745
Chronographi
    Mean of word counts per author: 167315.33333333334
    Standard deviation of word counts per author: 128769.99315316697
    Mean of unique word counts per author: 26193.166666666668
    Standard deviation of unique word counts per author: 16527.893288821375
    Mean of lexical diversity per author: 0.192453839756767
    Standard deviation of lexical diversity: 0.07109978057978761
Mythographi
    Mean of word counts per author: 8067.428571428572
    Standard deviation of word counts per author: 9161.463363043098
    Mean of unique word counts per author: 2916
    Standard deviation of unique word counts per author: 2726.469206379073
    Mean of lexical diversity per author: 0.44350822261989964
    Standard deviation of lexical diversity: 0.10409992530376733
Oratores
    Mean of word counts per author: 54713.083333333336
    Standard deviation of word counts per author: 83232.85401533385
    Mean of unique word counts per author: 8986.25
    Standard deviation of unique word counts per author: 8894.274369963061
    Mean of lexical diversity per author: 0.28408280368872196
    Standard deviation of lexical diversity: 0.14306279635893515
Comici
    Mean of word counts per author: 2135.9527027027025
    Standard deviation of word counts per author: 11381.961100546177
    Mean of unique word counts per author: 596.1081081081081
    Standard deviation of unique word counts per author: 2589.913029126116
    Mean of lexical diversity per author: 0.5163583294667708
    Standard deviation of lexical diversity: 0.2271212563395277
Epigrammatici/-ae
    Mean of word counts per author: 2778.913043478261
    Standard deviation of word counts per author: 12372.339754148963
    Mean of unique word counts per author: 762.5652173913044
    Standard deviation of unique word counts per author: 2912.8728996721206
    Mean of lexical diversity per author: 0.8487669388459786
    Standard deviation of lexical diversity: 0.1713862514400121
Gnostici
    Mean of word counts per author: 1032
    Standard deviation of word counts per author: 1412.799348810722
    Mean of unique word counts per author: 431.5
    Standard deviation of unique word counts per author: 572.0493859799169
    Mean of lexical diversity per author: 0.614900854930397
    Standard deviation of lexical diversity: 0.28748269520242653
Musici
    Mean of word counts per author: 12335.2
    Standard deviation of word counts per author: 14978.744229741023
    Mean of unique word counts per author: 3010.6
    Standard deviation of unique word counts per author: 3574.0627162936016
    Mean of lexical diversity per author: 0.4080182947663436
    Standard deviation of lexical diversity: 0.23824942152223064
Parodii
    Mean of word counts per author: 1370.8
    Standard deviation of word counts per author: 2010.2351106276099
    Mean of unique word counts per author: 474.4
    Standard deviation of unique word counts per author: 623.1856063806352
    Mean of lexical diversity per author: 0.6014235088742101
    Standard deviation of lexical diversity: 0.28881236454346115
Historici/-ae
    Mean of word counts per author: 18369.85846153846
    Standard deviation of word counts per author: 67096.250988704
    Mean of unique word counts per author: 3234.3046153846153
    Standard deviation of unique word counts per author: 8736.966947452285
    Mean of lexical diversity per author: 0.602783132461963
    Standard deviation of lexical diversity: 0.21149779951217368
Poetae Philosophi
    Mean of word counts per author: 4307.875
    Standard deviation of word counts per author: 7185.83500342365
    Mean of unique word counts per author: 1407.625
    Standard deviation of unique word counts per author: 2199.737837464144
    Mean of lexical diversity per author: 0.6119466967099234
    Standard deviation of lexical diversity: 0.29434900019244575
Atticistae
    Mean of word counts per author: 17408.25
    Standard deviation of word counts per author: 11418.418406971548
    Mean of unique word counts per author: 6447.5
    Standard deviation of unique word counts per author: 3113.0307311899983
    Mean of lexical diversity per author: 0.40594681623420636
    Standard deviation of lexical diversity: 0.07373702720652488
Lyrici/-ae
    Mean of word counts per author: 1221.2407407407406
    Standard deviation of word counts per author: 4085.5070581364143
    Mean of unique word counts per author: 727.7222222222222
    Standard deviation of unique word counts per author: 2007.501135286379
    Mean of lexical diversity per author: 0.7476551963199876
    Standard deviation of lexical diversity: 0.28102096252171194
Rhetorici
    Mean of word counts per author: 59919.942307692305
    Standard deviation of word counts per author: 139601.47564777324
    Mean of unique word counts per author: 8529.788461538461
    Standard deviation of unique word counts per author: 14684.76227495771
    Mean of lexical diversity per author: 0.40280602805586774
    Standard deviation of lexical diversity: 0.23582844963950045
Scriptores Fabularum
    Mean of word counts per author: 39241.5
    Standard deviation of word counts per author: 40196.89919011167
    Mean of unique word counts per author: 10083.5
    Standard deviation of unique word counts per author: 7292.3922343768645
    Mean of lexical diversity per author: 0.3403359258334396
    Standard deviation of lexical diversity: 0.16278828961999994
Epistolographi
    Mean of word counts per author: 24930.4
    Standard deviation of word counts per author: 40838.60395875452
    Mean of unique word counts per author: 5774
    Standard deviation of unique word counts per author: 7477.122608330025
    Mean of lexical diversity per author: 0.31840533079667954
    Standard deviation of lexical diversity: 0.17407256235074994
Grammatici
    Mean of word counts per author: 23759.118644067796
    Standard deviation of word counts per author: 52972.00968438824
    Mean of unique word counts per author: 5341.627118644068
    Standard deviation of unique word counts per author: 7737.724602942137
    Mean of lexical diversity per author: 0.4383668023216881
    Standard deviation of lexical diversity: 0.19517979251428155
Apologetici
    Mean of word counts per author: 14997.111111111111
    Standard deviation of word counts per author: 22442.318036716064
    Mean of unique word counts per author: 3726.3333333333335
    Standard deviation of unique word counts per author: 4017.8107222715207
    Mean of lexical diversity per author: 0.4362444959064383
    Standard deviation of lexical diversity: 0.1966271315397474
Theologici
    Mean of word counts per author: 337317.2272727273
    Standard deviation of word counts per author: 514073.7187353332
    Mean of unique word counts per author: 31959.863636363636
    Standard deviation of unique word counts per author: 33977.534250478275
    Mean of lexical diversity per author: 0.2056865016124094
    Standard deviation of lexical diversity: 0.12839293187456263
Polyhistorici
    Mean of word counts per author: 11359
    Standard deviation of word counts per author: 0
    Mean of unique word counts per author: 3939
    Standard deviation of unique word counts per author: 0
    Mean of lexical diversity per author: 0.34677348358130117
    Standard deviation of lexical diversity: 0
Astronomici
    Mean of word counts per author: 17452.272727272728
    Standard deviation of word counts per author: 14411.656067856387
    Mean of unique word counts per author: 2519.909090909091
    Standard deviation of unique word counts per author: 1804.452573748917
    Mean of lexical diversity per author: 0.15605168533359925
    Standard deviation of lexical diversity: 0.09962395743920886
Gnomici
    Mean of word counts per author: 994.6666666666666
    Standard deviation of word counts per author: 607.3963560421921
    Mean of unique word counts per author: 633.3333333333334
    Standard deviation of unique word counts per author: 445.1767439268737
    Mean of lexical diversity per author: 0.6072317526129395
    Standard deviation of lexical diversity: 0.07395986512270739
Bucolici
    Mean of word counts per author: 9096
    Standard deviation of word counts per author: 10974.216555180603
    Mean of unique word counts per author: 4239.666666666667
    Standard deviation of unique word counts per author: 4614.311685759138
    Mean of lexical diversity per author: 0.5318337561964496
    Standard deviation of lexical diversity: 0.08261723685478253
Geographi
    Mean of word counts per author: 38415.454545454544
    Standard deviation of word counts per author: 86587.8937639248
    Mean of unique word counts per author: 6882.454545454545
    Standard deviation of unique word counts per author: 13451.501279512531
    Mean of lexical diversity per author: 0.46230688215853827
    Standard deviation of lexical diversity: 0.2618199117561238
Paroemiographi
    Mean of word counts per author: 22930.6
    Standard deviation of word counts per author: 17919.046244150384
    Mean of unique word counts per author: 8053
    Standard deviation of unique word counts per author: 5665.462734852291
    Mean of lexical diversity per author: 0.36520992842041383
    Standard deviation of lexical diversity: 0.0664398410604052
Mathematici
    Mean of word counts per author: 100291.22222222222
    Standard deviation of word counts per author: 122502.94839796488
    Mean of unique word counts per author: 7417.222222222223
    Standard deviation of unique word counts per author: 9056.485214720136
    Mean of lexical diversity per author: 0.23653613754097272
    Standard deviation of lexical diversity: 0.25607419422288413
None
    Mean of word counts per author: 34036.12574850299
    Standard deviation of word counts per author: 108466.06341998396
    Mean of unique word counts per author: 5970.086826347306
    Standard deviation of unique word counts per author: 14189.938299315627
    Mean of lexical diversity per author: 0.5262500789551204
    Standard deviation of lexical diversity: 0.25427357430911485
Sophistae
    Mean of word counts per author: 47278.12903225807
    Standard deviation of word counts per author: 92375.2689579276
    Mean of unique word counts per author: 9728.774193548386
    Standard deviation of unique word counts per author: 15342.648829780941
    Mean of lexical diversity per author: 0.42123188222717806
    Standard deviation of lexical diversity: 0.18189319925611835
Lexicographi
    Mean of word counts per author: 265077.25
    Standard deviation of word counts per author: 235698.55655105316
    Mean of unique word counts per author: 65895.75
    Standard deviation of unique word counts per author: 46740.19347681393
    Mean of lexical diversity per author: 0.45092513391608313
    Standard deviation of lexical diversity: 0.34822708023376175
Choliambographi
    Mean of word counts per author: 19
    Standard deviation of word counts per author: 0
    Mean of unique word counts per author: 19
    Standard deviation of unique word counts per author: 0
    Mean of lexical diversity per author: 1.0
    Standard deviation of lexical diversity: 0

In [25]:
# sort epithets by lexical diversity
sorted(epithet_lexical_diversity_tuples, key=lambda x: x[1], reverse=True)


Out[25]:
[('Choliambographi', 1.0),
 ('Epigrammatici/-ae', 0.8487669388459786),
 ('Elegiaci', 0.7829407154009786),
 ('Iambici', 0.7662045014476172),
 ('Lyrici/-ae', 0.7476551963199876),
 ('Poetae Didactici', 0.7354838709677419),
 ('Poetae Medici', 0.7121893719806763),
 ('Mimographi', 0.6253901209264193),
 ('Tragici', 0.6186934024311059),
 ('Gnostici', 0.614900854930397),
 ('Poetae Philosophi', 0.6119466967099234),
 ('Gnomici', 0.6072317526129395),
 ('Historici/-ae', 0.602783132461963),
 ('Parodii', 0.6014235088742101),
 ('Epici/-ae', 0.5901755381170039),
 ('Poetae', 0.5715269592682862),
 ('Paradoxographi', 0.5526884575900334),
 ('Bucolici', 0.5318337561964496),
 (None, 0.5262500789551204),
 ('Comici', 0.5163583294667708),
 ('Alchemistae', 0.4830936632872289),
 ('Philosophici/-ae', 0.4755907773618297),
 ('Geographi', 0.46230688215853827),
 ('Lexicographi', 0.45092513391608313),
 ('Mythographi', 0.44350822261989964),
 ('Grammatici', 0.4383668023216881),
 ('Apologetici', 0.4362444959064383),
 ('Periegetae', 0.42366081412770284),
 ('Sophistae', 0.42123188222717806),
 ('Musici', 0.4080182947663436),
 ('Atticistae', 0.40594681623420636),
 ('Scriptores Erotici', 0.40319032343438627),
 ('Rhetorici', 0.40280602805586774),
 ('Paroemiographi', 0.36520992842041383),
 ('Biographi', 0.3489969555756385),
 ('Polyhistorici', 0.34677348358130117),
 ('Scriptores Fabularum', 0.3403359258334396),
 ('Medici', 0.33949473725041757),
 ('Scriptores Ecclesiastici', 0.3207979204601886),
 ('Epistolographi', 0.31840533079667954),
 ('Mechanici', 0.30923266180627396),
 ('Philologi', 0.2882825748650545),
 ('Oratores', 0.28408280368872196),
 ('Tactici', 0.2811319092652864),
 ('Astrologici', 0.2667317612551159),
 ('Doxographi', 0.24332716760648826),
 ('Mathematici', 0.23653613754097272),
 ('Theologici', 0.2056865016124094),
 ('Hymnographi', 0.19699608356120124),
 ('Chronographi', 0.192453839756767),
 ('Onirocritici', 0.1598640506435638),
 ('Astronomici', 0.15605168533359925),
 ('Hagiographi', 0.14135064971075828),
 ('Geometri', 0.046396734394107485)]

In [26]:
pandas.DataFrame(epithet_lexical_diversity_tuples)


Out[26]:
0 1
0 Mechanici 0.309233
1 Mimographi 0.625390
2 Hymnographi 0.196996
3 Philologi 0.288283
4 Iambici 0.766205
5 Poetae 0.571527
6 Medici 0.339495
7 Scriptores Ecclesiastici 0.320798
8 Periegetae 0.423661
9 Paradoxographi 0.552688
10 Alchemistae 0.483094
11 Tragici 0.618693
12 Poetae Didactici 0.735484
13 Onirocritici 0.159864
14 Poetae Medici 0.712189
15 Epici/-ae 0.590176
16 Elegiaci 0.782941
17 Scriptores Erotici 0.403190
18 Geometri 0.046397
19 Philosophici/-ae 0.475591
20 Biographi 0.348997
21 Doxographi 0.243327
22 Hagiographi 0.141351
23 Tactici 0.281132
24 Astrologici 0.266732
25 Chronographi 0.192454
26 Mythographi 0.443508
27 Oratores 0.284083
28 Comici 0.516358
29 Epigrammatici/-ae 0.848767
30 Gnostici 0.614901
31 Musici 0.408018
32 Parodii 0.601424
33 Historici/-ae 0.602783
34 Poetae Philosophi 0.611947
35 Atticistae 0.405947
36 Lyrici/-ae 0.747655
37 Rhetorici 0.402806
38 Scriptores Fabularum 0.340336
39 Epistolographi 0.318405
40 Grammatici 0.438367
41 Apologetici 0.436244
42 Theologici 0.205687
43 Polyhistorici 0.346773
44 Astronomici 0.156052
45 Gnomici 0.607232
46 Bucolici 0.531834
47 Geographi 0.462307
48 Paroemiographi 0.365210
49 Mathematici 0.236536
50 None 0.526250
51 Sophistae 0.421232
52 Lexicographi 0.450925
53 Choliambographi 1.000000

In [27]:
df_epithet_scores = pandas.DataFrame(epithet_scores).T
df_epithet_scores


Out[27]:
mean_of_lexical_diversity_per_author mean_of_unique_word_counts_per_author mean_of_word_counts_ per_author standard_deviation_of_lexical_diversity standard_deviation_of_unique_word_counts_per_author standard_deviation_of_word_counts_per_author
Mimographi 0.625390 1479.000000 2752.000000 0.160951 1438.255193 3008.032247
Philologi 0.288283 29787.111111 260571.000000 0.130476 48661.146589 554261.013121
Iambici 0.766205 423.500000 610.500000 0.268881 658.157656 966.676205
Poetae 0.571527 2542.923077 8495.230769 0.350696 5088.694101 23588.759892
Lexicographi 0.450925 65895.750000 265077.250000 0.348227 46740.193477 235698.556551
Medici 0.339495 14082.105263 132242.078947 0.234760 26032.439766 414574.020743
Scriptores Ecclesiastici 0.320798 19739.913793 203310.413793 0.190431 29989.263181 587654.370329
Periegetae 0.423661 5329.444444 28730.000000 0.143773 10078.459953 71070.642007
Paradoxographi 0.552688 1278.000000 3216.444444 0.168156 1403.297545 4577.134341
Scriptores Erotici 0.403190 6774.875000 24547.375000 0.180467 6261.752001 26112.565306
Tragici 0.618693 1114.670588 4161.788235 0.421090 5230.739016 22768.237306
Poetae Didactici 0.735484 228.000000 310.000000 0.000000 0.000000 0.000000
Onirocritici 0.159864 7283.500000 38698.500000 0.060021 8166.376216 36553.885053
Poetae Medici 0.712189 374.750000 532.000000 0.168446 307.509756 397.985762
Epici/-ae 0.590176 2118.121212 8508.848485 0.338575 5666.842951 31548.935172
Hymnographi 0.196996 24999.000000 126901.000000 0.000000 0.000000 0.000000
Alchemistae 0.483094 1223.363636 4077.227273 0.173517 1752.245268 8089.548342
Geometri 0.046397 4100.750000 113808.000000 0.017134 1907.860473 97741.439022
Biographi 0.348997 18505.444444 139125.222222 0.162890 36628.988589 337874.199814
Doxographi 0.243327 4847.000000 20857.000000 0.042671 1711.198410 10690.040318
Bucolici 0.531834 4239.666667 9096.000000 0.082617 4614.311686 10974.216555
Hagiographi 0.141351 81563.000000 577026.000000 0.000000 0.000000 0.000000
Tactici 0.281132 4335.166667 17298.000000 0.058981 2317.036333 14257.208254
Mathematici 0.236536 7417.222222 100291.222222 0.256074 9056.485215 122502.948398
Chronographi 0.192454 26193.166667 167315.333333 0.071100 16527.893289 128769.993153
Mythographi 0.443508 2916.000000 8067.428571 0.104100 2726.469206 9161.463363
Comici 0.516358 596.108108 2135.952703 0.227121 2589.913029 11381.961101
Epigrammatici/-ae 0.848767 762.565217 2778.913043 0.171386 2912.872900 12372.339754
Polyhistorici 0.346773 3939.000000 11359.000000 0.000000 0.000000 0.000000
Gnostici 0.614901 431.500000 1032.000000 0.287483 572.049386 1412.799349
Musici 0.408018 3010.600000 12335.200000 0.238249 3574.062716 14978.744230
Parodii 0.601424 474.400000 1370.800000 0.288812 623.185606 2010.235111
Geographi 0.462307 6882.454545 38415.454545 0.261820 13451.501280 86587.893764
Paroemiographi 0.365210 8053.000000 22930.600000 0.066440 5665.462735 17919.046244
Historici/-ae 0.602783 3234.304615 18369.858462 0.211498 8736.966947 67096.250989
Poetae Philosophi 0.611947 1407.625000 4307.875000 0.294349 2199.737837 7185.835003
Atticistae 0.405947 6447.500000 17408.250000 0.073737 3113.030731 11418.418407
Lyrici/-ae 0.747655 727.722222 1221.240741 0.281021 2007.501135 4085.507058
Rhetorici 0.402806 8529.788462 59919.942308 0.235828 14684.762275 139601.475648
Epistolographi 0.318405 5774.000000 24930.400000 0.174073 7477.122608 40838.603959
Grammatici 0.438367 5341.627119 23759.118644 0.195180 7737.724603 52972.009684
Apologetici 0.436244 3726.333333 14997.111111 0.196627 4017.810722 22442.318037
Theologici 0.205687 31959.863636 337317.227273 0.128393 33977.534250 514073.718735
Philosophici/-ae 0.475591 5046.333333 51075.875556 0.244033 11373.068975 179693.374402
Astronomici 0.156052 2519.909091 17452.272727 0.099624 1804.452574 14411.656068
Gnomici 0.607232 633.333333 994.666667 0.073960 445.176744 607.396356
Scriptores Fabularum 0.340336 10083.500000 39241.500000 0.162788 7292.392234 40196.899190
Oratores 0.284083 8986.250000 54713.083333 0.143063 8894.274370 83232.854015
Elegiaci 0.782941 300.481481 601.370370 0.287413 766.572510 1892.152969
Astrologici 0.266732 6993.571429 40658.071429 0.115121 6760.877614 49684.965436
NaN 0.526250 5970.086826 34036.125749 0.254274 14189.938299 108466.063420
Sophistae 0.421232 9728.774194 47278.129032 0.181893 15342.648830 92375.268958
Mechanici 0.309233 3668.428571 27850.714286 0.119089 4857.279000 58272.680479
Choliambographi 1.000000 19.000000 19.000000 0.000000 0.000000 0.000000

In [28]:
df_epithet_scores.to_csv(os.path.expanduser('~/cltk_data/user_data/stats_epithet.csv'))

In [ ]: