This notebook illustrates the CLTK's sentence tokenizer for the Latin language. First, I offer the code with which I tokenized and counted sentences and words in the PHI5. Next, the notebook offers a few simple views of this data, organized globally and for specific genres.
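Before the full script, here is a minimal sketch of the tokenizer call itself, using the same import as the script below. It assumes the CLTK's Latin sentence-tokenizer models are already installed, and that the function returns a list of sentence strings (which the script below also relies on); the sample text is my own, not PHI5 data.

from cltk.tokenize.sentence_tokenizer_latin import tokenize_latin_sentences

# Toy example; the return value is assumed to be a list of sentence strings.
sample = 'Gallia est omnis divisa in partes tres. Quarum unam incolunt Belgae.'
sentences = tokenize_latin_sentences(sample)
print(len(sentences))  # expected: 2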
This first step tokenizes words and sentences, then computes the average number of words per sentence for each author on the PHI5 disk. The following code runs in under 5 minutes (download here). This script generates a file called phi5_auth_sent_data_v3.txt (download).
"""For computing sentence length data for PHI5 authors."""
import ast
from cltk.tokenize.sentence_tokenizer_latin import tokenize_latin_sentences
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import os
import re
# PHI5 parameters
PHI5_DIR = os.path.join('~/cltk_data', 'compiled', 'phi5')
PHI5_INDEX_REL = os.path.join(PHI5_DIR, 'index_author_works.txt')
# Local write parameters
WRITE_FILE_NAME = 'phi5_auth_sent_data_v3.txt'
WRITE_DIR_LOCATION = '~/Downloads'
class AvgSents(object):
"""Contains methods for computing average sentence length."""
def open_file(self, abs_path):
"""Open any file with an absolute path."""
try:
with open(abs_path) as file:
read_file = file.read()
return read_file
except:
pass
def clean_text(self, input_text):
"""Clean text.
TODO: rm '\x00'
"""
output_text = re.sub(r'\{\d.+?\}\d|\d+?|\{.+?\}|\.{3}|@|\?|!|`|&|%|\$|\#|\+|-|—|\[|\]|<|>|\\\.|!|\.\s\.|\.\.|p\.\s|,\s,|\\|\*', '', input_text)
return output_text
def phi5_author_path(self, author, index_dict):
"""Build absolute path to PHI5 author file."""
print('Processing author:', author)
phi5_file = index_dict[author]['phi5_file']
phi5_file_txt = os.path.join(phi5_file + '.txt')
phi5_file_rel = os.path.join(PHI5_DIR, phi5_file_txt)
phi5_file_abs = os.path.expanduser(phi5_file_rel)
return phi5_file_abs
def count_tokenize_sents(self, author_read):
"""Tokenize and count sentences in a string."""
try:
sents = tokenize_latin_sentences(author_read)
total_sents = len(sents)
tally_dict = self.tally_of_sentence_lengths(sents)
except:
pass
print('Total sentences:', total_sents)
print('Tally of sentence lengths:', tally_dict)
return total_sents, tally_dict
def write_dict(self, auth_s_w_dict):
"""Write author sentence word data into dict in a file."""
write_rel_path = os.path.join(WRITE_DIR_LOCATION, WRITE_FILE_NAME)
write_abs_path = os.path.expanduser(write_rel_path)
try:
with open(write_abs_path, 'w') as file:
try:
file.write(str(auth_s_w_dict))
except:
pass
except:
pass
def count_tokenize_words(self, author_read):
"""Tokenize and count words in a string."""
tokenizer = RegexpTokenizer('\s+', gaps=True)
try:
words = tokenizer.tokenize(author_read)
return len(words)
except:
pass
def tally_of_sentence_lengths(self, sents_list):
"""Count the total occurrences of each sentence length. I.e.,
{1: 4, 2: 3, 3: 7, 8:12, ... 19: 400, 20: 379, 21: 433, ... },
as in: {'number of words in a sentence': 'number of times this number
of words occurs in an author'}
"""
tally_list = []
for sentence in sents_list:
word_tokenizer = RegexpTokenizer('\s+', gaps=True)
try:
sentence_words = word_tokenizer.tokenize(sentence)
sentence_word_length = len(sentence_words)
tally_list.append(sentence_word_length)
except:
pass
tally_counter = Counter(tally_list)
tally_dict = dict(tally_counter)
return tally_dict
def main():
"""Main function"""
avg = AvgSents()
phi5_index_abs = os.path.expanduser(PHI5_INDEX_REL)
index_read = avg.open_file(phi5_index_abs)
index_dict = ast.literal_eval(index_read)
auth_s_w_dict = {}
for author in index_dict:
phi5_file_abs = avg.phi5_author_path(author, index_dict)
author_read = avg.open_file(phi5_file_abs)
avg.clean_text(author_read)
try:
sent_count, tally_dict = avg.count_tokenize_sents(author_read)
except:
sent_count = 0
try:
word_count = avg.count_tokenize_words(author_read)
except:
word_count = 0
try:
avg_words_per_sent = word_count / sent_count
except:
avg_words_per_sent = 0
counts = {'sent_count': sent_count,
'word_count': word_count,
'avg_words_per_sent': avg_words_per_sent,
'tally_of_sent_word_lengths': tally_dict}
auth_s_w_dict[author] = counts
avg.write_dict(auth_s_w_dict)
if __name__ == "__main__":
main()
Next, I offer some simple views of the file phi5_auth_sent_data_v3.txt using the data analysis library Pandas.
In [14]:
import ast
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
# display enough rows for all PHI5 authors
pd.set_option('display.max_rows', 10000)
In [15]:
file_name = 'phi5_auth_sent_data_v3.txt'
dir_location = '~/Downloads'
rel_path = os.path.join(dir_location, file_name)
abs_path = os.path.expanduser(rel_path)
with open(abs_path) as f:
    r = f.read()
# the file holds a dict literal written by the script above
d = ast.literal_eval(r)
In [16]:
# generate the Pandas DataFrame (metrics as rows, authors as columns)
df = pd.DataFrame(d)
# drop the per-length tally row from these views
df = df.drop('tally_of_sent_word_lengths')
Outliers are apparent at both the low and high ends (fragmentary texts with odd formatting). Filtering out authors with fewer than, say, 1,000 words would return better results. I am leaving these outliers in, however, because they will help me improve the CLTK's text cleaner and sentence tokenizer.
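For what it is worth, such a filter is a one-liner against the transposed DataFrame; the following is a sketch, where 'word_count' is the key written by the script above and the 1,000-word threshold is the arbitrary cutoff just mentioned:

# Sketch: keep only authors with at least 1,000 words
df_filtered = df.T[df.T['word_count'] >= 1000]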
In [17]:
df.T.sort_values('avg_words_per_sent', ascending=True)
Out[17]:
In [18]:
authors = list(df.columns.values)
dict_gen = {}
historians = [
    'Titus Livius; Livy', 'Gaius Suetonius Tranquillus', 'Cornelius Tacitus',
    'Valerius Maximus', 'Gaius Iulius Caesar; Caesar', 'Valerius Antias',
    'Lucius Coelius Antipater', 'Sempronius Asellio', 'Gaius Asinius Pollio',
    'Gavius Bassus', 'Lucius Calpurnius Piso Frugi', 'Marcus Porcius Cato; Cato',
    'Lucius Cincius Alimentus', 'Claudius Caesar Germanicus',
    'Quintus Claudius Quadrigarius', 'Lucius Herennius Balbus',
    'Quintus Curtius Rufus', 'Annius Florus', 'Gnaeus Gellius',
    'Granius Licinianus', 'Titus Labienus', 'Gaius Licinius Mucianus',
    'Quintus Asconius Pedianus', 'Fabius Pictor', 'Pompeius Trogus',
    'Gaius Sallustius Crispus', 'Lucius Annaeus Seneca senior',
    'Silius Italicus', 'Lucius Cornelius Sisenna', 'Velleius Paterculus']
for author in authors:
    if author in historians:
        dict_gen[author] = d[author]
df_hist = pd.DataFrame(dict_gen)
df_hist = df_hist.drop('tally_of_sent_word_lengths')
df_hist.T.sort_values('avg_words_per_sent', ascending=True)
Out[18]:
Saving the DataFrame as comma-separated values may be useful for other, non-Python programs (download here).
In [19]:
df.T.to_csv('phi5_table_data.csv')
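As a quick, optional sanity check, the exported CSV can be read back into Pandas to confirm the table round-trips:

# optional check: index_col=0 restores author names as the row index
pd.read_csv('phi5_table_data.csv', index_col=0).head()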