In [15]:
# Feature table

In [1]:
import os
import re
import time

from cltk.corpus.utils.formatter import assemble_phi5_works_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [2]:
def works_texts_list():
    """Load every PHI5 work file and return the cleaned texts as a list.

    File paths come from ``assemble_phi5_works_filepaths()``. Each file is
    read in full, passed through ``phi5_plaintext_cleanup`` with
    ``rm_punctuation=True`` and ``rm_periods=True``, and then any remaining
    ``{...}`` spans are removed.

    Returns:
        list of str: one cleaned text per file path, in the order the paths
        were returned.
    """
    # Non-greedy match of a brace-delimited span; '.' does not cross newlines.
    braces_pattern = re.compile(r'{.+?}')

    def _load_clean(path):
        # Read one work file and return its cleaned, brace-free text.
        with open(path) as handle:
            raw = handle.read()
        cleaned = phi5_plaintext_cleanup(raw, rm_punctuation=True, rm_periods=True)
        return braces_pattern.sub('', cleaned)

    return [_load_clean(path) for path in assemble_phi5_works_filepaths()]

In [3]:
# Build the corpus once and keep it in the notebook-level name `text_list`;
# bow_csv() and tfidf_csv() read this name directly instead of taking it as
# an argument, so this cell must run before either of them is called.
t0 = time.time()
text_list = works_texts_list()
print('Total texts', len(text_list))
# Report wall-clock seconds spent reading and cleaning the files.
print('Time to build list of texts: {}'.format(time.time() - t0))


Time to build list of texts: 76.1740050315857

In [5]:
# bag of words/word count
def bow_csv():
    """Build a bag-of-words (raw word count) table and write it to CSV.

    Fits a ``CountVectorizer(min_df=1)`` on the notebook-level ``text_list``
    and converts the resulting term-document matrix to a dense DataFrame
    with one row per text and one column per vocabulary term. Column names
    are the terms prefixed with ``wc_``. The table is written to
    ``~/cltk_data/user_data/bow_latin.csv``.

    Prints the DataFrame shape and the elapsed wall-clock time.

    Returns:
        None
    """
    t0 = time.time()
    vectorizer = CountVectorizer(min_df=1)

    # Fit before asking for feature names: the vocabulary only exists after
    # the vectorizer has seen the texts, and querying it on an unfitted
    # vectorizer raises NotFittedError.
    term_document_matrix = vectorizer.fit_transform(text_list)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement when present, fall back on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    column_names = ['wc_' + w for w in feature_names]

    # NOTE: toarray() densifies the matrix, so memory grows with
    # (number of texts) x (vocabulary size).
    dataframe_bow = pd.DataFrame(term_document_matrix.toarray(), columns=column_names)
    print('DF BOW shape', dataframe_bow.shape)

    fp = os.path.expanduser('~/cltk_data/user_data/bow_latin.csv')
    # Make sure the target directory exists so to_csv does not fail on a
    # machine where it has not been created yet.
    os.makedirs(os.path.dirname(fp), exist_ok=True)
    dataframe_bow.to_csv(fp)
    print('Time to create BOW vectorizer and write csv: {}'.format(time.time() - t0))

In [7]:
# tf-idf
def tfidf_csv():
    """Build a tf-idf table and write it to CSV.

    Fits a ``TfidfVectorizer(min_df=1)`` on the notebook-level ``text_list``
    and converts the resulting term-document matrix to a dense DataFrame
    with one row per text and one column per vocabulary term. Column names
    are the terms prefixed with ``tfidf_``. The table is written to
    ``~/cltk_data/user_data/tfidf_latin.csv``.

    Prints the DataFrame shape and the elapsed wall-clock time.

    Returns:
        None
    """
    t0 = time.time()
    vectorizer = TfidfVectorizer(min_df=1)

    # Fit before asking for feature names: the vocabulary only exists after
    # the vectorizer has seen the texts, and querying it on an unfitted
    # vectorizer raises NotFittedError.
    term_document_matrix = vectorizer.fit_transform(text_list)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement when present, fall back on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    column_names = ['tfidf_' + w for w in feature_names]

    # NOTE: toarray() densifies the matrix, so memory grows with
    # (number of texts) x (vocabulary size).
    dataframe_tfidf = pd.DataFrame(term_document_matrix.toarray(), columns=column_names)
    print('DF tf-idf shape', dataframe_tfidf.shape)

    fp = os.path.expanduser('~/cltk_data/user_data/tfidf_latin.csv')
    # Make sure the target directory exists so to_csv does not fail on a
    # machine where it has not been created yet.
    os.makedirs(os.path.dirname(fp), exist_ok=True)
    dataframe_tfidf.to_csv(fp)
    print('Time to create tf-idf vectorizer and write csv: {}'.format(time.time() - t0))

In [6]:
# Write the word-count table to ~/cltk_data/user_data/bow_latin.csv.
# Requires `text_list` to have been built by an earlier cell.
bow_csv()

In [8]:
# Write the tf-idf table to ~/cltk_data/user_data/tfidf_latin.csv.
# Requires `text_list` to have been built by an earlier cell.
tfidf_csv()


DF tf-idf shape (836, 318869)
Time to create tf-idf vectorizer and write csv: 827.4702050685883

In [10]:
# Fit a TF-IDF vectorizer on the corpus at notebook level so the fitted
# `vectorizer` and its `term_document_matrix` stay available to later cells.
# NOTE(review): this repeats the fit that tfidf_csv() performs internally
# with the same settings and the same `text_list`.
vectorizer = TfidfVectorizer(min_df=1)
term_document_matrix = vectorizer.fit_transform(text_list)

In [ ]: