In [15]:
# Feature table
In [1]:
import os
import re
import time
from cltk.corpus.utils.formatter import assemble_phi5_works_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
In [2]:
def works_texts_list():
    """Return the cleaned plaintext of every PHI5 work as a list of strings.

    Each path yielded by ``assemble_phi5_works_filepaths()`` is read in full,
    passed through ``phi5_plaintext_cleanup`` with punctuation and periods
    removed, and then stripped of any ``{...}`` spans. The match is
    non-greedy and, because ``.`` does not match a newline, confined to a
    single line.

    Returns:
        list of str: one cleaned text per work, in the order the file paths
        were returned.
    """
    braces_re = re.compile(r'{.+?}')

    def _load_clean(path):
        # Read the whole work, then apply the CLTK cleanup followed by the
        # curly-brace removal.
        with open(path) as handle:
            raw = handle.read()
        cleaned = phi5_plaintext_cleanup(raw, rm_punctuation=True, rm_periods=True)
        return braces_re.sub('', cleaned)

    return [_load_clean(path) for path in assemble_phi5_works_filepaths()]
In [3]:
# Load the whole cleaned corpus once into the module-level `text_list`;
# bow_csv() and tfidf_csv() read this global rather than taking an argument.
t0 = time.time()
text_list = works_texts_list()
print('Total texts', len(text_list))
# Elapsed wall-clock seconds since t0 (includes the print above).
print('Time to build list of texts: {}'.format(time.time() - t0))
In [5]:
# bag of words/word count
def bow_csv():
    """Build a bag-of-words (raw term count) table from ``text_list`` and save it as CSV.

    Fits a ``CountVectorizer`` on the module-level ``text_list``, converts the
    resulting term-document matrix to a dense ``DataFrame`` whose columns are
    the vocabulary terms prefixed with ``wc_``, and writes it to
    ``~/cltk_data/user_data/bow_latin.csv``. The frame's shape and the elapsed
    wall-clock time are printed.

    Returns:
        None
    """
    t0 = time.time()
    vectorizer = CountVectorizer(min_df=1)

    # Fit before asking for feature names: the vocabulary does not exist until
    # the vectorizer has seen the corpus, and querying it earlier raises
    # NotFittedError.
    term_document_matrix = vectorizer.fit_transform(text_list)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when available and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    column_names = ['wc_' + w for w in feature_names]

    # toarray() materialises the full dense matrix (documents x vocabulary).
    dataframe_bow = pd.DataFrame(term_document_matrix.toarray(), columns=column_names)
    print('DF BOW shape', dataframe_bow.shape)

    fp = os.path.expanduser('~/cltk_data/user_data/bow_latin.csv')
    # Make sure the target directory exists so to_csv() does not fail on a
    # fresh machine.
    os.makedirs(os.path.dirname(fp), exist_ok=True)
    dataframe_bow.to_csv(fp)
    print('Time to create BOW vectorizer and write csv: {}'.format(time.time() - t0))
In [7]:
# tf-idf
def tfidf_csv():
    """Build a tf-idf table from ``text_list`` and save it as CSV.

    Fits a ``TfidfVectorizer`` on the module-level ``text_list``, converts the
    resulting term-document matrix to a dense ``DataFrame`` whose columns are
    the vocabulary terms prefixed with ``tfidf_``, and writes it to
    ``~/cltk_data/user_data/tfidf_latin.csv``. The frame's shape and the
    elapsed wall-clock time are printed.

    Returns:
        None
    """
    t0 = time.time()
    vectorizer = TfidfVectorizer(min_df=1)

    # Fit before asking for feature names: the vocabulary does not exist until
    # the vectorizer has seen the corpus, and querying it earlier raises
    # NotFittedError.
    term_document_matrix = vectorizer.fit_transform(text_list)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when available and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    column_names = ['tfidf_' + w for w in feature_names]

    # toarray() materialises the full dense matrix (documents x vocabulary).
    dataframe_tfidf = pd.DataFrame(term_document_matrix.toarray(), columns=column_names)
    print('DF tf-idf shape', dataframe_tfidf.shape)

    fp = os.path.expanduser('~/cltk_data/user_data/tfidf_latin.csv')
    # Make sure the target directory exists so to_csv() does not fail on a
    # fresh machine.
    os.makedirs(os.path.dirname(fp), exist_ok=True)
    dataframe_tfidf.to_csv(fp)
    print('Time to create tf-idf vectorizer and write csv: {}'.format(time.time() - t0))
In [6]:
# Run the bag-of-words export; the target file is ~/cltk_data/user_data/bow_latin.csv.
bow_csv()
In [8]:
# Run the tf-idf export; the target file is ~/cltk_data/user_data/tfidf_latin.csv.
tfidf_csv()
In [10]:
# Fit a tf-idf vectorizer at module level (not inside a function), so both the
# fitted `vectorizer` and `term_document_matrix` stay bound as globals --
# presumably for inspection in later cells; confirm before removing.
vectorizer = TfidfVectorizer(min_df=1)
term_document_matrix = vectorizer.fit_transform(text_list)
In [ ]: