In [1]:
import datetime as dt
import os
import time
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
import pandas
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
In [26]:
def stream_lemmatized_files(corpus_dir):
    # yield (file id, text) for every doc in the dir
    user_dir = os.path.expanduser('~/cltk_data/user_data/' + corpus_dir)
    files = os.listdir(user_dir)
    for file in files:
        filepath = os.path.join(user_dir, file)
        with open(filepath) as fo:
            # TODO: rm words less than 3 chars long
            # file[3:-4] strips the 3-char prefix and the extension, leaving the TLG id
            yield file[3:-4], fo.read()
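A quick sanity check of the generator, assuming the lemmatized corpus directory already exists under ~/cltk_data/user_data/:

# peek at the first (id, text) pair without reading the whole corpus
first_id, first_text = next(stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'))
print(first_id, len(first_text.split()), 'words')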
In [3]:
t0 = dt.datetime.utcnow()
map_id_author = get_id_author()
df = pandas.DataFrame(columns=['id', 'author', 'text', 'epithet'])
for _id, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'):
    author = map_id_author[_id]
    epithet = get_epithet_of_author(_id)
    df = df.append({'id': _id, 'author': author, 'text': text, 'epithet': epithet}, ignore_index=True)
print(df.shape)
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
print('Number of texts:', len(df))
In [4]:
text_list = df['text'].tolist()
# make a list of indices of short texts to drop (under ~500 chars, i.e. ~100 words)
# For pres, get distributions of words per doc (see the sketch below)
short_text_drop_index = [index for index, text in enumerate(text_list) if len(text) < 500]
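For the presentation note above, a minimal sketch of the words-per-document distribution, computed from the text_list already in memory (word_counts is a hypothetical name):

# rough whitespace word counts per doc; describe() gives min/median/max for the slides
word_counts = pandas.Series([len(text.split()) for text in text_list])
print(word_counts.describe())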
In [5]:
t0 = dt.datetime.utcnow()
# TODO: Consider using a generator to feed CountVectorizer (see the streaming sketch after this cell): http://stackoverflow.com/a/21600406
# time & size counts, w/ 50 texts:
# 0:01:15 & 202MB @ ngram_range=(1, 3), min_df=2, max_features=500
# 0:00:26 & 80MB @ ngram_range=(1, 2), analyzer='word', min_df=2, max_features=5000
# 0:00:24 & 81MB @ ngram_range=(1, 2), analyzer='word', min_df=2, max_features=50000
# time & size counts, w/ 1823 texts:
# 0:02:18 & 46MB @ ngram_range=(1, 1), analyzer='word', min_df=2, max_features=500000
# 0:02:01 & 47MB @ ngram_range=(1, 1), analyzer='word', min_df=2, max_features=1000000
# max features in the lemmatized data set: 551428
max_features = 100000
ngrams = 1
vectorizer = CountVectorizer(ngram_range=(1, ngrams), analyzer='word',
                             min_df=2, max_features=max_features)
term_document_matrix = vectorizer.fit_transform(text_list) # input is a list of strings, 1 per document
# save matrix
vector_fp = os.path.expanduser('~/cltk_data/user_data/vectorizer_test_features{0}_ngrams{1}.pickle'.format(max_features, ngrams))
joblib.dump(term_document_matrix, vector_fp)
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
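On the TODO above: CountVectorizer.fit_transform accepts any iterable of strings, so the documents could be streamed straight from the generator instead of materializing text_list first. A minimal sketch (streamed_texts is a hypothetical name; the generator is single-use, so it must be recreated for any second pass):

# hypothetical streaming variant: each document is read lazily from disk
streamed_texts = (text for _id, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'))
term_document_matrix = vectorizer.fit_transform(streamed_texts)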
In [6]:
# Put BoW vectors into a new df
term_document_matrix = joblib.load(vector_fp) # scipy.sparse.csr.csr_matrix
In [7]:
term_document_matrix.shape
Out[7]:
In [8]:
term_document_matrix_array = term_document_matrix.toarray()
In [9]:
dataframe_bow = pandas.DataFrame(term_document_matrix_array, columns=vectorizer.get_feature_names())
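toarray() densifies the whole matrix, which is the memory-hungry step. With a newer pandas (0.25+) the counts could stay sparse; a sketch under that assumption (dataframe_bow_sparse is a hypothetical name):

# keep the term counts in a sparse-backed DataFrame instead of calling toarray()
dataframe_bow_sparse = pandas.DataFrame.sparse.from_spmatrix(
    term_document_matrix, columns=vectorizer.get_feature_names())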
In [10]:
ids_list = df['id'].tolist()
In [11]:
len(ids_list)
Out[11]:
In [12]:
dataframe_bow.shape
Out[12]:
In [13]:
dataframe_bow['id'] = ids_list
In [14]:
authors_list = df['author'].tolist()
dataframe_bow['author'] = authors_list
In [15]:
epithets_list = df['epithet'].tolist()
dataframe_bow['epithet'] = epithets_list
In [16]:
# For pres, give distribution of epithets, including None (see the value_counts sketch below)
dataframe_bow['epithet']
Out[16]:
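For the presentation note above, the epithet distribution, including texts with no epithet, can be tallied with value_counts:

# counts per epithet; dropna=False keeps the None/NaN rows in the tally
print(dataframe_bow['epithet'].value_counts(dropna=False))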
In [21]:
t0 = dt.datetime.utcnow()
# drop rows whose epithet is None (removes 334 rows)
# note on selecting None in pandas: http://stackoverflow.com/a/24489602
dataframe_bow = dataframe_bow[dataframe_bow.epithet.notnull()]
dataframe_bow.shape
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
In [22]:
t0 = dt.datetime.utcnow()
dataframe_bow.to_csv(os.path.expanduser('~/cltk_data/user_data/tlg_bow.csv'))
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
In [23]:
dataframe_bow.shape
Out[23]:
In [24]:
dataframe_bow.head(10)
Out[24]:
In [25]:
# write dataframe_bow to disk, for fast reuse while classifying
# 2.3G
fp_df = os.path.expanduser('~/cltk_data/user_data/tlg_bow_df.pickle')
joblib.dump(dataframe_bow, fp_df)
Out[25]:
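To pick the work back up in the classification step, the pickled DataFrame can be reloaded with the same joblib used to dump it; a minimal sketch:

# reload the BoW DataFrame for classification
dataframe_bow = joblib.load(os.path.expanduser('~/cltk_data/user_data/tlg_bow_df.pickle'))
print(dataframe_bow.shape)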