In [11]:
import datetime as dt
import os
import time
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_index
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets
from cltk.corpus.greek.tlg.parse_tlg_indices import select_authors_by_epithet
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.tokenize.word import nltk_tokenize_words
from greek_accentuation.characters import base
import pandas # pip install pandas
from sklearn import metrics
from sklearn.cluster import AffinityPropagation
from sklearn.externals import joblib  # deprecated; scikit-learn >= 0.23 requires 'import joblib' instead
from sklearn.feature_extraction.text import TfidfVectorizer
In [12]:
# Try dropping the following less-represented epithets (list taken from 3.1b)
to_drop = {'Apologetici',
           'Astrologici',
           'Astronomici',
           'Atticistae',
           'Biographi',
           'Bucolici',
           'Choliambographi',
           'Chronographi',
           'Doxographi',
           'Epigrammatici/-ae',
           'Epistolographi',
           'Geographi',
           'Geometri',
           'Gnomici',
           'Gnostici',
           'Hagiographi',
           'Hymnographi',
           'Iambici',
           'Lexicographi',
           'Mathematici',
           'Mechanici',
           'Mimographi',
           'Musici',
           'Mythographi',
           'Nomographi',
           'Onirocritici',
           'Oratores',
           'Paradoxographi',
           'Parodii',
           'Paroemiographi',
           'Periegetae',
           'Philologi',
           'Poetae Didactici',
           'Poetae Medici',
           'Poetae Philosophi',
           'Polyhistorici',
           'Scriptores Erotici',
           'Scriptores Fabularum',
           'Scriptores Rerum Naturalium',
           'Tactici'}
In [13]:
def stream_lemmatized_files(corpus_dir, reject_none_epithet=True, reject_chars_less_than=None, reject_epithets=None):
    """Yield (file id, author, text) for every doc in a dir, with options to
    skip authors that have no epithet, authors whose epithet is in a reject
    set, and texts shorter than a minimum character count."""
    user_dir = os.path.expanduser('~/cltk_data/user_data/' + corpus_dir)
    files = os.listdir(user_dir)
    map_id_author = get_id_author()
    for file in files:
        filepath = os.path.join(user_dir, file)
        file_id = file[3:-4]  # strip the 'tlg' prefix and '.txt' suffix
        author = map_id_author[file_id]
        # look up the epithet once; both filters below use it
        author_epithet = get_epithet_of_author(file_id)
        if reject_none_epithet and not author_epithet:
            continue
        if reject_epithets and author_epithet in reject_epithets:
            continue
        with open(filepath) as fo:
            text = fo.read()
        if reject_chars_less_than and len(text) < reject_chars_less_than:
            continue
        yield file_id, author, text
In [14]:
t0 = dt.datetime.utcnow()
id_author_text_list = []
for tlg_id, author, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops',
                                                    reject_none_epithet=True,
                                                    reject_chars_less_than=500,
                                                    reject_epithets=to_drop):
    id_author_text_list.append((tlg_id, author, text))
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
print('Number of texts:', len(id_author_text_list))
In [43]:
t0 = dt.datetime.utcnow()
# tf-idf features
n_samples = 2000
n_features = 2000 # TODO: increase
n_topics = len(get_epithets()) # 55
n_top_words = 20
tfidf_vectorizer = TfidfVectorizer(max_df=1.0,
                                   min_df=1,
                                   max_features=n_features,
                                   stop_words=None)
texts_list = [t[2] for t in id_author_text_list]
tfidf_matrix = tfidf_vectorizer.fit_transform(texts_list)
# save features
vector_fp = os.path.expanduser('~/cltk_data/user_data/tfidf_{0}features.pickle'.format(n_features))
joblib.dump(tfidf_matrix, vector_fp)
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
# time on good server:
# 1000 features: 0:01:22
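The pickled matrix can be reloaded in a later session; a minimal sketch, assuming the same (deprecated) `sklearn.externals.joblib` import as above:

In [ ]:
# Sketch: reload the saved tf-idf matrix from disk.
# vector_fp is the path written by the cell above.
tfidf_matrix = joblib.load(vector_fp)
print('Loaded matrix of shape', tfidf_matrix.shape)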
In [44]:
af = AffinityPropagation(damping=0.5).fit(tfidf_matrix) # defaults: damping=0.5, preference=None
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
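A quick way to gauge cluster quality is the silhouette coefficient; a minimal sketch using the `metrics` module imported above, with cosine distance as a reasonable choice for tf-idf rows (values near 1 mean tight, well-separated clusters):

In [ ]:
# Sketch: score the clustering with the silhouette coefficient.
# Requires at least 2 clusters; works directly on the sparse matrix.
if n_clusters_ > 1:
    print('Silhouette coefficient: %0.3f'
          % metrics.silhouette_score(tfidf_matrix, labels, metric='cosine'))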
In [45]:
print('Estimated number of clusters: %d' % n_clusters_)
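To see what the clusters actually contain, the labels can be mapped back to author names; a sketch assuming `id_author_text_list` is still in the same row order as `tfidf_matrix`:

In [ ]:
# Sketch: group author names by cluster label to eyeball cluster contents.
from collections import defaultdict
cluster_authors = defaultdict(list)
for (tlg_id, author, _), label in zip(id_author_text_list, labels):
    cluster_authors[label].append(author)
for label in sorted(cluster_authors)[:5]:  # peek at the first few clusters
    print(label, cluster_authors[label][:10])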
In [46]:
import matplotlib.pyplot as plt
from itertools import cycle
In [47]:
tfidf_array = tfidf_matrix.toarray()
In [48]:
type(tfidf_array)
Out[48]:
numpy.ndarray
In [52]:
tfidf_array.shape
Out[52]:
In [51]:
pandas.DataFrame(tfidf_array).head(10)
Out[51]:
In [49]:
plt.close('all')
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    cluster_center = tfidf_array[cluster_centers_indices[k]]
    plt.plot(tfidf_array[class_members, 0], tfidf_array[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    for x in tfidf_array[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
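The x/y coordinates above are just the first two columns of the tf-idf matrix, i.e. two arbitrary vocabulary terms, so the layout says little about cluster structure. A sketch of a more informative view, assuming scikit-learn's `TruncatedSVD` (which accepts the sparse matrix directly), projects the features onto their two leading components first:

In [ ]:
# Sketch: project the sparse tf-idf matrix onto its two leading SVD
# components before plotting, so the scatter reflects the dominant
# variance in the corpus rather than two arbitrary vocabulary terms.
from sklearn.decomposition import TruncatedSVD

tfidf_2d = TruncatedSVD(n_components=2).fit_transform(tfidf_matrix)
plt.figure(2)
plt.clf()
for k, col in zip(range(n_clusters_), cycle('bgrcmyk')):
    members = labels == k
    plt.plot(tfidf_2d[members, 0], tfidf_2d[members, 1], col + '.')
plt.title('Clusters projected onto two SVD components')
plt.show()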
In [ ]: