In [1]:
import datetime as dt
import os
import time
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_index
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets
from cltk.corpus.greek.tlg.parse_tlg_indices import select_authors_by_epithet
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.tokenize.word import nltk_tokenize_words
from greek_accentuation.characters import base
import pandas # pip install pandas
from sklearn import metrics
from sklearn.cluster import AffinityPropagation
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
In [2]:
def stream_lemmatized_files(corpus_dir, reject_none_epithet=False, reject_chars_less_than=None):
    # yield all docs in a dir; parameters for dropping authors with no epithet and short texts
    user_dir = os.path.expanduser('~/cltk_data/user_data/' + corpus_dir)
    files = os.listdir(user_dir)
    map_id_author = get_id_author()
    for file in files:
        filepath = os.path.join(user_dir, file)
        file_id = file[3:-4]
        author = map_id_author[file_id]
        if reject_none_epithet:
            # get id numbers and then epithets of each author
            author_epithet = get_epithet_of_author(file_id)
            if not author_epithet:
                continue
        with open(filepath) as fo:
            text = fo.read()
        if reject_chars_less_than:
            if len(text) < reject_chars_less_than:
                continue
        yield file_id, author, text
In [3]:
t0 = dt.datetime.utcnow()
id_author_text_list = []
for tlg_id, author, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops',
                                                    reject_none_epithet=True,
                                                    reject_chars_less_than=500):
    id_author_text_list.append((tlg_id, author, text))
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
print('Number of texts:', len(id_author_text_list))
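In [ ]:
# Not in the original run: a quick peek at the first record to confirm the
# (tlg_id, author, text) structure produced by the generator above.
first_id, first_author, first_text = id_author_text_list[0]
print(first_id, first_author, len(first_text))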
In [4]:
t0 = dt.datetime.utcnow()
# tf-idf features
n_samples = 2000
n_features = 1000 # TODO: increase
n_topics = len(get_epithets()) # 55
n_top_words = 20
tfidf_vectorizer = TfidfVectorizer(max_df=1.0,
                                   min_df=1,
                                   max_features=n_features,
                                   stop_words=None)
texts_list = [t[2] for t in id_author_text_list]
tfidf_matrix = tfidf_vectorizer.fit_transform(texts_list)
# save features
vector_fp = os.path.expanduser('~/cltk_data/user_data/tfidf_{0}features.pickle'.format(n_features))
joblib.dump(tfidf_matrix, vector_fp)
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
# time on good server:
# 1000 features: 0:01:22
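In [ ]:
# Not in the original run: an optional sanity check on the vectorizer, assuming
# the cells above have executed. get_feature_names() returns the vocabulary in
# the column order of tfidf_matrix (older scikit-learn API; newer releases
# rename it get_feature_names_out()).
feature_names = tfidf_vectorizer.get_feature_names()
doc_weights = tfidf_matrix[0].toarray().ravel()  # tf-idf weights of the first text
top_indices = doc_weights.argsort()[::-1][:10]
for idx in top_indices:
    print(feature_names[idx], round(float(doc_weights[idx]), 3))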
In [5]:
af = AffinityPropagation(preference=-50).fit(tfidf_matrix)  # TODO: tune preference; lower values give fewer clusters
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
In [6]:
print('Estimated number of clusters: %d' % n_clusters_)
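In [ ]:
# Not in the original run: a sketch that groups author names by cluster label
# to eyeball what Affinity Propagation found. It assumes the row order of
# tfidf_matrix matches id_author_text_list (it does: fit_transform preserved
# the order of texts_list).
from collections import defaultdict

clusters = defaultdict(list)
for (tlg_id, author, _), label in zip(id_author_text_list, labels):
    clusters[label].append(author)
for label, authors in sorted(clusters.items()):
    print(label, len(authors), authors[:5])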
In [7]:
import matplotlib.pyplot as plt
from itertools import cycle
In [8]:
tfidf_array = tfidf_matrix.toarray()
In [9]:
type(tfidf_array)
Out[9]:
numpy.ndarray
In [16]:
plt.close('all')
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    cluster_center = tfidf_array[cluster_centers_indices[k]]
    plt.plot(tfidf_array[class_members, 0], tfidf_array[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    for x in tfidf_array[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
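In [ ]:
# Not in the original run: an optional cluster-quality check with the
# silhouette coefficient (metrics was imported in the first cell). Values
# near 1 mean tight, well-separated clusters; values near 0 mean overlap.
# Only meaningful when more than one cluster was found.
if n_clusters_ > 1:
    score = metrics.silhouette_score(tfidf_array, labels, metric='euclidean')
    print('Silhouette coefficient: {:.3f}'.format(score))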
In [ ]: