In [2]:
#!/usr/bin/python
################################################################################
#Author: Antsa Raharimanantsoa
#Description: Classification using clustering algorithm
#Creation_date: March 2017
################################################################################

import nltk
from mongoengine import *
from document import *
from lib import *
from algo_clustering import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import cluster
from operator import itemgetter
from itertools import groupby
import logging
from datetime import datetime
import time

log_name = datetime.now().strftime("%Y%m%d_%H%M")
logging.basicConfig(filename='log/clusters/' + log_name + '.log',
                    level=logging.DEBUG,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    datefmt='%d-%m-%Y %I:%M:%S %p')

stopwords = nltk.corpus.stopwords.words('french')
stopwords += nltk.corpus.stopwords.words('english')

"""Retrieve all contents for the clustering"""
content = get_content_article()
logging.info("Retrieve all articles for the classification")

"""tf-idf representation"""
tfidf_vectorizer = TfidfVectorizer(#max_df=0.8,
                                max_features=200000,
                                 #min_df=0.2,
                                 stop_words=stopwords,
                                use_idf=True,ngram_range=(1,3)
                                ,tokenizer=tokenize_only)
tfidf_matrix = tfidf_vectorizer.fit_transform(content.values())

print('---- TF-IDF done ----')
logging.info("TF-IDF done, clustering ongoing ...")

"""Test of Kmeans (number of clusters must be detected in advance)"""
km_clusters = kmeans(tfidf_matrix, 5)
gp_tokens = []
gp_clusters = [{'cluster':vlue,
                'article_id':content.keys()[idx]} for (idx,vlue) in enumerate(km_clusters)]

connect(DATABASE_NAME)
for each_art in NewArticle.objects:
    for idents in gp_clusters:
        if each_art.id == idents['article_id']:
            gp_tokens.append((each_art.tokens,idents['cluster']))
            
sorted_tokens = sorted(gp_tokens, key=itemgetter(1))
sorted_tokens


---- TF-IDF done ----
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-0817d10e59e1> in <module>()
     55     for idents in gp_clusters:
     56         if each_art.id == idents['article_id']:
---> 57             gp_tokens.append((each_art.tokens,idents['cluster']))
     58 
     59 sorted_tokens = sorted(gp_tokens, key=itemgetter(1))

NameError: name 'gp_tokens' is not defined

In [ ]: