In [7]:
%%time
from pymongo import MongoClient
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
from sklearn.cluster import KMeans
import numpy as np
import pickle

client = MongoClient('mongodb://localhost/')
db = client['eventData']
sen = db.documents_english

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

texts = []
docIds = []
actuallyTrained = 0
for doc in sen.find():
    try:
        # each record's 'document' field holds the raw text (joined here
        # in case it is stored as a list of strings)
        raw = ''.join(doc['document']).lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [t for t in tokens if t not in en_stop]
        stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
        texts.append(stemmed_tokens)
        docIds.append(doc['_id'])
        actuallyTrained += 1
    except Exception:
        # skip malformed records rather than aborting the whole pass
        pass
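# Hypothetical sanity check (the sample sentence is made up): push one
# sentence through the same pipeline to see what the model trains on.
sample = tokenizer.tokenize("the cats are running")
sample = [t for t in sample if t not in en_stop]
print([p_stemmer.stem(t) for t in sample])  # roughly: ['cat', 'run']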

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word = dictionary, passes=1)
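# Optional sanity check: print the top words of a few learned topics
# before clustering (print_topics is part of gensim's LdaModel API).
for topic in ldamodel.print_topics(num_topics=5, num_words=8):
    print(topic)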

# Convert each document's sparse topic distribution into a dense
# 20-dimensional feature vector; topics gensim omits get probability 0.
dim = 20
result = []
for i in range(actuallyTrained):
    feature = [0.0] * dim
    for topic_id, prob in ldamodel[corpus[i]]:
        feature[topic_id] = prob
    result.append(feature)
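# An equivalent, shorter conversion via gensim's matutils helper (an
# alternative to the loop above, left commented out):
#   from gensim import matutils
#   result = matutils.corpus2dense(ldamodel[corpus], num_terms=dim).T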
    
kmeanstest = np.array(result)
kmeans = KMeans(n_clusters=20, random_state=0).fit(kmeanstest)
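# Quick look at how balanced the 20 clusters came out.
print(np.bincount(kmeans.labels_, minlength=20))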

# Before building the dictionary, check that docIds and the cluster
# labels have the same length; otherwise the mapping would be misaligned.
assert len(docIds) == kmeans.labels_.size, \
    "the docIds size is different from the topic cluster size"
dictionary_docId_topicClusterItBelongs = {}
for i in range(actuallyTrained):
    dictionary_docId_topicClusterItBelongs[docIds[i]] = kmeans.labels_[i]

with open('traingrst_english.pkl', 'wb') as output:
    pickle.dump(dictionary_docId_topicClusterItBelongs, output)

#print("--- %s seconds ---" % (time.time() - start_time))


CPU times: user 2h 52min 49s, sys: 33min 51s, total: 3h 26min 41s
Wall time: 2h 50min 56s

In [8]:
%%time
with open('traingrst_english.pkl', 'rb') as f:
    dic = pickle.load(f)
topic_set = 0
# $snapshot keeps the cursor from returning the same document twice
# while the collection it iterates over is being updated.
for i in sen.find(modifiers={"$snapshot": True}):
    try:
        # look up the cluster this document was assigned to
        docid = i['_id']
        topic = dic[docid]
        sen.update_one(
            {"_id": docid},  # match the _id with its original type, not str(docid)
            {"$set": {
                # cast to int: the label is a numpy int64, which BSON
                # cannot encode
                "topic": int(topic)
            }}
        )
        topic_set += 1
    except Exception as e:
        print(str(e))
print(topic_set)


1161388
CPU times: user 3min 32s, sys: 24 s, total: 3min 56s
Wall time: 5min 57s
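As a quick follow-up check (a sketch, assuming pymongo 3.7+ for count_documents), the number of documents carrying a topic field should match the counter printed above:

print(sen.count_documents({"topic": {"$exists": True}}))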

In [ ]: