In [6]:
%%time
from pymongo import MongoClient
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
from sklearn.cluster import KMeans
import numpy as np
import pickle
import time
#start_time = time.time()
# connect to the local MongoDB instance (same URI as the later cell)
client = MongoClient('mongodb://localhost:29017/')
db=client['eventData']
sen=db.documents_arabic
tokenizer = RegexpTokenizer(r'\w+')
# create Arabic stop words list and add a few extra stop words
ar_stop = get_stop_words('ar')
ar_stop.append('ب')
ar_stop.append('إلى')
# Create p_stemmer of class PorterStemmer
# (note: PorterStemmer targets English suffixes, so most Arabic tokens pass through unchanged)
p_stemmer = PorterStemmer()
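# Sketch of an alternative (assumption, not used below): NLTK ships an Arabic
# stemmer (ISRI), which would be a better fit than the English Porter stemmer.
# from nltk.stem.isri import ISRIStemmer
# p_stemmer = ISRIStemmer()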
texts = []
docIds=[]
actuallyTrained = 0
for doc in sen.find():
    try:
        raw = ''.join(doc['document']).lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [t for t in tokens if t not in ar_stop]
        stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
        texts.append(stemmed_tokens)
        docIds.append(doc['_id'])
        actuallyTrained = actuallyTrained + 1
    except Exception:
        # skip documents that are missing the 'document' field or cannot be tokenized
        pass
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word = dictionary, passes=1)
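# Sketch (added, not part of the original run): print the top words of a few
# learned topics to sanity-check the LDA model before clustering.
for topic_id, topic_words in ldamodel.print_topics(num_topics=5, num_words=10):
    print(topic_id, topic_words)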
# turn each document's sparse LDA topic vector into a dense, fixed-length
# feature vector of length dim, filling the missing topic slots with 0
dim = 20
result = []
for i in range(actuallyTrained):
    feature = []
    previousindex = 0
    for index, prob in ldamodel[corpus[i]]:
        # pad zeros for the topic indices that LDA dropped before this one
        for beforeindex in range(previousindex, index):
            feature.append(0)
        feature.append(prob)
        previousindex = index + 1
    while len(feature) < dim:
        feature.append(0)  # pad zeros at the end
    result.append(feature)
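# Alternative sketch (assumption, not from the original notebook): gensim's
# corpus2dense helper performs the same sparse-to-dense conversion in one call;
# the transpose gives a documents x topics matrix comparable to `result`.
from gensim import matutils
dense_result = matutils.corpus2dense(ldamodel[corpus], num_terms=dim).T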
kmeanstest=np.array(result)
kmeans = KMeans(n_clusters=20, random_state=0).fit(kmeanstest)
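# Quick sketch (added): per-cluster document counts, to see how evenly the
# documents spread across the 20 k-means clusters.
print(np.bincount(kmeans.labels_, minlength=20))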
# before building the dictionary, check that docIds and the cluster labels have the same length
try:
    assert len(docIds) == kmeans.labels_.size
    dictionary_docId_topicClusterItBelongs = {}
    for i in range(actuallyTrained):
        dictionary_docId_topicClusterItBelongs.update({docIds[i]: kmeans.labels_[i]})
except AssertionError:
    print("the docIds size is different from the cluster label size")
with open('traingrst_arabic.pkl', 'wb') as output:
    pickle.dump(dictionary_docId_topicClusterItBelongs, output)
#print("--- %s seconds ---" % (time.time() - start_time))
In [1]:
%%time
from pymongo import MongoClient
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
from sklearn.cluster import KMeans
import numpy as np
import pickle
import time
#start_time = time.time()
client = MongoClient('mongodb://localhost:29017/')
db=client['eventData']
sen=db.documents_arabic
In [10]:
%%time
with open('traingrst_arabic.pkl', 'rb') as f:
    dic = pickle.load(f)
topic_set = 0
for doc in sen.find(modifiers={"$snapshot": True}):
    try:
        # look up the cluster/topic for this document id
        docid = doc['_id']
        topic = dic[docid]
        sen.update_many(
            # filter on the original _id value; casting it to str would match nothing
            {"_id": docid},
            {
                "$set": {
                    # cast to int: the labels loaded from the pickle are numpy integers,
                    # which pymongo cannot encode ("can't encode object" error)
                    "topic": int(topic)
                }
            }
        )
        topic_set = topic_set + 1
    except Exception as e:
        print(str(e))
print(topic_set)
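# Sketch of an alternative (assumption, not in the original): batching the
# updates with bulk_write avoids one round trip to MongoDB per document.
# from pymongo import UpdateOne
# requests = [UpdateOne({"_id": d}, {"$set": {"topic": int(t)}}) for d, t in dic.items()]
# if requests:
#     sen.bulk_write(requests)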
In [38]:
from bson.son import SON
pipeline = [
    {"$unwind": "$topic"},
    {"$group": {"_id": "$topic", "count": {"$sum": 1}}},
    {"$sort": SON([("count", -1), ("_id", -1)])}
]
import pprint
distribution=list(db.documents_arabic.aggregate(pipeline))
In [39]:
distribution
Out[39]:
In [46]:
xpos=[]
ypos=[]
for dictitem in distribution:
    xpos.append(dictitem['_id'])
    ypos.append(dictitem['count'])
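# Note (assumption): topics with zero documents never appear in the aggregation
# output, so they would simply be missing from the bar chart. A sketch that
# pads the missing topic indices with zero counts:
counts = {d['_id']: d['count'] for d in distribution}
xpos_full = list(range(20))
ypos_full = [counts.get(t, 0) for t in xpos_full]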
In [47]:
xpos
Out[47]:
In [52]:
import matplotlib.pyplot as plt;
plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
#performance = [10,8,6,4,2,1]
plt.bar(xpos, ypos)
#plt.xticks(y_pos, objects)
plt.ylabel('Document Under The Topic Count')
plt.xlabel('Topic Index')
plt.xticks(np.arange(0, 20, 1))
plt.title('topic count model for arabic documents')
plt.show()
In [ ]: