In [2]:
import json
import codecs
import os
docs = []
for filename in os.listdir("reuters-21578-json/data/full"):
f = open("reuters-21578-json/data/full/"+filename)
js = json.load(f)
for j in js:
if 'topics' in j and 'body' in j:
d = {}
d["id"] = j['id']
d["text"] = j['body'].replace("\n","")
d["title"] = j['title']
d["tags"] = ",".join(j['topics'])
docs.append(d)
print "loaded ",len(docs)," documents"
In [3]:
from seldon.text import DocumentSimilarity,DefaultJsonCorpus
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
corpus = DefaultJsonCorpus(docs)
ds = DocumentSimilarity(model_type='gensim_lsi')
ds.fit(corpus)
print "done"
In [4]:
# Evaluate the fitted model using the default (exact) nearest-neighbour
# search; the approximate-index variant is run in the next cell.
ds.score()
Out[4]:
Run the test again, but this time use the Annoy approximate nearest-neighbour index that was built during fitting. It should be much faster.
In [5]:
# Re-run the evaluation using the Annoy approximate nearest-neighbour
# index built during fit() - much faster than the exact search above.
ds.score(approx=True)
Out[5]:
In [6]:
query_doc=6023
print "Query doc: ",ds.get_meta(query_doc)['title'],"Tagged:",ds.get_meta(query_doc)['tags']
neighbours = ds.nn(query_doc,k=5,translate_id=True,approx=True)
print neighbours
for (doc_id,_) in neighbours:
j = ds.get_meta(doc_id)
print "Doc id",doc_id,j['title'],"Tagged:",j['tags']
In [7]:
import seldon
rw = seldon.Recommender_wrapper()
rw.save_recommender(ds,"reuters_recommender")
print "done"
In [ ]:
from seldon.microservice import Microservices

# Expose the saved recommender as an HTTP microservice.
# app.run() blocks, so this cell keeps serving until interrupted.
micro = Microservices()
service_app = micro.create_recommendation_microservice("reuters_recommender")
service_app.run(host="0.0.0.0",port=5000,debug=False)
In [ ]: