In [14]:
from gensim import corpora, models, similarities
import pickle
import string
In [15]:
def remove_punctuation(x):
    """Return ``str(x)`` with every ASCII punctuation character removed.

    Parameters
    ----------
    x : object
        Value to clean; it is converted with ``str()`` before processing.

    Returns
    -------
    str
        The text with all characters listed in ``string.punctuation`` deleted.
    """
    x = str(x)
    try:
        # Python 3: the third argument of str.maketrans lists characters to delete.
        table = str.maketrans('', '', string.punctuation)
    except AttributeError:
        # Python 2: str has no maketrans; use the two-argument translate form.
        return x.translate(string.maketrans('', ''), string.punctuation)
    return x.translate(table)
In [16]:
# Load the pickled reviews. Later cells call .items() / .keys() on this object and
# join each value with ' ', so it is presumably a dict of beer name -> list of
# review strings -- TODO confirm against the script that produced the pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('beer_reviews.pkl', 'rb') as reviews_file:
    beer_reviews = pickle.load(reviews_file)
In [17]:
# Keep only the value of each (key, value) pair, in the mapping's iteration order.
documents = [value for _key, value in beer_reviews.items()]
In [18]:
# Collapse each entry into a single space-separated string.
documents = list(map(' '.join, documents))
In [19]:
# Strip punctuation from every document.
documents = list(map(remove_punctuation, documents))
In [20]:
# Lower-case every document so that token matching is case-insensitive.
documents = [
    raw_text.lower()
    for raw_text in documents
]
In [21]:
# Words excluded from every document before indexing.
stoplist = {'for', 'a', 'of', 'the', 'and', 'to', 'in'}

# Split each document on whitespace and keep the tokens that are not stop words.
texts = []
for document in documents:
    kept_words = [word for word in document.lower().split() if word not in stoplist]
    texts.append(kept_words)
In [22]:
from collections import defaultdict
In [23]:
# Count how many times each token occurs across the whole collection.
frequency = defaultdict(int)
for token_list in texts:
    for tok in token_list:
        frequency[tok] = frequency[tok] + 1

# Discard tokens that occur only once in the entire collection.
texts = [
    [tok for tok in token_list if frequency[tok] >= 2]
    for token_list in texts
]
In [24]:
# Build the gensim Dictionary from the tokenised texts; it is used below for
# doc2bow() and as the id2word mapping of the LSI model.
dictionary = corpora.Dictionary(texts)
In [25]:
# Convert every tokenised text to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))
In [26]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)
In [27]:
# Apply the fitted TF-IDF model to the whole corpus.
corpus_tfidf = tfidf[corpus]
In [30]:
# Train an LSI model with 500 topics on the TF-IDF corpus.
# NOTE(review): 500 is a hard-coded choice with no stated rationale -- consider a named constant.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=500)
In [31]:
# Build the similarity index from every corpus document passed through the LSI model.
# NOTE(review): lsi was trained on corpus_tfidf, but the raw bag-of-words `corpus`
# is projected here (the query cells further down also pass raw bag-of-words
# vectors to lsi) -- confirm that skipping the TF-IDF step is intended.
index = similarities.MatrixSimilarity(lsi[corpus])
In [32]:
# Materialise the keys as a real list: later cells call beer_names.index(...),
# subscript beer_names[i] and pickle it, none of which work on the dict view
# that .keys() returns on Python 3. The order matches `documents`, which was
# built from the same mapping's .items().
beer_names = list(beer_reviews.keys())
In [33]:
# Scratch check: push an empty query through the dictionary and the LSI model.
# doc, vec_bow and vec_lsi are all reassigned by the query cell further down.
doc = ''
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
In [34]:
# Query used by the next cell: looked up in beer_names first, otherwise used as the query text itself.
text_input = 'Heady Topper'
In [35]:
# Print the five beers whose reviews are most similar to `text_input`.
# `text_input` is first looked up as a beer name; if no beer has that name it
# is used as a free-text query instead.
beer_name_inputted = 1
try:
    query_position = beer_names.index(text_input)
    doc = documents[query_position]
except ValueError:
    # Not a known beer name. Strip punctuation exactly as was done for the
    # indexed documents, otherwise tokens such as "hoppy," never match.
    query_position = None
    doc = remove_punctuation(text_input)
    beer_name_inputted = 0
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
sims = index[vec_lsi]
# Highest similarity first.
ranked = sorted(enumerate(sims), key=lambda pair: -pair[1])
# Exclude the queried beer by its position instead of assuming it is ranked
# first (a tie with another document could otherwise drop the wrong entry).
ranked = [pair for pair in ranked if pair[0] != query_position]
for beer in ranked[:5]:
    print(beer_names[beer[0]] + ' : %.2f' % (beer[1]*100))
In [36]:
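# Notes on the objects pickled below for the Flask app
# (the numbers are presumably timings in seconds recorded by the author -- confirm):
# documents
# dictionary
# lsi 41.2s
# index 6.4s --rebuild on app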
In [41]:
# Persist the cleaned documents; the with-block closes (and flushes) the file.
with open('flask/app/models/documents.pkl', 'wb') as out_file:
    pickle.dump(documents, out_file)
In [42]:
# Persist the gensim dictionary; the with-block closes (and flushes) the file.
with open('flask/app/models/dictionary.pkl', 'wb') as out_file:
    pickle.dump(dictionary, out_file)
In [43]:
# Persist the trained LSI model; the with-block closes (and flushes) the file.
with open('flask/app/models/lsi.pkl', 'wb') as out_file:
    pickle.dump(lsi, out_file)
In [54]:
# Persist the bag-of-words corpus; the with-block closes (and flushes) the file.
with open('flask/app/models/corpus.pkl', 'wb') as out_file:
    pickle.dump(corpus, out_file)
In [55]:
# Persist the beer names; the with-block closes (and flushes) the file.
# list(...) keeps this working when beer_names is a dict keys view, which
# cannot be pickled on Python 3; for a list it is just a shallow copy.
with open('flask/app/models/beer_names.pkl', 'wb') as out_file:
    pickle.dump(list(beer_names), out_file)
In [56]:
# Persist the similarity index; the with-block closes (and flushes) the file.
with open('flask/app/models/index.pkl', 'wb') as out_file:
    pickle.dump(index, out_file)
In [ ]:
In [1]:
from sklearn.cluster import KMeans
In [ ]: