In [14]:
from gensim import corpora, models, similarities
import pickle
import string

In [15]:
def remove_punctuation(x):
    """Return ``str(x)`` with every ASCII punctuation character removed.

    Parameters
    ----------
    x : object
        Value to clean; it is converted with ``str()`` first, so non-string
        inputs (e.g. numbers) are accepted.

    Returns
    -------
    str
        The text with all characters in ``string.punctuation`` deleted.
    """
    text = str(x)
    if hasattr(str, 'maketrans'):
        # Python 3: string.maketrans no longer exists and str.translate takes
        # a single table; the third maketrans argument lists chars to delete.
        return text.translate(str.maketrans('', '', string.punctuation))
    # Python 2: a None table with a deletechars argument removes the characters.
    return text.translate(None, string.punctuation)

In [16]:
# Load the pickled reviews inside a context manager so the file handle is
# closed as soon as reading finishes instead of being left open.
# NOTE(security): pickle.load can execute arbitrary code -- only load
# 'beer_reviews.pkl' if it comes from a trusted source.
with open('beer_reviews.pkl', 'rb') as reviews_file:
    beer_reviews = pickle.load(reviews_file)

In [17]:
# One entry per beer: the value stored under each key of beer_reviews,
# in the dict's iteration order.
documents = list(beer_reviews.values())

In [18]:
# Collapse each beer's sequence of review strings into one space-separated
# document. Not idempotent: re-running on already-joined strings would
# insert a space between every character.
documents = list(map(' '.join, documents))

In [19]:
# Strip punctuation from every document.
documents = list(map(remove_punctuation, documents))

In [20]:
# Normalise case so that token matching is case-insensitive.
documents = [doc_text.lower() for doc_text in documents]

In [21]:
# Very small stop-word list; these words are dropped during tokenisation.
stoplist = set('for a of the and to in'.split())

# Tokenise each document on whitespace and discard stop words.
texts = []
for document in documents:
    kept_words = [word for word in document.lower().split() if word not in stoplist]
    texts.append(kept_words)

In [22]:
from collections import defaultdict

In [23]:
# Count how many times each token occurs across the whole collection.
frequency = defaultdict(int)
for token in (tok for tokens in texts for tok in tokens):
    frequency[token] += 1

# Drop tokens that appear only once anywhere in the collection.
texts = [list(filter(lambda tok: frequency[tok] > 1, tokens)) for tokens in texts]

In [24]:
# Build the token <-> integer-id mapping from the filtered token lists.
dictionary = corpora.Dictionary(documents=texts)

In [25]:
# Convert every token list into its bag-of-words vector.
corpus = list(map(dictionary.doc2bow, texts))

In [26]:
# Fit TF-IDF weights on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=corpus)

In [27]:
# Apply the fitted TF-IDF model to the bag-of-words corpus; the result is the
# training input for the LSI model.
corpus_tfidf = tfidf[corpus]

In [30]:
# Fit a 500-topic LSI model on the TF-IDF-weighted corpus.
lsi = models.LsiModel(
    corpus=corpus_tfidf,
    num_topics=500,
    id2word=dictionary,
)

In [31]:
# Build the similarity index over the LSI projection of every document.
# NOTE(review): lsi was fitted on corpus_tfidf, but here (and in the query
# cell) the raw bag-of-words vectors are projected instead. Confirm whether
# lsi[tfidf[corpus]] was intended; switching would require the query code
# and anything that rebuilds this index to apply tfidf as well.
index = similarities.MatrixSimilarity(lsi[corpus])

In [32]:
# Materialise the keys as a list: on Python 3 dict.keys() is a view that has
# no .index(), cannot be subscripted and cannot be pickled, all of which the
# cells below rely on. Order matches the iteration order used for documents.
beer_names = list(beer_reviews.keys())

In [33]:
# Scratch query with an empty document (no tokens to look up).
# doc, vec_bow and vec_lsi are all reassigned by the beer lookup cell later on.
doc = ''
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]

In [34]:
# Query string: looked up as a beer name first; if it is not found in
# beer_names it is used as free text instead.
text_input = 'Heady Topper'

In [35]:
# get the reviews for a beer
# Materialise the names locally so .index() and positional lookup work even
# when beer_names is a dict view rather than a list.
beer_name_list = list(beer_names)
beer_name_inputted = 1  # 1 when text_input is a known beer name, else 0
try:
    query_position = beer_name_list.index(text_input)
    doc = documents[query_position]
except ValueError:
    # Not a known beer name: treat the input as a free-text description and
    # strip punctuation the same way the review documents were cleaned.
    query_position = None
    doc = remove_punctuation(text_input)
    beer_name_inputted = 0
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]

sims = index[vec_lsi]
ranked = sorted(enumerate(sims), key=lambda pair: -pair[1])
# Exclude the queried beer by its position instead of assuming it is always
# ranked first; with tied scores the old slice could hide a different beer
# and list the queried one as its own recommendation.
top_matches = [pair for pair in ranked if pair[0] != query_position][:5]
for beer in top_matches:
    print(beer_name_list[beer[0]] + ' : %.2f' % (beer[1] * 100))


Pliny The Elder : 92.63
Palate Wrecker : 91.26
Abrasive Ale : 91.21
Stone Enjoy By IPA : 90.74
Stone RuinTen Triple IPA : 90.70

In [36]:
# Objects to persist for the app (numbers look like build times -- TODO confirm):
# documents
# dictionary
# lsi    41.2s
# index  6.4s -- rebuild on app

In [41]:
# Persist the cleaned documents; the context manager guarantees the file is
# flushed and closed instead of leaving the handle open.
with open('flask/app/models/documents.pkl', 'wb') as out_file:
    pickle.dump(documents, out_file)

In [42]:
# Persist the token dictionary; the context manager guarantees the file is
# flushed and closed instead of leaving the handle open.
with open('flask/app/models/dictionary.pkl', 'wb') as out_file:
    pickle.dump(dictionary, out_file)

In [43]:
# Persist the fitted LSI model; the context manager guarantees the file is
# flushed and closed instead of leaving the handle open.
with open('flask/app/models/lsi.pkl', 'wb') as out_file:
    pickle.dump(lsi, out_file)

In [54]:
# Persist the bag-of-words corpus; the context manager guarantees the file is
# flushed and closed instead of leaving the handle open.
with open('flask/app/models/corpus.pkl', 'wb') as out_file:
    pickle.dump(corpus, out_file)

In [55]:
# Persist the beer names as a plain list: a dict_keys view (Python 3) cannot
# be pickled, and a list is what positional lookup on the loading side needs.
# The context manager guarantees the file is flushed and closed.
with open('flask/app/models/beer_names.pkl', 'wb') as out_file:
    pickle.dump(list(beer_names), out_file)

In [56]:
# Persist the similarity index; the context manager guarantees the file is
# flushed and closed instead of leaving the handle open.
with open('flask/app/models/index.pkl', 'wb') as out_file:
    pickle.dump(index, out_file)

In [ ]:


In [1]:
from sklearn.cluster import KMeans

In [ ]: