notebook.community

Edit and run



In [14]:

    
from gensim import corpora, models, similarities
import pickle
import string



In [15]:

    
def remove_punctuation(x):
    """Return ``str(x)`` with every ASCII punctuation character removed.

    The characters dropped are exactly those in ``string.punctuation``.
    All other characters, including whitespace, are kept in order.

    The previous implementation relied on ``string.maketrans`` and the
    two-argument form of ``str.translate``, both of which exist only on
    Python 2. This version gives the same result and also runs on Python 3.

    Args:
        x: Any object; it is converted with ``str()`` before filtering.

    Returns:
        str: The text of ``x`` without punctuation characters.
    """
    text = str(x)
    # Set membership keeps the per-character test O(1).
    punctuation = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation)



In [16]:

    
# Load the scraped reviews; the rest of the notebook uses this object through
# .items() and .keys(), i.e. as a mapping of beer name -> reviews.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
# The `with` block guarantees the file handle is closed after loading.
with open('beer_reviews.pkl', 'rb') as reviews_file:
    beer_reviews = pickle.load(reviews_file)



In [17]:

    
# One entry per beer: the value stored under each beer name, in the
# mapping's iteration order (the same order .items() / .keys() yield).
documents = list(beer_reviews.values())



In [18]:

    
# Collapse each beer's collection of review strings into one
# space-separated document.
# Caution: this rebinds `documents`, so the cell is not idempotent -
# re-running it on already-joined strings would insert a space between
# every character.
documents = list(map(' '.join, documents))



In [19]:

    
# Strip punctuation from every per-beer document.
documents = list(map(remove_punctuation, documents))



In [20]:

    
# Normalise case so that token matching is case-insensitive.
lowered_documents = []
for review in documents:
    lowered_documents.append(review.lower())
documents = lowered_documents



In [21]:

    
# Small hand-picked stop-word list; these tokens are dropped from every document.
stoplist = set('for a of the and to in'.split())

# Tokenise each document on whitespace and discard the stop words.
texts = []
for document in documents:
    kept_words = [word for word in document.lower().split() if word not in stoplist]
    texts.append(kept_words)



In [22]:

    
from collections import defaultdict



In [23]:

    
# Count how often each token occurs across the whole corpus.
frequency = defaultdict(int)
for token in (tok for text in texts for tok in text):
    frequency[token] += 1

# Keep only tokens that occur more than once corpus-wide.
pruned_texts = []
for text in texts:
    pruned_texts.append([token for token in text if frequency[token] > 1])
texts = pruned_texts



In [24]:

    
# Build the gensim token <-> integer-id mapping from the tokenised documents.
dictionary = corpora.Dictionary(texts)



In [25]:

    
# Bag-of-words representation of every document, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))



In [26]:

    
# Fit a tf-idf weighting model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)



In [27]:

    
# Re-weight the bag-of-words corpus with the fitted tf-idf model.
corpus_tfidf = tfidf[corpus]



In [30]:

    
# Fit an LSI model with 500 topics on the tf-idf weighted corpus.
# `id2word` lets the model refer to tokens by their text.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=500)



In [31]:

    
# Project every document into LSI space and build the similarity index
# that queries are compared against.
# NOTE(review): `lsi` was fitted on `corpus_tfidf`, but here it is applied to
# the raw bag-of-words `corpus`. The query cells also feed raw bag-of-words
# vectors into `lsi`, so index and queries agree with each other, but not
# with the space the model was trained on. Switching to tf-idf would have to
# be done here and in the query code together - confirm which is intended.
index = similarities.MatrixSimilarity(lsi[corpus])



In [32]:

    
# Beer names, positionally aligned with `documents` (both follow the
# iteration order of `beer_reviews`).
# Materialised as a list because later cells call .index(), subscript it and
# pickle it; on Python 3 a bare dict.keys() view supports none of those.
beer_names = list(beer_reviews.keys())



In [33]:

    
# Run the query pipeline on an empty document:
# tokenise -> bag-of-words -> LSI projection.
doc = ''
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)
vec_lsi = lsi[vec_bow]



In [34]:

    
# Query for the recommendation cell below: looked up in `beer_names` first,
# and used as free text if no beer has this exact name.
text_input = 'Heady Topper'



In [35]:

    
# Print the five beers whose reviews are most similar to `text_input`.
#
# If `text_input` is a known beer name, that beer's stored reviews are the
# query document and the beer itself is left out of the results. Otherwise
# `text_input` is used directly as a free-text query.
#
# NOTE(review): the query is projected with `lsi` from a raw bag-of-words
# vector, matching how `index` was built, although `lsi` was fitted on
# tf-idf vectors - confirm this is intended.

# Local list copy: a dict.keys() view (Python 3) has no .index() and cannot
# be subscripted.
candidate_names = list(beer_names)

try:
    query_position = candidate_names.index(text_input)
except ValueError:
    # Not a known beer: treat the input itself as the query document.
    query_position = None
    doc = text_input
    beer_name_inputted = 0
else:
    doc = documents[query_position]
    beer_name_inputted = 1

vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]

sims = index[vec_lsi]
ranked = sorted(enumerate(sims), key=lambda pair: -pair[1])

# Exclude the queried beer by its position rather than assuming it is always
# ranked first; with ties it could otherwise appear in its own results.
top_matches = [(position, score) for position, score in ranked
               if position != query_position][:5]
for position, score in top_matches:
    print(candidate_names[position] + ' : %.2f' % (score * 100))









    



Pliny The Elder : 92.63
Palate Wrecker : 91.26
Abrasive Ale : 91.21
Stone Enjoy By IPA : 90.74
Stone RuinTen Triple IPA : 90.70



In [36]:

    
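# Artifacts to export for the Flask app
# (timings are presumably how long each one took to build):
#   documents
#   dictionary
#   lsi    - 41.2s
#   index  - 6.4s -- rebuild on app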



In [41]:

    
# Persist the cleaned per-beer documents for the Flask app.
# The `with` block makes sure the file is flushed and closed.
with open('flask/app/models/documents.pkl', 'wb') as out_file:
    pickle.dump(documents, out_file)



In [42]:

    
# Persist the token <-> id dictionary for the Flask app.
# The `with` block makes sure the file is flushed and closed.
with open('flask/app/models/dictionary.pkl', 'wb') as out_file:
    pickle.dump(dictionary, out_file)



In [43]:

    
# Persist the fitted LSI model for the Flask app.
# The `with` block makes sure the file is flushed and closed.
with open('flask/app/models/lsi.pkl', 'wb') as out_file:
    pickle.dump(lsi, out_file)



In [54]:

    
# Persist the bag-of-words corpus for the Flask app.
# The `with` block makes sure the file is flushed and closed.
with open('flask/app/models/corpus.pkl', 'wb') as out_file:
    pickle.dump(corpus, out_file)



In [55]:

    
# Persist the beer names, positionally aligned with `documents`, for the
# Flask app.
# list(...) keeps this working when `beer_names` is a dict.keys() view, which
# cannot be pickled on Python 3. The `with` block makes sure the file is
# flushed and closed.
with open('flask/app/models/beer_names.pkl', 'wb') as out_file:
    pickle.dump(list(beer_names), out_file)



In [56]:

    
# Persist the similarity index for the Flask app.
# The `with` block makes sure the file is flushed and closed.
with open('flask/app/models/index.pkl', 'wb') as out_file:
    pickle.dump(index, out_file)



In [ ]:



In [1]:

    
from sklearn.cluster import KMeans



In [ ]: