notebook.community

Edit and run



In [3]:

    
from gensim import corpora, models, similarities
import pickle
import string
import pandas as pd
import nltk


from collections import defaultdict

from tqdm import tqdm
from tqdm import tqdm_notebook



In [20]:

    
def remove_punctuation(x):
    """Return ``x`` with every ASCII punctuation character removed.

    Works for both byte strings and text (unicode) strings and returns the
    same type it was given. The previous two-argument
    ``translate(string.maketrans('', ''), ...)`` form only works on Python 2
    byte strings: it raises ``TypeError`` for unicode input and
    ``string.maketrans`` does not exist on Python 3.

    Parameters
    ----------
    x : str or bytes
        The string to clean.

    Returns
    -------
    str or bytes
        ``x`` without any character from ``string.punctuation``.
    """
    if isinstance(x, bytes):
        # Byte strings: ``None`` as the table means "identity", second arg is
        # the set of bytes to delete.
        return x.translate(None, string.punctuation.encode('ascii'))
    # Text strings: mapping a code point to None deletes that character.
    return x.translate({ord(ch): None for ch in string.punctuation})



In [21]:

    
# Load the pickled beer table; later cells read its `reviews`, `brewery_name`
# and `name` columns. Unpickling executes code, so only load a trusted file.
beers = pd.read_pickle('all_beer_reviews.pkl')



In [22]:

    
# One entry per beer: the value of its `reviews` column, in row order.
documents = list(beers.reviews)



In [23]:

    
# Collapse each beer's sequence of review strings into one space-separated document.
documents = list(map(' '.join, documents))



In [24]:

    
# Strip punctuation from every document.
documents = list(map(remove_punctuation, documents))



In [25]:

    
def remove_punctuation(x):
    """Return ``x`` as UTF-8 bytes with ASCII punctuation removed.

    NOTE: this redefines (and shadows) the ``remove_punctuation`` declared in
    an earlier cell; unlike that one it always returns an encoded byte string.

    Text input is encoded to UTF-8 first. Input that is already a byte string
    is used as-is: calling ``.encode('utf-8')`` on a Python 2 byte string
    triggers an implicit ASCII decode, which raises ``UnicodeDecodeError`` as
    soon as the data contains a non-ASCII byte.

    Parameters
    ----------
    x : str or bytes
        The string to clean.

    Returns
    -------
    bytes
        UTF-8 encoded ``x`` without any byte from ``string.punctuation``.
    """
    if not isinstance(x, bytes):
        x = x.encode('utf-8')
    # ``None`` table = identity mapping; the second argument lists the bytes
    # to delete. Replaces ``string.maketrans('', '')``, which is Python 2 only.
    return x.translate(None, string.punctuation.encode('ascii'))



In [26]:

    
# Normalise case so that token matching further down is case-insensitive.
documents = [text.lower() for text in documents]



In [27]:

    
# Hand-picked stop words: common function words plus review filler
# ("pours", "bit", "little", ...).
stoplist = set((
    'an one little just has be up had no with is this it i but that on not '
    'very some as was like from its bit at more into there my pours for '
    'a of the and to in'
).split())



In [28]:

    
# Stoplist of brewery-name words: every whitespace-separated, lower-cased,
# punctuation-stripped token that occurs in any `brewery_name`.
breweries = set(
    remove_punctuation(word)
    for name in beers.brewery_name
    for word in name.lower().split()
)



In [13]:

    
# Merge the generic stop words with the brewery-name words.
# NOTE(review): no visible cell actually filters the documents with this
# combined stoplist -- confirm whether the removal step is missing.

stoplist = stoplist | breweries



In [19]:

    
# Drop rare tokens: count every token across all texts, then keep only those
# that occur more than 10 times overall.
# NOTE: requires `texts`, which is only created by cells further down in this
# notebook (hence the NameError recorded for this cell).
frequency = defaultdict(int)
for tokens in texts:
    for word in tokens:
        frequency[word] += 1

texts = [
    [word for word in tokens if frequency[word] > 10]
    for tokens in texts
]









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-5f362443926c> in <module>()
      1 # remove low frequency words
      2 frequency = defaultdict(int)
----> 3 for text in texts:
      4     for token in text:
      5         frequency[token] += 1

NameError: name 'texts' is not defined



In [393]:

    
# Keep only nouns and adjectives: tag each whitespace token of every document
# and retain the tokens whose POS tag is in `pos_accepted`.
# The recorded run of this cell took close to two hours for 20381 documents.
pos_accepted = {'NN', 'NNS', 'JJ', 'JJR'}
texts = [
    [token for token, tag in nltk.pos_tag(document.lower().split())
     if tag in pos_accepted]
    for document in tqdm(documents, desc = 'docs')
]









    



docs: 100%|██████████| 20381/20381 [1:55:48<00:00,  3.02it/s]



In [17]:

    
# Reload the cached noun/adjective token lists instead of re-running the slow
# POS-tagging cell. The context manager closes the file handle even when
# unpickling fails (the bare `open(...)` left it to the garbage collector).
# Unpickling executes code: only load a file this notebook produced itself.
with open('nn_jj_reviews_only.pkl', 'rb') as fh:
    texts = pickle.load(fh)









    



---------------------------------------------------------------------------
EOFError                                  Traceback (most recent call last)
<ipython-input-17-79cb8cf36838> in <module>()
----> 1 texts = pickle.load(open('nn_jj_reviews_only.pkl','rb'))

/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.pyc in load(file)
   1382 
   1383 def load(file):
-> 1384     return Unpickler(file).load()
   1385 
   1386 def loads(str):

/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.pyc in load(self)
    862             while 1:
    863                 key = read(1)
--> 864                 dispatch[key](self)
    865         except _Stop, stopinst:
    866             return stopinst.value

/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.pyc in load_eof(self)
    884 
    885     def load_eof(self):
--> 886         raise EOFError
    887     dispatch[''] = load_eof
    888 

EOFError:



In [18]:

    
# Peek at the ten least frequent tokens (ascending by count).
sorted(frequency.items(), key=lambda item: item[1])[:10]









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-97cbc6d9f06c> in <module>()
----> 1 sorted(frequency.items(),key = lambda x: x[1])[:10]

NameError: name 'frequency' is not defined



In [396]:

    
# Build the gensim token <-> integer-id mapping from the filtered token lists.
dictionary = corpora.Dictionary(texts)



In [397]:

    
# Bag-of-words vector for every document, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))



In [398]:

    
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)



In [399]:

    
# Apply the TF-IDF weighting to the bag-of-words corpus.
corpus_tfidf = tfidf[corpus]



In [400]:

    
# Fit a 200-topic LSI model on the TF-IDF-weighted corpus.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)



In [401]:

    
# Build the similarity index over every document projected into LSI space.
# NOTE(review): `lsi` was trained on `corpus_tfidf`, but the documents indexed
# here are projected from the raw bag-of-words `corpus`. The query cell projects
# its raw bag-of-words vector the same way, so index and query agree with each
# other -- confirm whether both should go through `tfidf` first.
index = similarities.MatrixSimilarity(lsi[corpus])



In [402]:

    
# Reset to a fresh 0..n-1 index so row labels can be used as positions into
# `documents`. NOTE: this rebinding is not idempotent -- each re-run of the
# cell moves the current index into yet another column.
beers = beers.reset_index()



In [403]:

    
# Query for the similarity search: matched against `beers.name`; when no beer
# has this name the string itself is used as the query text.
text_input = 'Box Set Track #8 - Number Of The Beast'



In [404]:

    
# Find the five beers most similar to `text_input`.
#
# If `text_input` is the name of a known beer, that beer's review document is
# the query; otherwise the raw input text is used as the query.
# `beer_name_inputted` doubles as the slice offset below: 1 skips the top hit
# when the query is a known beer (expected to be the beer itself), 0 keeps it.
beer_name_inputted = 1
try:
    doc = documents[beers[beers.name == text_input].index[0]]
except IndexError:
    # No row matched the name, so `.index[0]` had nothing to return.
    # print() with a single argument behaves the same on Python 2 and 3 and
    # matches the print(...) call used further down in this cell.
    print('Beer Name Not Inputted')
    doc = text_input
    beer_name_inputted = 0

# Project the query into LSI space the same way the index was built
# (bag-of-words -> lsi).
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]

# Similarity of the query against every indexed document.
sims = index[vec_lsi]
similar_beers = []
for beer in sorted(enumerate(sims), key = lambda x: -x[1])[beer_name_inputted:beer_name_inputted+5]:
    # beer == (row position, similarity)
    similar_beers.append(beer[0])
    print(beers.name.iloc[beer[0]] + '\t:\t%.2f' % (beer[1]*100))
similar_beers = beers.iloc[similar_beers,:]









    



Barrel Aged 12 Dogs Of Christmas	:	82.11
Bourbon Barrel Aged Imperial Mayan Mocha	:	80.34
Barrel-Aged Abraxas	:	80.30
Supreme Leader	:	79.27
Barrel Aged GnarlyWine	:	79.24



In [405]:

    
# Print dictionary tokens for the 25 highest-weighted entries of `vec_lsi`.
# NOTE(review): `vec_lsi` comes from `lsi[vec_bow]`, which in gensim yields
# (topic id, weight) pairs -- so `term[0]` is presumably a topic number, not a
# token id, and the printed words would be unrelated to the query. Confirm the
# intent (e.g. show the terms of the strongest topics instead).
for term in sorted(vec_lsi, key = lambda x: -x[1])[:25]:
    # print() call form: same output on Python 2, and valid on Python 3.
    print(dictionary[term[0]])









    



concept
lack
reviewers
greenness
elementstaste
small
lacings
beers
joes
todaya
version
roof
malt
chocolate
carbonationdrinkability
faint
guinness
great
room
belgian
favs
fan
opaque
feel
strong



In [406]:

    
# documents
# dictionary
# lsi    41.2s
# index  6.4s --rebuild on app



In [407]:

    
# Persist the cleaned review documents for the Flask app. The context manager
# flushes and closes the file deterministically instead of leaving the handle
# to the garbage collector.
with open('flask/app/models/documents.pkl', 'wb') as fh:
    pickle.dump(documents, fh)



In [408]:

    
# Persist the gensim dictionary for the Flask app. The context manager flushes
# and closes the file deterministically instead of leaving the handle to the
# garbage collector.
with open('flask/app/models/dictionary.pkl', 'wb') as fh:
    pickle.dump(dictionary, fh)



In [409]:

    
# Persist the trained LSI model for the Flask app. The context manager flushes
# and closes the file deterministically instead of leaving the handle to the
# garbage collector.
with open('flask/app/models/lsi.pkl', 'wb') as fh:
    pickle.dump(lsi, fh)



In [410]:

    
# Persist the bag-of-words corpus for the Flask app. The context manager
# flushes and closes the file deterministically instead of leaving the handle
# to the garbage collector.
with open('flask/app/models/corpus.pkl', 'wb') as fh:
    pickle.dump(corpus, fh)



In [411]:

    
# Persist the similarity index for the Flask app. The context manager flushes
# and closes the file deterministically instead of leaving the handle to the
# garbage collector.
with open('flask/app/models/index.pkl', 'wb') as fh:
    pickle.dump(index, fh)



In [412]:

    
# Save the beer table without the bulky `reviews` column for the Flask app.
# `drop` returns a copy here, so `beers` itself keeps its `reviews` column:
# the cell can be re-run, and earlier cells that read `beers.reviews` still
# work afterwards (the previous `inplace=True` made a second run fail).
beers.drop('reviews', axis=1).to_pickle('flask/app/models/beer_review_df.pkl')



In [414]:

    
# Cache the noun/adjective token lists so the slow POS-tagging cell does not
# have to be re-run. The context manager guarantees the file is flushed and
# closed, so a later `pickle.load` of this path never sees a partially
# written file left behind by an unclosed handle.
with open('nn_jj_reviews_only.pkl', 'wb') as fh:
    pickle.dump(texts, fh)



In [413]:

    
from sklearn.cluster import KMeans



In [ ]: