In [3]:
from gensim import corpora, models, similarities
import pickle
import string
import pandas as pd
import nltk


from collections import defaultdict

from tqdm import tqdm
from tqdm import tqdm_notebook

In [20]:
def remove_punctuation(x):
    #x = x.encode('utf-8')
    return x.translate(string.maketrans('',''),string.punctuation)

In [21]:
beers = pd.read_pickle('all_beer_reviews.pkl')

In [22]:
documents = [review for review in beers.reviews]

In [23]:
documents = [' '.join(review) for review in documents]

In [24]:
documents = [remove_punctuation(doc) for doc in documents]

In [25]:
def remove_punctuation(x):
    x = x.encode('utf-8')
    return x.translate(string.maketrans('',''),string.punctuation)

In [26]:
documents = [review.lower() for review in documents]

In [27]:
stoplist = set('an one little just has be up had no with is this it i but that on not \
                very some as was like from its bit at more into there my pours for \
                a of the and to in'.split())

In [28]:
# create a stoplist of words in brewery names

breweries = []
for brewery in beers.brewery_name:
    for token in brewery.lower().split():
        breweries.append(remove_punctuation(token))
    
breweries = set(breweries)

In [13]:
# combine stoplists and remove the words therein

stoplist = stoplist.union(breweries)

In [19]:
# remove low frequency words
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
        
texts = [[token for token in text if frequency[token] > 10]for text in texts]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-5f362443926c> in <module>()
      1 # remove low frequency words
      2 frequency = defaultdict(int)
----> 3 for text in texts:
      4     for token in text:
      5         frequency[token] += 1

NameError: name 'texts' is not defined

In [393]:
# remove everything that isnt a noun or adjective
pos_accepted = set(['NN','NNS','JJ','JJR'])
#texts = [[word[0] for word in nltk.pos_tag(document.lower().split()) if word[1] in pos_accepted]for document in documents]
texts = []
for document in tqdm(documents, desc = 'docs'):
    clean_review = []

    doc = nltk.pos_tag(document.lower().split())
    for word in doc:
        if word[1] in pos_accepted:
            clean_review.append(word[0])
    texts.append(clean_review)


docs: 100%|██████████| 20381/20381 [1:55:48<00:00,  3.02it/s]

In [17]:
texts = pickle.load(open('nn_jj_reviews_only.pkl','rb'))


---------------------------------------------------------------------------
EOFError                                  Traceback (most recent call last)
<ipython-input-17-79cb8cf36838> in <module>()
----> 1 texts = pickle.load(open('nn_jj_reviews_only.pkl','rb'))

/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.pyc in load(file)
   1382 
   1383 def load(file):
-> 1384     return Unpickler(file).load()
   1385 
   1386 def loads(str):

/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.pyc in load(self)
    862             while 1:
    863                 key = read(1)
--> 864                 dispatch[key](self)
    865         except _Stop, stopinst:
    866             return stopinst.value

/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.pyc in load_eof(self)
    884 
    885     def load_eof(self):
--> 886         raise EOFError
    887     dispatch[''] = load_eof
    888 

EOFError: 

In [18]:
sorted(frequency.items(),key = lambda x: x[1])[:10]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-97cbc6d9f06c> in <module>()
----> 1 sorted(frequency.items(),key = lambda x: x[1])[:10]

NameError: name 'frequency' is not defined

In [396]:
dictionary = corpora.Dictionary(texts)

In [397]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [398]:
tfidf = models.TfidfModel(corpus)

In [399]:
corpus_tfidf = tfidf[corpus]

In [400]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)

In [401]:
index = similarities.MatrixSimilarity(lsi[corpus])

In [402]:
beers = beers.reset_index()

In [403]:
text_input = 'Box Set Track #8 - Number Of The Beast'

In [404]:
# get the reviews for a beer
beer_name_inputted = 1
try:
    doc= documents[beers[beers.name == text_input].index[0]]
except IndexError:
    print 'Beer Name Not Inputted'
    doc = text_input
    beer_name_inputted = 0
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]

sims = index[vec_lsi]
similar_beers = []
for beer in sorted(enumerate(sims), key = lambda x: -x[1])[beer_name_inputted:beer_name_inputted+5]:
    similar_beers.append(beer[0])
    print(beers.name.iloc[beer[0]] + '\t:\t%.2f' % (beer[1]*100))
similar_beers = beers.iloc[similar_beers,:]


Barrel Aged 12 Dogs Of Christmas	:	82.11
Bourbon Barrel Aged Imperial Mayan Mocha	:	80.34
Barrel-Aged Abraxas	:	80.30
Supreme Leader	:	79.27
Barrel Aged GnarlyWine	:	79.24

In [405]:
for term in sorted(vec_lsi, key = lambda x: -x[1])[:25]:
    print dictionary[term[0]]


concept
lack
reviewers
greenness
elementstaste
small
lacings
beers
joes
todaya
version
roof
malt
chocolate
carbonationdrinkability
faint
guinness
great
room
belgian
favs
fan
opaque
feel
strong

In [406]:
# documents
# dictionary
# lsi    41.2s
# index  6.4s --rebuild on app

In [407]:
pickle.dump(documents,open('flask/app/models/documents.pkl','wb'))

In [408]:
pickle.dump(dictionary,open('flask/app/models/dictionary.pkl','wb'))

In [409]:
pickle.dump(lsi,open('flask/app/models/lsi.pkl','wb'))

In [410]:
pickle.dump(corpus,open('flask/app/models/corpus.pkl','wb'))

In [411]:
pickle.dump(index,open('flask/app/models/index.pkl','wb'))

In [412]:
beers.drop('reviews',axis=1,inplace=True)
beers.to_pickle('flask/app/models/beer_review_df.pkl')

In [414]:
pickle.dump(texts,open('nn_jj_reviews_only.pkl','wb'))

In [413]:
from sklearn.cluster import KMeans

In [ ]: