In [3]:
from gensim import corpora, models, similarities
import pickle
import string
import pandas as pd
import nltk
from collections import defaultdict
from tqdm import tqdm
from tqdm import tqdm_notebook
In [20]:
def remove_punctuation(x):
    """Return ``x`` as a byte string with all ASCII punctuation removed.

    Parameters
    ----------
    x : text or bytes
        Text input is encoded to UTF-8 first; byte strings are used as-is.

    Returns
    -------
    bytes
        ``x`` without any character from ``string.punctuation``
        (on Python 2 this is a plain ``str``).
    """
    # The two-argument form of translate() only exists on byte strings, so
    # text must be encoded first.  Byte strings are left alone: re-encoding
    # them would fail on non-ASCII data under Python 2.
    if not isinstance(x, bytes):
        x = x.encode('utf-8')
    # A table of None means "delete only"; the characters to delete must be
    # bytes themselves.
    return x.translate(None, string.punctuation.encode('ascii'))
In [21]:
# Load the beer DataFrame from a local pickle.  Later cells read its
# `reviews`, `brewery_name` and `name` columns.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
beers = pd.read_pickle('all_beer_reviews.pkl')
In [22]:
# One entry per row of `beers`, taken from the `reviews` column in row order.
documents = list(beers.reviews)
In [23]:
# Collapse each entry's sequence of strings into one space-separated string.
documents = list(map(' '.join, documents))
In [24]:
# Strip punctuation from every document.
documents = list(map(remove_punctuation, documents))
In [25]:
def remove_punctuation(x):
    """Return ``x`` as a UTF-8 byte string with ASCII punctuation removed.

    Parameters
    ----------
    x : text or bytes
        Text input is encoded to UTF-8; byte strings are passed through
        unchanged, so the function can safely be applied to its own output.

    Returns
    -------
    bytes
        ``x`` without any character from ``string.punctuation``
        (on Python 2 this is a plain ``str``).
    """
    # Only encode real text: calling .encode() on a Python 2 byte string
    # triggers an implicit ASCII decode that fails on non-ASCII data.
    if not isinstance(x, bytes):
        x = x.encode('utf-8')
    # A table of None means "delete only"; the delete set must be bytes.
    return x.translate(None, string.punctuation.encode('ascii'))
In [26]:
# Lower-case every document.
documents = list(map(lambda text: text.lower(), documents))
In [27]:
# Hand-picked words to ignore, written as one whitespace-separated string
# and split into a set.
stoplist = set((
    'an one little just has be up had no with is this it i but that on not '
    'very some as was like from its bit at more into there my pours for '
    'a of the and to in'
).split())
In [28]:
# Collect every lower-cased, punctuation-stripped word that appears in a
# brewery name, as a set.
breweries = {
    remove_punctuation(word)
    for name in beers.brewery_name
    for word in name.lower().split()
}
In [13]:
# Merge the brewery-name words into the stop list.  This cell only builds
# the combined set; it does not itself remove any words from the reviews.
stoplist = stoplist | breweries
In [19]:
# Count how often each token occurs across all of `texts`, then keep only
# tokens that occur more than 10 times.
frequency = defaultdict(int)
for tokens in texts:
    for word in tokens:
        frequency[word] = frequency[word] + 1

texts = [
    [word for word in tokens if frequency[word] > 10]
    for tokens in texts
]
In [393]:
# Keep only tokens whose part-of-speech tag is one of NN, NNS, JJ or JJR.
pos_accepted = {'NN', 'NNS', 'JJ', 'JJR'}

texts = []
for document in tqdm(documents, desc='docs'):
    # pos_tag yields (token, tag) pairs for the whitespace-split document.
    tagged = nltk.pos_tag(document.lower().split())
    texts.append([token for token, tag in tagged if tag in pos_accepted])
In [17]:
# Reload the POS-filtered token lists from the cache file that a later cell
# of this notebook writes.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('nn_jj_reviews_only.pkl', 'rb') as fh:  # closes the handle on exit
    texts = pickle.load(fh)
In [18]:
# Show the ten (token, count) pairs with the smallest counts.
least_common = sorted(frequency.items(), key=lambda pair: pair[1])
least_common[:10]
In [396]:
# Build the gensim dictionary from the token lists in `texts`.
dictionary = corpora.Dictionary(texts)
In [397]:
# Convert every token list into its bag-of-words vector.
corpus = list(map(dictionary.doc2bow, texts))
In [398]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)
In [399]:
# Apply the TF-IDF model to the bag-of-words corpus.
corpus_tfidf = tfidf[corpus]
In [400]:
# Fit a 200-topic LSI model on the TF-IDF corpus, with `dictionary`
# supplying the id-to-word mapping.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)
In [401]:
# Build the similarity index from the LSI projection of `corpus`.
# NOTE(review): `lsi` was fit on `corpus_tfidf`, but here it is applied to the
# raw bag-of-words `corpus`.  The query cell also feeds a raw bag-of-words
# vector to `lsi`, so index and query agree with each other but not with the
# training input -- confirm whether `lsi[corpus_tfidf]` was intended, and if
# so change both places together.
index = similarities.MatrixSimilarity(lsi[corpus])
In [402]:
# Give `beers` a fresh default index (the previous index is kept as a column).
# The query cell uses a row's index label as a position into `documents`,
# so it depends on this reset.
beers = beers.reset_index()
In [403]:
# Query string: matched against `beers.name` by the lookup cell; if no beer
# has exactly this name, the string itself is used as the query document.
text_input = 'Box Set Track #8 - Number Of The Beast'
In [404]:
# Find the five beers most similar to the query.
#
# If `text_input` is an exact beer name, that beer's stored review text is the
# query and the beer itself is left out of the results; otherwise
# `text_input` is used as free text.
# NOTE(review): free-text input is not passed through remove_punctuation()
# the way `documents` were -- confirm whether it should be.
beer_name_inputted = 1  # 1 when the name was found in `beers`, else 0
query_row = None        # row position of the queried beer, if any
try:
    matched_row = beers[beers.name == text_input].index[0]
    doc = documents[matched_row]
    query_row = matched_row
except IndexError:
    print('Beer Name Not Inputted')
    doc = text_input
    beer_name_inputted = 0

vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
sims = index[vec_lsi]

# Rank every indexed beer by descending similarity.  The queried beer is
# excluded by its row position rather than by assuming it is ranked first:
# the query is built from the full review text, so it is not guaranteed to
# be its own top match.
ranked = sorted(enumerate(sims), key=lambda pair: -pair[1])
top_matches = [pair for pair in ranked if pair[0] != query_row][:5]

similar_beers = []
for row, score in top_matches:
    similar_beers.append(row)
    print(beers.name.iloc[row] + '\t:\t%.2f' % (score * 100))
# Replace the list of row positions with the matching rows of `beers`.
similar_beers = beers.iloc[similar_beers, :]
In [405]:
# Print the dictionary entry for each of the 25 highest-valued components of
# the query's LSI vector.
# NOTE(review): `vec_lsi` comes from `lsi[vec_bow]`, so `term[0]` looks like an
# LSI topic id rather than a dictionary token id -- confirm that looking it up
# in `dictionary` is really what is wanted here.
for term in sorted(vec_lsi, key=lambda x: -x[1])[:25]:
    # Call form of print, matching the query cell; valid on Python 2 and 3.
    print(dictionary[term[0]])
In [406]:
# documents
# dictionary
# lsi 41.2s
# index 6.4s --rebuild on app
In [407]:
# Save the cleaned review strings under flask/app/models/.
with open('flask/app/models/documents.pkl', 'wb') as fh:  # closes the handle on exit
    pickle.dump(documents, fh)
In [408]:
# Save the gensim dictionary under flask/app/models/.
with open('flask/app/models/dictionary.pkl', 'wb') as fh:  # closes the handle on exit
    pickle.dump(dictionary, fh)
In [409]:
# Save the fitted LSI model under flask/app/models/.
with open('flask/app/models/lsi.pkl', 'wb') as fh:  # closes the handle on exit
    pickle.dump(lsi, fh)
In [410]:
# Save the bag-of-words corpus under flask/app/models/.
with open('flask/app/models/corpus.pkl', 'wb') as fh:  # closes the handle on exit
    pickle.dump(corpus, fh)
In [411]:
# Save the similarity index under flask/app/models/.
with open('flask/app/models/index.pkl', 'wb') as fh:  # closes the handle on exit
    pickle.dump(index, fh)
In [412]:
# Save the beer table without its `reviews` column.
# The column is dropped from a copy, not in place, so `beers` keeps its
# reviews and this cell can be re-run without failing on a missing column.
beers.drop('reviews', axis=1).to_pickle('flask/app/models/beer_review_df.pkl')
In [414]:
# Cache the POS-filtered token lists so the slow tagging cell can be skipped.
with open('nn_jj_reviews_only.pkl', 'wb') as fh:  # closes the handle on exit
    pickle.dump(texts, fh)
In [413]:
from sklearn.cluster import KMeans
In [ ]: