In [2]:
import json
import gensim
import numpy as np
import string
import tsne as ts # local copy in the repo
In [2]:
%matplotlib inline
In [3]:
import matplotlib.pyplot as plt
In [1]:
# We need to unzip the data file to use it:
!gunzip ../data/yelp/yelp_academic_dataset_reviews.json.gz
In [2]:
# Make sure it is there and unzipped:
!ls -al ../data/yelp/
In [4]:
## Make sure this dataset is here and unzipped.
data = []
with open("../data/yelp/yelp_academic_dataset_reviews.json") as handle:
for line in handle.readlines():
yelp = json.loads(line)
data.append(yelp)
In [6]:
len(data)
Out[6]:
In [8]:
data[0]
Out[8]:
In [10]:
revs = [d[u'text'] for d in data]
In [11]:
revs[0]
Out[11]:
"Generally, word2vec is trained using something called a skip-gram model. The skip-gram model, pictures above, attempts to use the vector representation that it learns to predict the words that appear around a given word in the corpus. Essentially, it uses the context of the word as it is used in a variety of books and other literature to derive a meaningful set of numbers. If the “context” of two words is similar, they will have similar vector representations." (Source)
"In word2vec, a distributed representation of a word is used. Take a vector with several hundred dimensions (say 1000). Each word is representated by a distribution of weights across those elements. So instead of a one-to-one mapping between an element in the vector and a word, the representation of a word is spread across all of the elements in the vector, and each element in the vector contributes to the definition of many words.
If I label the dimensions in a hypothetical word vector (there are no such pre-assigned labels in the algorithm of course), it might look a bit like this:"
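The labeled table from the original post isn't reproduced here, but the idea is easy to sketch. The dimension names and numbers below are purely made up for illustration; real word2vec dimensions carry no human-readable labels.
In [ ]:
# Purely illustrative, hand-labeled "dimensions" - actual word2vec dimensions are unlabeled.
king  = {"royalty": 0.99, "masculinity": 0.99, "femininity": 0.05, "food": 0.01}
queen = {"royalty": 0.99, "masculinity": 0.05, "femininity": 0.93, "food": 0.01}
pizza = {"royalty": 0.01, "masculinity": 0.03, "femininity": 0.02, "food": 0.98}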
So that means we can do associative logic, or analogies, with these models:
Specifically, a large enough model trained on the right kind of language (like a lot of news, or lots of books) will let you get "queen" by putting in "king", "man", and "woman" and doing vector math on them: king - man + woman = queen, as sketched below.
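Here is what that looks like with gensim's most_similar API. Our Yelp model may well lack "king" and "queen", so this sketch assumes a model trained on a more general corpus:
In [ ]:
# Hypothetical sketch: king - man + woman via gensim's most_similar.
# Assumes `model` was trained on a corpus that actually contains these words.
model.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)
# -> something like [('queen', 0.7...)] on a large news or books model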
In [2]:
""" An alternate from gensim tutorials - just use all words in the model in a rewiew. No nltk used to split."""
import re
class YelpReviews(object):
"""Iterate over sentences of all plaintext files in a directory """
SPLIT_SENTENCES = re.compile(u"[.!?:]\s+") # split sentences on these characters
def __init__(self, objs, field):
self.field = field
self.objs = objs
def __iter__(self):
for obj in self.objs:
text = obj[self.field]
for sentence in self.SPLIT_SENTENCES.split(text):
yield gensim.utils.simple_preprocess(sentence, deacc=True)
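As a quick sanity check, one step of the iterator yields a lowercased, punctuation-free token list for the first sentence of the first review (the actual tokens depend on your copy of the data):
In [ ]:
# Illustrative: peek at the first preprocessed sentence produced by the iterator.
sentences = YelpReviews(data, 'text')
print(next(iter(sentences)))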
In [3]:
## Don't do this if you already have the model file! Skip to the step after.
## Otherwise, feel free to do it from scratch.
## We pass in the full data objs and use the YelpReviews class to get the 'text' field for us.
#model = gensim.models.Word2Vec(YelpReviews(data, 'text'), min_count=2, workers=2)
#model.save('yelp_w2v_model.mod')
#model.save_word2vec_format('yelp_w2vformat.mod')
In [3]:
# If you already have a model file, load it here:
model = gensim.models.Word2Vec.load_word2vec_format('../data/yelp/yelp_w2vformat.mod')
In [9]:
model.most_similar(positive=["chicken", "waffles"], topn=20)
Out[9]:
In [7]:
model.most_similar("waitress")
Out[7]:
In [9]:
model.vocab.items()[0:5]
Out[9]:
In [10]:
model.most_similar(['good', 'pizza'])
Out[10]:
In [11]:
model.most_similar_cosmul(['good', 'pizza']) # less susceptible to extreme outliers
Out[11]:
In [12]:
model.most_similar(['dog'])
Out[12]:
In [13]:
model.most_similar(['salon'])
Out[13]:
In [15]:
model.most_similar(positive=['donuts', 'nypd'], negative=['fireman'])
Out[15]:
In [15]:
import nltk
nltk.data.path = ['../nltk_data']
from nltk.corpus import stopwords
english_stops = stopwords.words('english')
In [13]:
revs[0]
Out[13]:
In [16]:
tokens = [nltk.word_tokenize(rev) for rev in revs] # this takes a long time. don't run unless you're sure.
In [17]:
mystops = english_stops + [u"n't", u'...', u"'ve"]
In [18]:
def clean_tokens(tokens, stoplist):
    """ Lowercases, takes out punct and stopwords and short strings """
    return [token.lower() for token in tokens if (token not in string.punctuation) and
            (token.lower() not in stoplist) and len(token) > 2]
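A quick illustration of what clean_tokens does, on a made-up token list:
In [ ]:
# Illustrative example with a made-up token list.
clean_tokens(['The', 'fries', 'were', 'SO', 'good', ',', 'but', 'pricey', '!'], mystops)
# -> ['fries', 'good', 'pricey']  (stopwords, punctuation, and short tokens removed)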
In [19]:
clean = [clean_tokens(tok, mystops) for tok in tokens]
In [20]:
from nltk import Text
allclean = [y for x in clean for y in x] # flatten the list of lists
cleantext = Text(allclean)
In [21]:
mostcommon = cleantext.vocab().most_common()[0:1500]
mostcommon_words = [word[0] for word in mostcommon]
In [23]:
mostcommon_words[0:12]
Out[23]:
In [24]:
# Helper to pull out the vectors for t-SNE
def get_vectors(words, model):
    # requires the model to be in the binary format, not gensim's
    word_vectors = []
    word_labels = []
    for word in words:
        if word in model:
            word_vectors.append(model[word])
            word_labels.append(word)
    return word_vectors, word_labels
In [25]:
mymodel = gensim.models.Word2Vec.load_word2vec_format('../data/yelp/yelp_w2vformat.mod')
vectors, labels = get_vectors(mostcommon_words, mymodel)
In [26]:
# should be same as top words above
labels[:12]
Out[26]:
In [27]:
res = ts.tsne(np.asfarray(vectors, dtype='float'), 2, 50, 20)  # assuming the usual tsne(X, no_dims, initial_dims, perplexity) signature: 2-D output, 50 initial PCA dims, perplexity 20
The "AFINN-111.txt" file is another sentiment file.
In [16]:
from collections import defaultdict
sentiment = defaultdict(int)
with open('../data/sentiment_wordlists/AFINN-111.txt') as handle:
    for line in handle.readlines():
        word = line.split('\t')[0]
        polarity = line.split('\t')[1]
        sentiment[word] = int(polarity)
In [17]:
sentiment['pho']
Out[17]:
In [18]:
sentiment['good']
Out[18]:
In [19]:
sentiment['angry']
Out[19]:
In [ ]:
sentiment['pizza']
In [201]:
def render_json(vectors, labels, filename):
    output = []
    vectors = np.array(vectors)
    for i in range(len(vectors)):
        new_hash = {}
        new_hash["word"] = str(labels[i])
        new_hash["x"] = int(vectors[i][0])
        new_hash["y"] = int(vectors[i][1])
        new_hash["sentiment"] = sentiment[str(labels[i])]
        output.append(new_hash)
    with open(filename, 'w') as handle:
        json.dump(output, handle)
In [210]:
render_json(res, labels, "../outputdata/yelp.json")
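Each record in the output file is a small dict the HTML page can consume; one entry looks roughly like this (the coordinates and score here are made up):
In [ ]:
# Illustrative shape of one record in ../outputdata/yelp.json:
# {"word": "pizza", "x": 12, "y": -3, "sentiment": 0}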
In [33]:
plt.figure(figsize=(15, 15))
plt.scatter(res[:,0], res[:,1], s=10, color='gray', alpha=0.2)
Out[33]:
If you open the file tsne_yelp.html, you can interact with this layout and see which word each point corresponds to.
In [ ]: