In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

In [1]:
import os
import string
import numpy as np
import pandas as pd
from yelp_utils import *
import yelp_utils

In [2]:
from bs4 import BeautifulSoup
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [3]:
# SEED_VAL = 200;
# WORK_DIR = os.getcwd();
# data_subset = "_10Percent"
# YELP_DATA_CSV_DIR = os.path.join(WORK_DIR, "data", "csv")
# YELP_DATA_WORD_2_VEC_MODEL_DIR = os.path.join(WORK_DIR, "data", "word2vec_model")

Read the data


In [4]:
yelp_utils.make_sure_path_exists(yelp_utils.YELP_DATA_WORD_2_VEC_MODEL_DIR)

In [5]:
read_filename = os.path.join(yelp_utils.YELP_DATA_CSV_DIR, 'business_review_user' + yelp_utils.data_subset + '.csv')
df_data = pd.read_csv(read_filename, engine='c', encoding='utf-8')

Raw text


In [6]:
df_data.review_text[1]


Out[6]:
u"Great outdoor patio dining area. Great happy hour. Great service.\r\n\r\nOutdoor patio dining has a beautiful mesquite tree for ambiance and blocking out the sun while the center fireplace keeps you warm. \r\n\r\nWe had:\r\nQueso Skillet with warm tortilla chips - amazing!\r\nMac N Cheese with Chorizo - fabulous! one of the best mac n cheeses I've ever had!\r\nCarne Asada on a Potato Pancake - was ok. Sounded excellent, tasted decent.\r\n\r\nFriendly and good food. But the ambiance really puts it over the top as a great dining experience. I'd be back with a group of friends to lounge, play cornsack or bocce ball during happy hour."

Processing text


In [7]:
def lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem(s):
    '''
    Function to convert a string to tokens and perform the following operations:
    1. remove numbers
    2. lowercase and remove punctuation
    3. remove stopwords as per the NLTK stopword list
    4. tokenize
    5. stem the tokens using the Porter stemmer
    Input: String
    Output: Token list
    '''
    s = yelp_utils.remove_numbers_in_string(s)
    s = yelp_utils.lowercase_remove_punctuation(s)
    s = yelp_utils.remove_stopwords(s)
    token_list = nltk.word_tokenize(s)
    #token_list = filter_out_more_stopwords(token_list)
    token_list = yelp_utils.stem_token_list(token_list)
    return token_list

In [8]:
print lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem(df_data.review_text[1])


[u'great', u'outdoor', u'patio', u'dine', u'area', u'great', u'happi', u'hour', u'great', u'servic', u'outdoor', u'patio', u'dine', u'beauti', u'mesquit', u'tree', u'ambianc', u'block', u'sun', u'center', u'fireplac', u'keep', u'warm', u'queso', u'skillet', u'warm', u'tortilla', u'chip', u'amaz', u'mac', u'n', u'chees', u'chorizo', u'fabul', u'one', u'best', u'mac', u'n', u'chees', u'ive', u'ever', u'carn', u'asada', u'potato', u'pancak', u'ok', u'sound', u'excel', u'tast', u'decent', u'friendli', u'good', u'food', u'ambianc', u'realli', u'put', u'top', u'great', u'dine', u'experi', u'id', u'back', u'group', u'friend', u'loung', u'play', u'cornsack', u'bocc', u'ball', u'happi', u'hour']
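The truncated forms above ('dine', 'happi', 'servic') are Porter stems. A standalone illustration with the PorterStemmer imported earlier (yelp_utils.stem_token_list presumably wraps the same stemmer):

stemmer = PorterStemmer()
print [stemmer.stem(w) for w in ['dining', 'happy', 'service', 'ambiance']]
# expected along the lines of: ['dine', 'happi', 'servic', 'ambianc']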

In [9]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    # source: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem(raw_sentence))
    
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences

In [10]:
sentences = []  # Initialize an empty list of sentences

print "Parsing sentences from training set"
for review in df_data["review_text"]:
    sentences += review_to_sentences(review, tokenizer)


Parsing sentences from training set

In [11]:
print sentences[0]


[u'locat', u'chevron', u'ga', u'station', u'lot']

Training word2vec model


In [12]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import gensim, logging
from gensim.models import Word2Vec
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [13]:
# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 10   # Minimum word count                        
num_workers = 8       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec

model_file = os.path.join(YELP_DATA_WORD_2_VEC_MODEL_DIR, str(num_features) + 'features_' + str(min_word_count) + 'minwords_' + str(context) + 'context'+ data_subset)
if not os.path.isfile(model_file):
    print "Training model..."
    # It can be helpful to create a meaningful model name and
    # save the model for later use. You can load it later using Word2Vec.load()
    model = word2vec.Word2Vec(sentences, workers=num_workers,
                              size=num_features, min_count=min_word_count,
                              window=context, sample=downsampling)
    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)
    model.save(model_file)
else:
    print "Loading existing model"
    model = Word2Vec.load(model_file)


Training model...
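Before querying the model, it is worth checking how much vocabulary survived the min_word_count cutoff. A quick inspection sketch, using the same pre-1.0 gensim index2word attribute that the averaging code below relies on:

print "Vocabulary size: %d words" % len(model.index2word)
print "Vector shape per word:", model[model.index2word[0]].shape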

In [14]:
model.doesnt_match("amazing delightful bad".split())


Out[14]:
'bad'
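doesnt_match is one sanity check; most_similar is another. A hedged example, assuming the stem 'food' cleared the min_count threshold (it appears in the tokenized review above); the actual neighbours depend on the training run:

# Nearest neighbours by cosine similarity
print model.most_similar('food', topn=5)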

In [15]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
#     source: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocabulary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
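    # (if none of the words are in the vocabulary, nwords stays 0 and the
    # division below yields a NaN vector; downstream code may want to
    # guard against that)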
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter (an int, so it can index the array below
    # without raising a DeprecationWarning)
    # source: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
    counter = 0
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
        # Print a status message every 1000th review
        if counter % 1000 == 0:
            print "Review %d of %d" % (counter, len(reviews))
        # 
        # Call the function (defined above) that makes average feature vectors
        reviewFeatureVecs[counter] = makeFeatureVec(review, model,
                                                    num_features)
        #
        # Increment the counter
        counter = counter + 1
    return reviewFeatureVecs
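As a quick usage check of makeFeatureVec on a single review before vectorizing the whole corpus (a sketch reusing the cleaning function defined earlier):

tokens = lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem(df_data.review_text[1])
print makeFeatureVec(tokens, model, num_features).shape   # expected: (300,)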

In [16]:
# ****************************************************************
# Calculate average feature vectors for the training set,
# using the functions we defined above. Note that the cleaning
# pipeline already removes stop words.

clean_train_reviews = []
for review in df_data["review_text"]:
    clean_train_reviews.append( lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem( review))

%time trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )


Review 0 of 2193
Review 1000 of 2193
Review 2000 of 2193
Wall time: 423 ms

In [17]:
trainDataVecs


Out[17]:
array([[ 0.03380506, -0.03248994, -0.06748378, ..., -0.09760942,
        -0.00110296,  0.0182657 ],
       [ 0.02240651, -0.0220686 , -0.09725758, ..., -0.07256958,
         0.01236789,  0.02137215],
       [ 0.03171084, -0.03008272, -0.08868304, ..., -0.08954675,
         0.01328464,  0.02270603],
       ..., 
       [ 0.03102988, -0.03029333, -0.06572317, ..., -0.08929857,
         0.01131339,  0.02390043],
       [ 0.03002388, -0.02800074, -0.0602631 , ..., -0.08041041,
         0.0128369 ,  0.02611707],
       [ 0.02659599, -0.03020945, -0.07441082, ..., -0.07671879,
         0.01343209,  0.02648316]], dtype=float32)

In [18]:
trainDataVecs.shape


Out[18]:
(2193L, 300L)
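Because makeFeatureVec divides by the in-vocabulary word count, a review with no surviving vocabulary words would produce a NaN row. A quick check before persisting the matrix:

print "Rows containing NaN:", np.isnan(trainDataVecs).any(axis=1).sum()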

In [19]:
word2vec_feature_matrix_file = os.path.join(YELP_DATA_WORD_2_VEC_MODEL_DIR, 'word2vec_feature_matrix'+data_subset+'.csv')
np.savetxt(word2vec_feature_matrix_file, trainDataVecs, delimiter=",")

In [20]:
test = np.genfromtxt(
    word2vec_feature_matrix_file,           # file name
    delimiter=',')           # column delimiter

In [22]:
np.array_equal(trainDataVecs, test)


Out[22]:
True
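np.array_equal demands exact equality, which holds here because np.savetxt writes enough digits by default to round-trip this float32 data; for float round-trips in general, np.allclose is the more forgiving comparison:

print np.allclose(trainDataVecs, test)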

In [ ]: