In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
In [1]:
import os
import string
import numpy as np
import pandas as pd
from yelp_utils import *
import yelp_utils
In [2]:
from bs4 import BeautifulSoup
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
In [3]:
# SEED_VAL = 200;
# WORK_DIR = os.getcwd();
# data_subset = "_10Percent"
# YELP_DATA_CSV_DIR = os.path.join(WORK_DIR, "data", "csv")
# YELP_DATA_WORD_2_VEC_MODEL_DIR = os.path.join(WORK_DIR, "data", "word2vec_model")
Read the data
In [4]:
yelp_utils.make_sure_path_exists(yelp_utils.YELP_DATA_WORD_2_VEC_MODEL_DIR)
In [5]:
read_filename = os.path.join(yelp_utils.YELP_DATA_CSV_DIR, 'business_review_user' + yelp_utils.data_subset + '.csv')
df_data = pd.read_csv(read_filename, engine='c', encoding='utf-8')
Raw text
In [6]:
df_data.review_text[1]
Out[6]:
In [7]:
def lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem(s):
    '''
    Function to convert a string to tokens and perform the following operations:
    1. remove numbers
    2. lowercase
    3. remove stopwords as per the NLTK stopword list
    4. stem the words using the Porter stemmer
    Input: String
    Output: Token list
    '''
    s = yelp_utils.remove_numbers_in_string(s)
    s = yelp_utils.lowercase_remove_punctuation(s)
    s = yelp_utils.remove_stopwords(s)
    token_list = nltk.word_tokenize(s)
    #token_list = filter_out_more_stopwords(token_list)
    token_list = yelp_utils.stem_token_list(token_list)
    return token_list
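As an illustrative aside (not part of the original run), the pipeline can be tried on a short literal string; the number and the stopwords should drop out and the remaining words get Porter-stemmed, though the exact tokens depend on the yelp_utils helpers:
print lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem(u"The 2 pizzas were absolutely amazing")
# roughly [u'pizza', u'absolut', u'amaz']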
In [8]:
print lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem(df_data.review_text[1])
In [9]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Define a function to split a review into parsed sentences
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words.
    # source: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call the tokenizing/stemming function above to get a list of words
            sentences.append(lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem(raw_sentence))
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
In [10]:
sentences = []  # Initialize an empty list of sentences
print "Parsing sentences from training set"
for review in df_data["review_text"]:
    sentences += review_to_sentences(review, tokenizer)
In [11]:
print sentences[0]
In [12]:
# Import the built-in logging module and configure it so that Word2Vec
# creates nice output messages
import gensim, logging
from gensim.models import Word2Vec
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [13]:
# Set values for various parameters
num_features = 300 # Word vector dimensionality
min_word_count = 10 # Minimum word count
num_workers = 8 # Number of threads to run in parallel
context = 10 # Context window size
downsampling = 1e-3 # Downsample setting for frequent words
# Initialize and train the model (this will take some time)
from gensim.models import word2vec
model_file = os.path.join(YELP_DATA_WORD_2_VEC_MODEL_DIR, str(num_features) + 'features_' + str(min_word_count) + 'minwords_' + str(context) + 'context'+ data_subset)
if not os.path.isfile(model_file):
    print "Training model..."
    # It can be helpful to create a meaningful model name and
    # save the model for later use. You can load it later using Word2Vec.load()
    model = word2vec.Word2Vec(sentences, workers=num_workers, \
                              size=num_features, min_count=min_word_count, \
                              window=context, sample=downsampling)
    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)
    model.save(model_file)
else:
    print "Loading existing model"
    model = Word2Vec.load(model_file)
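As a quick sanity check (an aside, not part of the original run), the vocabulary size retained after the min_word_count cutoff can be inspected via the same index2word attribute used further below:
print "Vocabulary size: %d words" % len(model.index2word)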
In [14]:
model.doesnt_match("amazing delightful bad".split())
Out[14]:
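Other standard gensim similarity queries can be run on the trained model in the same way. A small sketch (output will vary with the training subset, and assumes "good" and "great" survived the min_word_count cutoff):
print model.most_similar("good", topn=5)   # nearest neighbours in the embedding space
print model.similarity("good", "great")    # cosine similarity between the two word vectors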
In [15]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    # source: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
    featureVec = np.zeros((num_features,), dtype="float32")
    #
    nwords = 0.
    #
    # Index2word is a list that contains the names of the words in
    # the model's vocabulary. Convert it to a set, for speed
    index2word_set = set(model.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocabulary, add its feature vector to the total
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])
    #
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate
    # the average feature vector for each one and return a 2D numpy array
    #
    # Initialize a counter
    # source: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
    counter = 0
    #
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    #
    # Loop through the reviews
    for review in reviews:
        #
        # Print a status message every 1000th review
        if counter % 1000 == 0:
            print "Review %d of %d" % (counter, len(reviews))
        #
        # Call the function (defined above) that makes average feature vectors
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        #
        # Increment the counter
        counter = counter + 1
    return reviewFeatureVecs
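Before running over the whole corpus, makeFeatureVec can be sanity-checked on a single tokenized sentence (a hedged sketch using objects already defined above; note that a word list with no in-vocabulary words would give nwords = 0 and hence a NaN vector):
vec = makeFeatureVec(sentences[0], model, num_features)
print vec.shape   # (num_features,) = (300,)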
In [16]:
# ****************************************************************
# Calculate average feature vectors for the training reviews,
# using the functions we defined above. Note that stop word removal
# is already applied inside the tokenizing function.
clean_train_reviews = []
for review in df_data["review_text"]:
    clean_train_reviews.append(lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem(review))
%time trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )
In [17]:
trainDataVecs
Out[17]:
In [18]:
trainDataVecs.shape
Out[18]:
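Since makeFeatureVec divides by the in-vocabulary word count, any review whose tokens all fall outside the model vocabulary would come out as a NaN row. A quick hedged check before saving:
print np.isnan(trainDataVecs).any()   # True would indicate at least one such review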
In [19]:
word2vec_feature_matrix_file = os.path.join(YELP_DATA_WORD_2_VEC_MODEL_DIR, 'word2vec_feature_matrix'+data_subset+'.csv')
np.savetxt(word2vec_feature_matrix_file, trainDataVecs, delimiter=",")
In [20]:
test = np.genfromtxt(
    word2vec_feature_matrix_file,  # file name
    delimiter=',')                 # column delimiter
In [22]:
np.array_equal(trainDataVecs, test)
Out[22]:
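One caveat worth noting (an aside, not from the original run): np.genfromtxt returns float64 while trainDataVecs is float32. The value comparison above is unaffected, but downstream code expecting the original dtype can cast the reloaded matrix back:
test = test.astype(np.float32)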
In [ ]: