In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

In [1]:
import numpy as np
import pandas as pd
from yelp_utils import *
import yelp_utils

In [2]:

Read the CSV file generated by the `yelp_datacleaning` notebook.

In [3]:
# Path to the cleaned CSV produced by the yelp_datacleaning notebook.
read_filename = os.path.join(
    yelp_utils.YELP_DATA_CSV_DIR,
    'business_review_user{0}.csv'.format(data_subset))
# engine='c' uses the faster C parser; reviews are UTF-8 encoded.
df_data = pd.read_csv(read_filename, engine='c', encoding='utf-8')

Preprocessing

Perform the following NLP tasks on the data:

  1. Lower case
  2. Remove punctuation
  3. Remove numbers
  4. Remove stop words
  5. Stem the words using PorterStemmer

In [4]:
df_data_preprocessed_review = df_data.copy();
%time df_data_preprocessed_review['review_text'] = df_data_preprocessed_review['review_text'].apply( \

Wall time: 2.95 s

Data before preprocessing

In [5]:

u"Great outdoor patio dining area. Great happy hour. Great service.\r\n\r\nOutdoor patio dining has a beautiful mesquite tree for ambiance and blocking out the sun while the center fireplace keeps you warm. \r\n\r\nWe had:\r\nQueso Skillet with warm tortilla chips - amazing!\r\nMac N Cheese with Chorizo - fabulous! one of the best mac n cheeses I've ever had!\r\nCarne Asada on a Potato Pancake - was ok. Sounded excellent, tasted decent.\r\n\r\nFriendly and good food. But the ambiance really puts it over the top as a great dining experience. I'd be back with a group of friends to lounge, play cornsack or bocce ball during happy hour."

Data after preprocessing

In [6]:

u'great outdoor patio dine area great happi hour great servic outdoor patio dine beauti mesquit tree ambianc block sun center fireplac keep warm queso skillet warm tortilla chip amaz mac n chees chorizo fabul one best mac n chees ive ever carn asada potato pancak ok sound excel tast decent friendli good food ambianc realli put top great dine experi id back group friend loung play cornsack bocc ball happi hour'

Generating bag of words

In [7]:
# Bag-of-words configuration: unigram token counts, accents stripped,
# vocabulary capped at the 1000 most frequent terms. Tokenizer and
# preprocessor are left as None because the reviews were already
# preprocessed above.
bow_params = dict(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    ngram_range=(1, 1),
    strip_accents='unicode',
    max_features=1000,
)
vectorizer = CountVectorizer(**bow_params)

In [8]:
feature_matrix = vectorizer.fit_transform(df_data_preprocessed_review.review_text)

In [9]:

<2193x1000 sparse matrix of type '<type 'numpy.int64'>'
	with 80889 stored elements in Compressed Sparse Row format>

In [10]:
# Persist the bag-of-words matrix so downstream notebooks can reload it
# without re-vectorizing. (Variable name kept as-is: later cells reuse it.)
spare_matrix_file = os.path.join(
    YELP_DATA_SPARSE_MATRIX_DIR, 'bagWords{0}'.format(data_subset))
save_sparse_csr(spare_matrix_file, feature_matrix)

In [11]:
test = load_sparse_csr(spare_matrix_file + ".npz")

In [12]:
print np.array_equal(,
print np.array_equal(feature_matrix.indices, test.indices)
print np.array_equal(feature_matrix.indptr, test.indptr)
print np.array_equal(feature_matrix.shape, test.shape)


In [ ]: