In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
In [1]:
import numpy as np
import pandas as pd
from yelp_utils import *
import yelp_utils
In [2]:
# Make sure the sparse-matrix output directory is available before the
# bag-of-words matrix is saved into it later in this notebook.
# NOTE(review): presumably the helper creates the directory when it is
# missing — confirm in yelp_utils.
yelp_utils.make_sure_path_exists(yelp_utils.YELP_DATA_SPARSE_MATRIX_DIR)
Read the CSV file generated in `yelp_datacleaning`.
In [3]:
# Path of the merged business/review/user CSV for the selected data subset.
read_filename = os.path.join(
    yelp_utils.YELP_DATA_CSV_DIR,
    ''.join(('business_review_user', data_subset, '.csv')),
)

# Load the whole file with the C parser, decoding it as UTF-8.
df_data = pd.read_csv(read_filename, encoding='utf-8', engine='c')
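Perform the following NLP preprocessing on the review text: lowercase, remove punctuation and numbers, tokenize, stem, and rejoin the tokens into a single string.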
In [4]:
# Work on a copy so df_data keeps the raw review text for the before/after
# comparison in the following cells.
df_data_preprocessed_review = df_data.copy();
# %time reports how long the column-wide apply takes.
# NOTE(review): judging by its name, the helper lowercases, strips punctuation
# and numbers, tokenizes, stems and re-joins each review — confirm in yelp_utils.
%time df_data_preprocessed_review['review_text'] = df_data_preprocessed_review['review_text'].apply( \
yelp_utils.lowercase_and_remove_punctuation_and_remove_numbers_and_tokenize_stem_and_restring)
Data before preprocessing
In [5]:
# Show one raw review (index label 1) as it was read from the CSV.
df_data['review_text'][1]
Out[5]:
Data after preprocessing
In [6]:
# Show the same review (index label 1) after the preprocessing step.
df_data_preprocessed_review['review_text'][1]
Out[6]:
In [7]:
# Bag-of-words vectorizer: word-level unigram counts, accents stripped with
# the 'unicode' method, and a vocabulary limited to 1000 features.
# tokenizer/preprocessor are passed as None explicitly, i.e. no custom
# callables are supplied.
vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    strip_accents='unicode',
    max_features=1000,
    tokenizer=None,
    preprocessor=None,
)
In [8]:
# Learn the vocabulary from the preprocessed reviews and build the
# document-term count matrix in one pass.
feature_matrix = vectorizer.fit_transform(
    df_data_preprocessed_review['review_text']
)
In [9]:
# Display the repr of the matrix returned by fit_transform as this cell's output.
feature_matrix
Out[9]:
In [10]:
# Base path (no extension) for the saved bag-of-words matrix of this subset.
# The variable name is kept as-is because the reload cell below reads it.
spare_matrix_file = os.path.join(
    yelp_utils.YELP_DATA_SPARSE_MATRIX_DIR,
    "bagWords" + data_subset,
)

# Persist the count matrix to disk through the project helper.
save_sparse_csr(spare_matrix_file, feature_matrix)
In [11]:
# Read the matrix back from disk so the next cell can compare it with the
# in-memory feature_matrix.
# NOTE(review): the ".npz" suffix is appended here but not when saving —
# presumably save_sparse_csr (or numpy underneath it) adds it; confirm.
test = load_sparse_csr(spare_matrix_file + ".npz")
In [12]:
# Round-trip check: the matrix reloaded from disk must match the in-memory one
# in its three sparse-storage arrays (data, indices, indptr) and in its shape.
# Each line prints True when the corresponding component is identical.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the former `print x` statement is a SyntaxError on Python 3.
print(np.array_equal(feature_matrix.data, test.data))
print(np.array_equal(feature_matrix.indices, test.indices))
print(np.array_equal(feature_matrix.indptr, test.indptr))
print(np.array_equal(feature_matrix.shape, test.shape))
In [ ]: