In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
In [2]:
import os
import numpy as np
import pandas as pd
from yelp_utils import *
import yelp_utils
In [3]:
from sklearn.cross_validation import KFold
from sklearn.svm import SVC
from sklearn import metrics
from scipy.sparse import csr_matrix
In [4]:
# SEED_VAL = 200;
# WORK_DIR = os.getcwd();
# data_subset = "_10Percent"
# YELP_DATA_CSV_DIR = os.path.join(WORK_DIR, "data", "csv")
# YELP_DATA_SPARSE_MATRIX_DIR = os.path.join(WORK_DIR, "data", "sparse_matrix")
# YELP_DATA_WORD_2_VEC_MODEL_DIR = os.path.join(WORK_DIR, "data", "word2vec_model")
In [5]:
# Load the 'business_review_user' CSV for the configured data subset.
# Both the CSV directory and the subset suffix come from yelp_utils.
read_filename = os.path.join(
    yelp_utils.YELP_DATA_CSV_DIR,
    'business_review_user' + yelp_utils.data_subset + ".csv",
)
df_data = pd.read_csv(read_filename, encoding='utf-8', engine='c')
In [6]:
def mySVC(feature_matrix_train, y_train, feature_matrix_test, y_test):
    '''
    Fit a default-parameter sklearn SVC on the training split and measure
    its accuracy on both the training and the test split.
    Input:
        feature_matrix_train: numpy.ndarray
        y_train: numpy.ndarray
        feature_matrix_test: numpy.ndarray
        y_test: numpy.ndarray
    Output:
        list: [train_accuracy, test_accuracy]; each accuracy is the
        str() of the value returned by metrics.accuracy_score
    '''
    model = SVC()  # no hyper-parameters are overridden
    model.fit(feature_matrix_train, y_train)

    def _accuracy_text(features, labels):
        # Accuracy of the fitted model on (features, labels), as a string.
        return str(metrics.accuracy_score(labels, model.predict(features)))

    # Train split is scored first, then the test split.
    return [
        _accuracy_text(feature_matrix_train, y_train),
        _accuracy_text(feature_matrix_test, y_test),
    ]
def myKFoldSVM(X, y, n_fold):
    '''
    Split the data into k folds, run mySVC on every train/test split and
    print the per-fold and average accuracies.
    Input:
        X: numpy.ndarray, feature matrix with one row per sample
        y: numpy.ndarray, labels indexed the same way as the rows of X
        n_fold: int, number of folds
    Output:
        None. Prints the train and test accuracy for each fold and the
        average accuracy over the n_fold folds.
    '''
    res_svc_train = []
    res_svc_test = []
    # sklearn.cross_validation API: KFold(n_samples, n_folds=...) is iterated
    # directly and yields (train_index, test_index) pairs.
    for train_index, test_index in KFold(X.shape[0], n_folds=n_fold):
        train_accuracy, test_accuracy = mySVC(
            X[train_index], y[train_index],
            X[test_index], y[test_index],
        )
        res_svc_train.append(train_accuracy)
        res_svc_test.append(test_accuracy)

    # mySVC returns the accuracies as strings; convert them for averaging.
    res_svc_train_avg = sum((float(acc) for acc in res_svc_train), 0.0) / n_fold
    res_svc_test_avg = sum((float(acc) for acc in res_svc_test), 0.0) / n_fold

    # Single-argument parenthesised print gives the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("SVC training accuracy: " + str(res_svc_train))
    print("SVC training average accuracy: " + str(res_svc_train_avg))
    print("SVC test accuracy:" + str(res_svc_test))
    print("SVC test average accuracy: " + str(res_svc_test_avg))
In [7]:
# Bag-of-words features: stored as "bagWords<subset>.npz" in the sparse-matrix directory.
spare_matrix_file = os.path.join(
    yelp_utils.YELP_DATA_SPARSE_MATRIX_DIR,
    "bagWords" + yelp_utils.data_subset,
)
feature_matrix_bag_of_words = load_sparse_csr(spare_matrix_file + ".npz")

# Labels: the review_stars column of the loaded data, as an int32 array.
y = np.array(df_data["review_stars"].copy(), dtype='int32')
In [8]:
# Time a 5-fold SVC cross-validation run on the bag-of-words features.
# .toarray() is applied to the loaded matrix before it is passed in
# (presumably a scipy sparse matrix being densified -- confirm against load_sparse_csr).
%time myKFoldSVM(feature_matrix_bag_of_words.toarray(), y, 5)
In [9]:
# spare_matrix_file = os.path.join(YELP_DATA_SPARSE_MATRIX_DIR, "bagWords_feat_add" + data_subset)
# feature_matrix_bag_of_words_and_hand_craft_features = load_sparse_csr(spare_matrix_file + ".npz")
In [10]:
# %time myKFoldSVM(feature_matrix_bag_of_words_and_hand_craft_features, y, 5)
In [11]:
# word2vec features: a comma-delimited matrix stored as
# "word2vec_feature_matrix<subset>.csv" in the word2vec model directory.
# The directory and subset suffix are read through the yelp_utils module,
# as in the other data-loading cells, rather than relying on the names
# leaked into the notebook namespace by `from yelp_utils import *`.
word2vec_feature_matrix_file = os.path.join(
    yelp_utils.YELP_DATA_WORD_2_VEC_MODEL_DIR,
    "word2vec_feature_matrix" + yelp_utils.data_subset + ".csv",
)
feature_matrix_word2vec = np.genfromtxt(word2vec_feature_matrix_file, delimiter=',')
In [12]:
# Time a 5-fold SVC cross-validation run on the word2vec feature matrix,
# using the same labels y as the bag-of-words run.
%time myKFoldSVM(feature_matrix_word2vec, y, 5)
In [13]:
# word2vec_feature_matrix_file = os.path.join(YELP_DATA_WORD_2_VEC_MODEL_DIR, "word2vec_add_feature_matrix" + data_subset+ ".csv")
# feature_matrix_word2vec_and_hand_craft_features = np.genfromtxt(word2vec_feature_matrix_file, delimiter=',')
In [14]:
# %time myKFoldSVM(feature_matrix_word2vec_and_hand_craft_features, y, 5)
In [15]:
# feature_matrix_hand_craft_features = feature_matrix_word2vec_and_hand_craft_features[:,100:104]
In [16]:
# %time myKFoldSVM(feature_matrix_hand_craft_features, y, 5)
In [ ]: