In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

In [2]:
import os
import numpy as np
import pandas as pd
from yelp_utils import *
import yelp_utils

In [3]:
from sklearn.cross_validation import KFold
from sklearn.svm import SVC
from sklearn import metrics
from scipy.sparse import csr_matrix

In [4]:
# SEED_VAL = 200;
# WORK_DIR = os.getcwd();
# data_subset = "_10Percent"
# YELP_DATA_CSV_DIR = os.path.join(WORK_DIR, "data", "csv")
# YELP_DATA_SPARSE_MATRIX_DIR = os.path.join(WORK_DIR, "data", "sparse_matrix")
# YELP_DATA_WORD_2_VEC_MODEL_DIR = os.path.join(WORK_DIR, "data", "word2vec_model")

In [5]:
# Load the 'business_review_user' CSV for the data subset configured in yelp_utils
read_filename = os.path.join(
    yelp_utils.YELP_DATA_CSV_DIR,
    'business_review_user' + yelp_utils.data_subset + ".csv")
df_data = pd.read_csv(read_filename, encoding='utf-8', engine='c')

In [6]:
def mySVC(feature_matrix_train, y_train, feature_matrix_test, y_test):
    '''
    Fit a default-parameter sklearn SVC on the training split and score it
    on both the training and the test split.
    Input:
        feature_matrix_train: feature matrix passed to SVC.fit / SVC.predict
        y_train: labels for feature_matrix_train
        feature_matrix_test: feature matrix passed to SVC.predict
        y_test: labels for feature_matrix_test
    Output:
        list: [train_accuracy, test_accuracy], each accuracy being the
              str() of the value returned by metrics.accuracy_score
    '''
    model = SVC()
    model.fit(feature_matrix_train, y_train)

    # Score the training split first, then the test split, with the same model.
    splits = (
        (feature_matrix_train, y_train),
        (feature_matrix_test, y_test),
    )
    return [
        str(metrics.accuracy_score(labels, model.predict(features)))
        for features, labels in splits
    ]

def myKFoldSVM(X, y, n_fold):
    '''
    Split the data into n_fold folds, run mySVC on every train/test split
    and print the per-fold and average accuracies.
    Input:
        X: feature matrix; X.shape[0] is used as the number of samples and
           rows are selected with X[index_array]
        y: label array, indexed with the same index arrays as X
        n_fold: int, number of folds
    Output:
        None. Prints the train and test accuracy of every fold (as the
        strings returned by mySVC) followed by their averages.
    '''
    res_svc_train = []
    res_svc_test = []

    # NOTE(review): this is the legacy sklearn.cross_validation.KFold signature
    # (sample count + n_folds, iterated directly). Newer scikit-learn releases
    # expose KFold through sklearn.model_selection with a different API.
    # No shuffle argument is passed, so the library default applies.
    kf = KFold(X.shape[0], n_folds=n_fold)
    for train_index, test_index in kf:
        res_svc = mySVC(X[train_index], y[train_index],
                        X[test_index], y[test_index])
        res_svc_train.append(res_svc[0])
        res_svc_test.append(res_svc[1])

    # mySVC returns the accuracies as strings; convert them back for averaging.
    res_svc_train_avg = sum(float(acc) for acc in res_svc_train) / n_fold
    res_svc_test_avg = sum(float(acc) for acc in res_svc_test) / n_fold

    # Single-argument parenthesised prints behave identically on Python 2
    # and are also valid Python 3.
    print("SVC training accuracy: " + str(res_svc_train))
    print("SVC training average accuracy: " + str(res_svc_train_avg))
    print("SVC test accuracy:" + str(res_svc_test))
    print("SVC test average accuracy: " + str(res_svc_test_avg))

Bag of words


In [7]:
# Bag-of-words feature matrix, read from the .npz file via load_sparse_csr
spare_matrix_file = os.path.join(
    yelp_utils.YELP_DATA_SPARSE_MATRIX_DIR,
    "bagWords" + yelp_utils.data_subset)
feature_matrix_bag_of_words = load_sparse_csr(spare_matrix_file + ".npz")

# Target labels: the review_stars column as an int32 array
y = np.array(df_data['review_stars'].copy(), dtype='int32')

In [8]:
# Time a 5-fold SVC run on the bag-of-words features.
# .toarray() is called on the loaded matrix first (presumably to densify a
# scipy sparse matrix -- confirm against load_sparse_csr's return type).
%time myKFoldSVM(feature_matrix_bag_of_words.toarray(), y, 5)


SVC training accuracy: ['0.45267958951', '0.454389965792', '0.456100342075', '0.45811965812', '0.454700854701']
SVC training average accuracy: 0.45519808204
SVC test accuracy:['0.441913439636', '0.414578587699', '0.396355353075', '0.420091324201', '0.442922374429']
SVC test average accuracy: 0.423172215808
Wall time: 37.5 s

Bag of words + hand-crafted features


In [9]:
# spare_matrix_file = os.path.join(YELP_DATA_SPARSE_MATRIX_DIR, "bagWords_feat_add" + data_subset)
# feature_matrix_bag_of_words_and_hand_craft_features = load_sparse_csr(spare_matrix_file + ".npz")

In [10]:
# %time myKFoldSVM(feature_matrix_bag_of_words_and_hand_craft_features, y, 5)

Word embedding


In [11]:
# Word2vec feature matrix, read from a comma-separated file.
# The directory and subset suffix are taken from yelp_utils explicitly, like
# the other data-loading cells: the local config cell is commented out, so the
# bare names would only resolve through the wildcard import.
word2vec_feature_matrix_file = os.path.join(
    yelp_utils.YELP_DATA_WORD_2_VEC_MODEL_DIR,
    "word2vec_feature_matrix" + yelp_utils.data_subset + ".csv")
feature_matrix_word2vec = np.genfromtxt(word2vec_feature_matrix_file, delimiter=',')

In [12]:
# Time a 5-fold SVC run on the word2vec feature matrix loaded above.
%time myKFoldSVM(feature_matrix_word2vec, y, 5)


SVC training accuracy: ['0.398517673888', '0.405359179019', '0.40763968073', '0.403418803419', '0.40056980057']
SVC training average accuracy: 0.403101027525
SVC test accuracy:['0.421412300683', '0.394077448747', '0.384965831435', '0.401826484018', '0.413242009132']
SVC test average accuracy: 0.403104814803
Wall time: 10.5 s

Word embedding + hand-crafted features


In [13]:
# word2vec_feature_matrix_file = os.path.join(YELP_DATA_WORD_2_VEC_MODEL_DIR, "word2vec_add_feature_matrix" + data_subset+ ".csv")
# feature_matrix_word2vec_and_hand_craft_features = np.genfromtxt(word2vec_feature_matrix_file, delimiter=',')

In [14]:
# %time myKFoldSVM(feature_matrix_word2vec_and_hand_craft_features, y, 5)

Hand-crafted features


In [15]:
# feature_matrix_hand_craft_features = feature_matrix_word2vec_and_hand_craft_features[:,100:104]

In [16]:
# %time myKFoldSVM(feature_matrix_hand_craft_features, y, 5)

In [ ]: