In [1]:
    
#!/usr/bin/env python
# -*- coding: utf-8 -*-
    
In [2]:
    
import os
import numpy as np
import pandas as pd
from yelp_utils import *
import yelp_utils
    
In [3]:
    
from sklearn.cross_validation import KFold
from sklearn.svm import SVC
from sklearn import metrics
from scipy.sparse import csr_matrix
    
In [4]:
    
# SEED_VAL = 200;
# WORK_DIR = os.getcwd();
# data_subset = "_10Percent"
# YELP_DATA_CSV_DIR = os.path.join(WORK_DIR, "data", "csv")
# YELP_DATA_SPARSE_MATRIX_DIR = os.path.join(WORK_DIR, "data", "sparse_matrix")
# YELP_DATA_WORD_2_VEC_MODEL_DIR = os.path.join(WORK_DIR, "data", "word2vec_model")
    
In [5]:
    
# Load the 'business_review_user' CSV for the data subset configured in yelp_utils.
read_filename = os.path.join(
    yelp_utils.YELP_DATA_CSV_DIR,
    'business_review_user' + yelp_utils.data_subset + ".csv",
)
df_data = pd.read_csv(read_filename, encoding='utf-8', engine='c')
    
In [6]:
    
def mySVC(feature_matrix_train, y_train, feature_matrix_test, y_test):
    '''
    Fit a default-parameter sklearn SVC on the training split and report its
    accuracy on both the training split and the test split.
    Input:
        feature_matrix_train: training features, passed to SVC.fit / SVC.predict
        y_train: training labels
        feature_matrix_test: test features, passed to SVC.predict
        y_test: test labels
    Output:
        list: [train_accuracy, test_accuracy], each being the str() of the
              value returned by metrics.accuracy_score
    '''
    classifier = SVC()
    classifier.fit(feature_matrix_train, y_train)

    def _accuracy_text(features, labels):
        # Predict on the given split and return its accuracy as a string,
        # which is the form callers of mySVC already expect.
        predicted = classifier.predict(features)
        return str(metrics.accuracy_score(labels, predicted))

    # The training split is scored first, then the test split.
    return [
        _accuracy_text(feature_matrix_train, y_train),
        _accuracy_text(feature_matrix_test, y_test),
    ]
def myKFoldSVM(X, y, n_fold):
    '''
    Run k-fold cross-validation with mySVC and print the per-fold and average
    train/test accuracy.

    The rows of X are split with KFold(X.shape[0], n_folds=n_fold); for every
    fold, mySVC is fitted on the training rows and scored on both the training
    rows and the held-out rows.
    Input:
        X: feature matrix exposing .shape and supporting row selection with an
           index array (e.g. numpy.ndarray)
        y: label array with one entry per row of X, indexable with an index
           array (e.g. numpy.ndarray)
        n_fold: int, number of folds
    Output:
        None. Prints train and test accuracy for each fold and the average
        accuracy over the folds.
    Raises:
        ValueError: if y does not have exactly one entry per row of X.
    '''
    n_samples = X.shape[0]
    if len(y) != n_samples:
        # Fail early with a clear message instead of an IndexError (or a
        # silently misaligned evaluation) inside the fold loop.
        raise ValueError(
            "X has %d rows but y has %d labels" % (n_samples, len(y))
        )

    res_svc_train = []
    res_svc_test = []
    # KFold is used through the (n, n_folds=...) constructor and iterated
    # directly, yielding (train_index, test_index) pairs.
    for train_index, test_index in KFold(n_samples, n_folds=n_fold):
        res_svc = mySVC(X[train_index], y[train_index],
                        X[test_index], y[test_index])
        res_svc_train.append(res_svc[0])
        res_svc_test.append(res_svc[1])

    # mySVC reports accuracies as strings; they are converted to float only
    # for averaging so the printed per-fold lists keep their original form.
    res_svc_train_avg = sum((float(acc) for acc in res_svc_train), 0.0) / n_fold
    res_svc_test_avg = sum((float(acc) for acc in res_svc_test), 0.0) / n_fold

    # Single-argument parenthesised print produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("SVC training accuracy: " + str(res_svc_train))
    print("SVC training average accuracy: " + str(res_svc_train_avg))
    print("SVC test accuracy:" + str(res_svc_test))
    print("SVC test average accuracy: " + str(res_svc_test_avg))
    
In [7]:
    
# Locate and load the bag-of-words feature matrix saved for this data subset.
spare_matrix_file = os.path.join(
    yelp_utils.YELP_DATA_SPARSE_MATRIX_DIR,
    "bagWords" + yelp_utils.data_subset,
)
feature_matrix_bag_of_words = load_sparse_csr(spare_matrix_file + ".npz")

# Labels: a copy of the review_stars column converted to an int32 array.
y = np.array(
    df_data.review_stars.copy(),
    dtype='int32',
)
    
In [8]:
    
# Time a 5-fold SVC evaluation on the bag-of-words features. .toarray() converts
# the loaded matrix (presumably a scipy CSR matrix, per load_sparse_csr -- confirm)
# to a dense array before it is passed in.
%time myKFoldSVM(feature_matrix_bag_of_words.toarray(), y, 5)
    
    
In [9]:
    
# spare_matrix_file = os.path.join(YELP_DATA_SPARSE_MATRIX_DIR, "bagWords_feat_add" + data_subset)
# feature_matrix_bag_of_words_and_hand_craft_features = load_sparse_csr(spare_matrix_file + ".npz")
    
In [10]:
    
# %time myKFoldSVM(feature_matrix_bag_of_words_and_hand_craft_features, y, 5)
    
In [11]:
    
# Load the word2vec feature matrix for this data subset from its CSV file.
# The directory and subset suffix are read through the yelp_utils module, as in
# the other loading cells, instead of relying on bare names that only exist
# because of `from yelp_utils import *`.
word2vec_feature_matrix_file = os.path.join(
    yelp_utils.YELP_DATA_WORD_2_VEC_MODEL_DIR,
    "word2vec_feature_matrix" + yelp_utils.data_subset + ".csv",
)
feature_matrix_word2vec = np.genfromtxt(word2vec_feature_matrix_file, delimiter=',')
    
In [12]:
    
# Time a 5-fold SVC evaluation on the word2vec feature matrix, using the
# review_stars labels held in y.
%time myKFoldSVM(feature_matrix_word2vec, y, 5)
    
    
In [13]:
    
# word2vec_feature_matrix_file = os.path.join(YELP_DATA_WORD_2_VEC_MODEL_DIR, "word2vec_add_feature_matrix" + data_subset+ ".csv")
# feature_matrix_word2vec_and_hand_craft_features = np.genfromtxt(word2vec_feature_matrix_file, delimiter=',')
    
In [14]:
    
# %time myKFoldSVM(feature_matrix_word2vec_and_hand_craft_features, y, 5)
    
In [15]:
    
# feature_matrix_hand_craft_features = feature_matrix_word2vec_and_hand_craft_features[:,100:104]
    
In [16]:
    
# %time myKFoldSVM(feature_matrix_hand_craft_features, y, 5)
    
In [ ]: