In [1]:
# @InProceedings{maas-EtAl:2011:ACL-HLT2011,
#   author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
#   title     = {Learning Word Vectors for Sentiment Analysis},
#   booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
#   month     = {June},
#   year      = {2011},
#   address   = {Portland, Oregon, USA},
#   publisher = {Association for Computational Linguistics},
#   pages     = {142--150},
#   url       = {http://www.aclweb.org/anthology/P11-1015}
# }

In [2]:
print("Importing libraries for machine learining")
# Import all libraries for machine learning
%matplotlib inline
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from time import time 
from mlxtend.plotting import plot_decision_regions
from sklearn.datasets import make_moons
from sklearn.svm import LinearSVC, SVC
from sklearn.utils import shuffle
from sklearn.preprocessing import Binarizer
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_svmlight_files
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier


Importing libraries for machine learning
/usr/local/lib/python3.5/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

In [3]:
class sentimental_analysis:
    """Sentiment analysis on the ACL IMDB review dataset (Maas et al., 2011).

    Loads svmlight-format bag-of-words features, converts them to TF-IDF,
    binarizes the 1-10 star ratings into positive/negative labels, and
    trains/evaluates several scikit-learn classifiers.
    """

    def load_files(self, files):
        """Load one or more svmlight-format feature files.

        Parameters
        ----------
        files : list of str
            Paths to svmlight files (e.g. train and test ``labeledBow.feat``).

        Returns
        -------
        list
            Alternating (data, target) pairs, one pair per input file, as
            returned by ``sklearn.datasets.load_svmlight_files``.
        """
        return load_svmlight_files(files, n_features=None, dtype=None)

    # Calculating Tf-Idf for training and testing
    def tfidf(self, training_data, testing_data):
        """Fit a TF-IDF transform on the training data; apply it to both sets.

        The transformer is fit on the training data only, and the testing
        data is transformed with the training IDF, so no test information
        leaks into the features.

        Returns
        -------
        list
            ``[training_data_tfidf, testing_data_tfidf]`` sparse matrices.
        """
        tf_transformer = TfidfTransformer()

        print("Training_data TF-IDF")
        # Computes the TF for each review, the IDF across reviews, and
        # finally the TF-IDF for each review.
        training_data_tfidf = tf_transformer.fit_transform(training_data)
        print(training_data_tfidf.shape)

        print("Testing_data TF-IDF")
        # .transform computes the TF for each test review, then the TF-IDF
        # using the IDF learned from the training data.
        testing_data_tfidf = tf_transformer.transform(testing_data)
        print(testing_data_tfidf.shape)

        return [training_data_tfidf, testing_data_tfidf]

    # Converting target into binary
    def binerize(self, raw_target):
        """Binarize star ratings: > 5 is positive (1), otherwise negative (0)."""
        return [1 if rating > 5 else 0 for rating in raw_target]

    # Train and test Logistic Regression Classifier
    def lrc(self, training_data, raw_training_target, testing_data, raw_testing_target):
        """Train and score a LogisticRegression classifier.

        Returns
        -------
        list
            [fitted classifier, accuracy %, elapsed seconds as str].
        """
        print("Binerizing target ...")
        training_target = self.binerize(raw_training_target)
        testing_target = self.binerize(raw_testing_target)
        start = time()
        logreg = LogisticRegression()
        print("Training ...")
        logreg.fit(training_data, training_target)
        print("Training Done")
        print("Testing ...")
        logreg_accuracy = logreg.score(testing_data, testing_target) * 100
        end = time()
        return [logreg, round(logreg_accuracy, 2), str(round((end - start), 2))]

    # Train and test Linear SVM Classifier with and without parameter
    def lSVC(self, training_data, raw_training_target, testing_data, raw_testing_target, parameter=False):
        """Train and score a LinearSVC classifier.

        Parameters
        ----------
        parameter : bool, default False
            When True, run a cross-validated grid search over C first
            (see ``lSVC_para``); the returned list then also contains the
            best C value before the elapsed time.

        Returns
        -------
        list
            ``parameter=False``: [classifier, accuracy %, seconds].
            ``parameter=True``:  [classifier, accuracy %, best C, seconds].
        """
        print("Binerizing target ...")
        training_target = self.binerize(raw_training_target)
        testing_target = self.binerize(raw_testing_target)
        start = time()
        if parameter == True:
            result_lSVC = self.lSVC_para(training_data, training_target, testing_data, testing_target)
            end = time()
            return [result_lSVC[0], round(result_lSVC[1], 2), result_lSVC[2], str(round((end - start), 2))]
        else:
            clf_linear = LinearSVC()
            print("Training ...")
            clf_linear.fit(training_data, training_target)
            print("Training Done")
            print("Testing ...")
            result_lSVC = clf_linear.score(testing_data, testing_target) * 100
            end = time()
            return [clf_linear, round(result_lSVC, 2), str(round((end - start), 2))]

    # Calculating best parameter for LinearSVC Classifier
    def lSVC_para(self, training_data, training_target, testing_data, testing_target):
        """Grid-search C for LinearSVC via 3-fold CV, then retrain at best C.

        Returns
        -------
        list
            [retrained classifier, test accuracy %, best C].
        """
        print("Calculating best parameter for LinearSVC Classifier ...")
        # Candidate C values: powers of two from 2^-2 to 2^9.
        clist = 2**np.array(range(-2, 10), dtype='float')
        cvscores = []
        for c in clist:
            print(c)
            clf = LinearSVC(C=c)
            scores = cross_val_score(clf, training_data, training_target, cv=3)
            print("score", scores)
            cvscores.append(scores.mean() * 100)
        # Pick the best CV score once, after the search. (The original
        # recomputed this max inside the loop on every iteration.)
        bestscore, bestC = max(zip(cvscores, clist))
        print('Best CV accuracy =', round(bestscore, 2), '% achieved at C =', bestC)

        # Retrain on whole trainning set using best C value obtained from Cross validation
        print("Retrain on whole trainning set using best C value obtained from Cross validation")
        clf = LinearSVC(C=bestC)
        clf.fit(training_data, training_target)
        accu = clf.score(testing_data, testing_target) * 100
        return [clf, accu, bestC]

    # Train and test Random Forest Classifier
    def random_forest(self, training_data, raw_training_target, testing_data, raw_testing_target):
        """Train and score a RandomForestClassifier.

        Returns
        -------
        list
            [fitted classifier, accuracy %, elapsed seconds as str].
        """
        print("Binerizing target ...")
        training_target = self.binerize(raw_training_target)
        testing_target = self.binerize(raw_testing_target)
        start = time()
        print("Training ...")
        clf_forest = RandomForestClassifier(n_estimators=100, min_samples_leaf=5, max_features='auto', max_depth=16)
        clf_forest.fit(training_data, training_target)
        print("Training Done")
        print("Testing ...")
        clf_forest_accuracy = clf_forest.score(testing_data, testing_target) * 100
        end = time()
        return [clf_forest, round(clf_forest_accuracy, 2), str(round((end - start), 2))]

    # Train and test Kernel SVM Classifier
    def kernel_SVM(self, training_data, raw_training_target, testing_data, raw_testing_target):
        """Train and score a kernel SVC classifier.

        Returns
        -------
        list
            [fitted classifier, accuracy %, elapsed seconds as str].
        """
        print("Binerizing target ...")
        training_target = self.binerize(raw_training_target)
        testing_target = self.binerize(raw_testing_target)
        start = time()
        clf_kernel = SVC()
        print("Training ...")
        clf_kernel.fit(training_data, training_target)
        print("Training Done")
        print("Testing ...")
        # (Removed a stray `end = time()` here that was immediately
        # overwritten after scoring; timing now measured once, correctly.)
        clf_kernel_accuracy = clf_kernel.score(testing_data, testing_target) * 100
        end = time()
        return [clf_kernel, round(clf_kernel_accuracy, 2), str(round((end - start), 2))]

    # Prediction from any fitted classifier
    def prediction(self, obj_clf, fileName, labels, data=None):
        """Predict labels with ``obj_clf`` and save them to ``<fileName>.csv``.

        Parameters
        ----------
        obj_clf : fitted scikit-learn classifier
        fileName : str
            Output CSV file name, without the ``.csv`` extension.
        labels : list of str
            CSV header column names.
        data : sparse matrix, optional
            Feature matrix to predict on. Defaults to the notebook-global
            ``testing_data`` — kept for backward compatibility with callers
            that relied on that implicit global.
        """
        if data is None:
            data = testing_data  # fall back to the notebook-global, as before
        pre = obj_clf.predict(data)
        print("Done")
        prediction_result = []
        for i in range(len(pre)):
            if pre[i] == 0:
                prediction_result.append(str(i) + ", negative")
            else:
                prediction_result.append(str(i) + ", positive")
        self.save_csv(prediction_result, fileName, labels)

    # Storing prediction in CSV file
    def save_csv(self, prediction_result, fileName, labels):
        """Write one header row (``labels``) then one row per prediction."""
        print("Creating CSV file")
        # Context manager guarantees the file is closed even on error.
        with open(fileName + ".csv", 'w') as output_file:
            output_file.write(','.join(labels) + "\n")
            # Write data to file
            for r in prediction_result:
                output_file.write(r + "\n")
        print("File saved!")

In [4]:
# # Feature Extraction
# print("Feature Extraction")
# from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(training_data.data)
# X_train_counts.shape
# print(X_train_counts)

# # Term Frequency
# print("Term Frequency")
# from sklearn.feature_extraction.text import TfidfTransformer
# tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
# X_train_tf = tf_transformer.transform(X_train_counts)
# X_train_tf.shape
# print(X_train_tf)

# # TF-IDF
# print("TF-IDF")
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# X_train_tfidf.shape
# print(X_train_tfidf)

In [7]:
# Store path in array for training and testing files
# NOTE(review): absolute paths — adjust to your local copy of the aclImdb dataset.
files = ["/data/aclImdb/train/labeledBow.feat", "/data/aclImdb/test/labeledBow.feat"]

# Object for sentiment_analysis
sa = sentimental_analysis()

# Load data for training_data, training_target and testing_data, testing_target
print("Loading Files ...")
training_data, raw_training_target, testing_data, raw_testing_target = sa.load_files(files)
print("Done")

# Compute tf-idf features for training and testing data
tfidf_data = sa.tfidf(training_data, testing_data)
training_data = tfidf_data[0]
testing_data = tfidf_data[1]

print("Logistic Regression Classifier")
result = sa.lrc(training_data, raw_training_target, testing_data, raw_testing_target)
obj_lrc = result[0]
print("Accuracy = ", result[1], "% Time = ", result[2], "seconds")

print("Linear SVM Classifier ")
result = sa.lSVC(training_data, raw_training_target, testing_data, raw_testing_target)
obj_lSVC = result[0]  # fixed variable-name typo: was obj_lSCV
print("Accuracy = ", result[1], "% Time = ", result[2], "seconds")

print("Linear SVM Classifier With Parameter Selection")
result = sa.lSVC(training_data, raw_training_target, testing_data, raw_testing_target, True)
obj_lSVC_para = result[0]
print("Accuracy = ", result[1], "% at Best C = ", result[2], "Time = ", result[3], "seconds")

print("Random Forest Classifier")
result = sa.random_forest(training_data, raw_training_target, testing_data, raw_testing_target)
obj_random_forest = result[0]
print("Accuracy = ", result[1], "% Time = ", result[2], "seconds")

# Kernel SVM is slow on this dataset; uncomment to run.
# (Fixed: the original commented-out block called sa.random_forest here.)
# print("Kernel SVM Classifier")
# result = sa.kernel_SVM(training_data, raw_training_target, testing_data, raw_testing_target)
# obj_kernel_SVM = result[0]
# print("Accuracy = ", result[1], "% Time = ", result[2], "seconds")

print("Prediction for new dataset from classifier...")
# You can pass any classifier's object for prediction data and file name
labels = ["review", "rating"]
sa.prediction(obj_random_forest, "random", labels)


Loading Files ...
Done
Training_data TF-IDF
(25000, 89527)
Testing_data TF-IDF
(25000, 89527)
Logistic Regression Classifier
Binerizing target ...
Training ...
Training Done
Testing ...
Accuracy =  88.32 % Time =  3.2 seconds
Linear SVM Classifier 
Binerizing target ...
Training ...
Training Done
Testing ...
Accuracy =  87.9 % Time =  1.4 seconds
Linear SVM Classifier With Parameter Selection
Binerizing target ...
Calculating best parameter for LinearSVC Classifier ...
0.25
score [ 0.85805136  0.86177106  0.87013922]
0.5
score [ 0.85325174  0.85913127  0.8643783 ]
1.0
score [ 0.84485241  0.85157187  0.85417667]
2.0
score [ 0.83717303  0.84233261  0.84349496]
4.0
score [ 0.82889369  0.83273338  0.83713394]
8.0
score [ 0.82277418  0.82793377  0.83269323]
16.0
score [ 0.81845452  0.82385409  0.82957273]
32.0
score [ 0.81725462  0.82277418  0.82861258]
64.0
score [ 0.81653468  0.82157427  0.82741239]
128.0
score [ 0.81593473  0.82097432  0.82621219]
256.0
score [ 0.81521478  0.82097432  0.82573212]
512.0
score [ 0.81557475  0.82085433  0.8256121 ]
Best CV accuracy = 86.33 % achieved at C = 0.25
Retrain on whole trainning set using best C value obtained from Cross validation
Accuracy =  88.63 % at Best C =  0.25 Time =  267.79 seconds
Random Forest Classifier
Binerizing target ...
Training ...
Training Done
Testing ...
Accuracy =  82.16 % Time =  11.82 seconds
Prediction for new dataset from classifier...
Done
Creating CSV file
File saved!