In [1]:
# @InProceedings{maas-EtAl:2011:ACL-HLT2011,
# author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
# title = {Learning Word Vectors for Sentiment Analysis},
# booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
# month = {June},
# year = {2011},
# address = {Portland, Oregon, USA},
# publisher = {Association for Computational Linguistics},
# pages = {142--150},
# url = {http://www.aclweb.org/anthology/P11-1015}
# }
In [2]:
# Fixed typo in user-facing message: "learining" -> "learning"
print("Importing libraries for machine learning")
# Import all libraries for machine learning
%matplotlib inline
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from time import time
from mlxtend.plotting import plot_decision_regions
from sklearn.datasets import make_moons
from sklearn.svm import LinearSVC, SVC
from sklearn.utils import shuffle
from sklearn.preprocessing import Binarizer
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_svmlight_files
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
In [3]:
class sentimental_analysis:
    """Binary sentiment classification on the ACL IMDB review dataset.

    Loads svmlight-formatted bag-of-words features, converts them to TF-IDF,
    binarizes the 1-10 star ratings into positive/negative labels, and trains
    several scikit-learn classifiers (logistic regression, linear SVM with and
    without C selection, random forest, kernel SVM). Each ``*_accuracy`` is
    reported as a percentage together with the wall-clock training+testing time.
    """

    def load_files(self, files):
        """Load svmlight-formatted feature files.

        :param files: sequence of file paths (e.g. [train_path, test_path]).
        :return: flat list [X_train, y_train, X_test, y_test] as produced by
                 sklearn's load_svmlight_files.
        """
        return load_svmlight_files(files, n_features=None, dtype=None)

    def tfidf(self, training_data, testing_data):
        """Compute TF-IDF matrices for training and testing count matrices.

        The transformer is fit on the training data only; the testing data is
        transformed with the training IDF so no test-set information leaks in.

        :return: [training_data_tfidf, testing_data_tfidf] (sparse matrices).
        """
        tf_transformer = TfidfTransformer()
        print("Training_data TF-IDF")
        # fit_transform: learns IDF from the training reviews and applies it.
        training_data_tfidf = tf_transformer.fit_transform(training_data)
        print(training_data_tfidf.shape)
        print("Testing_data TF-IDF")
        # transform only: reuses the IDF learned from the training data.
        testing_data_tfidf = tf_transformer.transform(testing_data)
        print(testing_data_tfidf.shape)
        return [training_data_tfidf, testing_data_tfidf]

    def binerize(self, raw_target):
        """Binarize raw star ratings: rating > 5 -> 1 (positive), else 0 (negative).

        :param raw_target: iterable of numeric ratings.
        :return: list of 0/1 labels, same length and order as raw_target.
        """
        return [1 if rating > 5 else 0 for rating in raw_target]

    def lrc(self, training_data, raw_training_target, testing_data, raw_testing_target):
        """Train and test a Logistic Regression classifier.

        :return: [fitted classifier, accuracy % (rounded to 2), elapsed seconds (str)].
        """
        print("Binerizing target ...")
        training_target = self.binerize(raw_training_target)
        testing_target = self.binerize(raw_testing_target)
        start = time()
        logreg = LogisticRegression()
        print("Training ...")
        logreg.fit(training_data, training_target)
        print("Training Done")
        print("Testing ...")
        logreg_accuracy = logreg.score(testing_data, testing_target) * 100
        end = time()
        return [logreg, round(logreg_accuracy, 2), str(round(end - start, 2))]

    def lSVC(self, training_data, raw_training_target, testing_data, raw_testing_target, parameter=False):
        """Train and test a Linear SVM classifier, optionally with C selection.

        :param parameter: when True, cross-validate over a grid of C values
                          (see lSVC_para) before the final fit.
        :return: without parameter selection: [clf, accuracy %, seconds];
                 with parameter selection: [clf, accuracy %, best C, seconds].
        """
        print("Binerizing target ...")
        training_target = self.binerize(raw_training_target)
        testing_target = self.binerize(raw_testing_target)
        start = time()
        if parameter:
            result_lSVC = self.lSVC_para(training_data, training_target, testing_data, testing_target)
            end = time()
            return [result_lSVC[0], round(result_lSVC[1], 2), result_lSVC[2], str(round(end - start, 2))]
        else:
            clf_linear = LinearSVC()
            print("Training ...")
            clf_linear.fit(training_data, training_target)
            print("Training Done")
            print("Testing ...")
            result_lSVC = clf_linear.score(testing_data, testing_target) * 100
            end = time()
            return [clf_linear, round(result_lSVC, 2), str(round(end - start, 2))]

    def lSVC_para(self, training_data, training_target, testing_data, testing_target):
        """Select the best C for LinearSVC by 3-fold cross-validation.

        Tries C in {2^-2 .. 2^9}, keeps the C with the highest mean CV
        accuracy, then refits on the whole training set with that C.

        :return: [refitted classifier, test accuracy %, best C].
        """
        print("Calculating best parameter for LinearSVC Classifier ...")
        clist = 2 ** np.array(range(-2, 10), dtype='float')
        cvscores = []
        for c in clist:
            print(c)
            clf = LinearSVC(C=c)
            scores = cross_val_score(clf, training_data, training_target, cv=3)
            print("score", scores)
            cvscores.append(scores.mean() * 100)
        # max over (score, C) pairs picks the highest score; ties break on larger C.
        bestscore, bestC = max((val, clist[idx]) for idx, val in enumerate(cvscores))
        print('Best CV accuracy =', round(bestscore, 2), '% achieved at C =', bestC)
        # Retrain on whole training set using best C value obtained from cross-validation
        print("Retrain on whole trainning set using best C value obtained from Cross validation")
        clf = LinearSVC(C=bestC)
        clf.fit(training_data, training_target)
        accu = clf.score(testing_data, testing_target) * 100
        return [clf, accu, bestC]

    def random_forest(self, training_data, raw_training_target, testing_data, raw_testing_target):
        """Train and test a Random Forest classifier.

        :return: [fitted classifier, accuracy % (rounded to 2), elapsed seconds (str)].
        """
        print("Binerizing target ...")
        training_target = self.binerize(raw_training_target)
        testing_target = self.binerize(raw_testing_target)
        start = time()
        print("Training ...")
        # NOTE(review): max_features='auto' is deprecated/removed in newer
        # scikit-learn versions ('sqrt' is the classifier default) — confirm
        # against the installed sklearn version.
        clf_forest = RandomForestClassifier(n_estimators=100, min_samples_leaf=5, max_features='auto', max_depth=16)
        clf_forest.fit(training_data, training_target)
        print("Training Done")
        print("Testing ...")
        clf_forest_accuracy = clf_forest.score(testing_data, testing_target) * 100
        end = time()
        return [clf_forest, round(clf_forest_accuracy, 2), str(round(end - start, 2))]

    def kernel_SVM(self, training_data, raw_training_target, testing_data, raw_testing_target):
        """Train and test a kernel (RBF) SVM classifier.

        :return: [fitted classifier, accuracy % (rounded to 2), elapsed seconds (str)].
        """
        print("Binerizing target ...")
        training_target = self.binerize(raw_training_target)
        testing_target = self.binerize(raw_testing_target)
        start = time()
        clf_kernel = SVC()
        print("Training ...")
        clf_kernel.fit(training_data, training_target)
        print("Training Done")
        print("Testing ...")
        # Bug fix: removed a stray `end = time()` that appeared BEFORE scoring
        # and was immediately overwritten; timing now has a single endpoint.
        clf_kernel_accuracy = clf_kernel.score(testing_data, testing_target) * 100
        end = time()
        return [clf_kernel, round(clf_kernel_accuracy, 2), str(round(end - start, 2))]

    def prediction(self, obj_clf, fileName, labels, data=None):
        """Predict labels with a fitted classifier and save them as CSV.

        Bug fix: the original read the notebook-global `testing_data` directly
        (hidden-state dependency). The data is now an explicit parameter;
        for backward compatibility, `data=None` falls back to the global.

        :param obj_clf: fitted classifier exposing .predict().
        :param fileName: output file name (".csv" is appended).
        :param labels: list of column header strings for the CSV.
        :param data: feature matrix to predict on (defaults to global testing_data).
        """
        if data is None:
            data = testing_data  # original behavior: notebook-global test matrix
        pre = obj_clf.predict(data)
        print("Done")
        prediction_result = []
        for i in range(len(pre)):
            if pre[i] == 0:
                prediction_result.append(str(i) + ", negative")
            else:
                prediction_result.append(str(i) + ", positive")
        self.save_csv(prediction_result, fileName, labels)

    def save_csv(self, prediction_result, fileName, labels):
        """Write one header row (joined labels) plus one prediction per line.

        Uses a context manager so the file handle is closed even on error
        (the original leaked the handle if a write raised).
        """
        print("Creating CSV file")
        with open(fileName + ".csv", 'w') as output_file:
            output_file.write(','.join(labels) + "\n")
            # Write data to file
            for r in prediction_result:
                output_file.write(r + "\n")
        print("File saved!")
In [4]:
# # Feature Extraction
# print("Feature Extraction")
# from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(training_data.data)
# X_train_counts.shape
# print(X_train_counts)
# # Term Frequency
# print("Term Frequency")
# from sklearn.feature_extraction.text import TfidfTransformer
# tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
# X_train_tf = tf_transformer.transform(X_train_counts)
# X_train_tf.shape
# print(X_train_tf)
# # TF-IDF
# print("TF-IDF")
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# X_train_tfidf.shape
# print(X_train_tfidf)
In [7]:
# Store paths in a list for the svmlight training and testing feature files
files = ["/data/aclImdb/train/labeledBow.feat", "/data/aclImdb/test/labeledBow.feat"]
# Object for sentimental_analysis
sa = sentimental_analysis()
# Load data for training_data, training_target and testing_data, testing_target
print("Loading Files ...")
training_data, raw_training_target, testing_data, raw_testing_target = sa.load_files(files)
print("Done")
# Compute TF-IDF for training and testing data (fit on train, transform test)
tfidf_data = sa.tfidf(training_data, testing_data)
training_data = tfidf_data[0]
testing_data = tfidf_data[1]
print("Logistic Regression Classifier")
result = sa.lrc(training_data, raw_training_target, testing_data, raw_testing_target)
obj_lrc = result[0]
print("Accuracy = ", result[1], "% Time = ", result[2], "seconds")
print("Linear SVM Classifier ")
result = sa.lSVC(training_data, raw_training_target, testing_data, raw_testing_target)
# Bug fix: variable was misspelled `obj_lSCV` in the original
obj_lSVC = result[0]
print("Accuracy = ", result[1], "% Time = ", result[2], "seconds")
print("Linear SVM Classifier With Parameter Selection")
result = sa.lSVC(training_data, raw_training_target, testing_data, raw_testing_target, True)
obj_lSVC_para = result[0]
print("Accuracy = ", result[1], "% at Best C = ", result[2], "Time = ", result[3], "seconds")
print("Random Forest Classifier")
result = sa.random_forest(training_data, raw_training_target, testing_data, raw_testing_target)
obj_random_forest = result[0]
print("Accuracy = ", result[1], "% Time = ", result[2], "seconds")
# Kernel SVM (disabled: slow on this dataset). Bug fix: the original
# commented-out code called sa.random_forest instead of sa.kernel_SVM.
# print("Kernel SVM Classifier")
# result = sa.kernel_SVM(training_data, raw_training_target, testing_data, raw_testing_target)
# obj_kernel_SVM = result[0]
# print("Accuracy = ", result[1], "% Time = ", result[2], "seconds")
print("Prediction for new dataset from classifier...")
# You can pass any classifier's object for prediction data and file name
labels = ["review", "rating"]
sa.prediction(obj_random_forest, "random", labels)