This notebook builds logistic regression, SVM, and naive Bayes classifiers and trains them on labelled training data. The data used in this notebook consists of movie reviews, each paired with the sentiment (positive or negative) expressed about the movie.
In [1]:
import csv
from sklearn.linear_model import LogisticRegression as LR
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import cross_validation
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.utils import shuffle
from nltk.corpus import stopwords
import nltk
import re
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier as SGD
In [2]:
def load_file(path='review.csv'):
    """Read a review CSV and return its first two columns as parallel lists.

    The first line of the file is treated as a header and discarded. A row is
    kept only if it has at least two columns and both the first and the
    second column are non-empty.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to 'review.csv', so calling
        ``load_file()`` with no arguments behaves as before.

    Returns
    -------
    tuple of (list, list)
        ``(data, target)`` where ``data[i]`` is column 0 and ``target[i]`` is
        column 1 of the i-th retained row. Both lists are empty when the file
        contains no usable rows.
    """
    data = []
    target = []
    with open(path) as csv_file:
        reader = csv.reader(csv_file, delimiter=",", quotechar='"')
        # Discard the header. The built-in next() works on Python 2 and 3
        # (reader.next() is Python 2 only), and the None default keeps an
        # empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            # skip missing data: blank lines come back as [] and truncated
            # lines as a single-element list, so check the length before
            # indexing.
            if len(row) >= 2 and row[0] and row[1]:
                data.append(row[0])
                target.append(row[1])
    return data, target
In [5]:
# Read review.csv via load_file(): `data` receives the first CSV column and
# `target` the second, as parallel lists (per the notebook intro these are
# presumably the review texts and their sentiment labels).
data,target = load_file()
In [7]:
# Turn the raw review strings into a term-frequency matrix in two steps:
#   1. CountVectorizer produces the per-review token counts (data1).
#   2. TfidfTransformer with use_idf=False rescales those counts without
#      applying any inverse-document-frequency weighting (tfidf_data).
count_vectorizer = CountVectorizer()
data1 = count_vectorizer.fit_transform(data)

tf_transformer = TfidfTransformer(use_idf=False)
tfidf_data = tf_transformer.fit_transform(data1)
In [8]:
# Hold-out validation: 80% of the reviews for training, 20% for testing.
# random_state is fixed so the same split is produced on every run.
data_train, data_test, target_train, target_test = cross_validation.train_test_split(
    tfidf_data, target, test_size=0.2, random_state=43)

# Fit a Bernoulli naive Bayes classifier on the training part and predict
# labels for the held-out part.
classifier = BernoulliNB().fit(data_train, target_train)
predicted = classifier.predict(data_test)

# print(...) with a single argument gives the same output on Python 2 and 3;
# the bare print statement is a syntax error on Python 3.
print(classification_report(target_test, predicted))
print("The accuracy score is {:.2%}".format(accuracy_score(target_test, predicted)))
Here we can see that the accuracy of our model is 80% when we use 80% of the data for training. Let's try a different split of the data between the training and test sets.
In [9]:
# Hold-out validation: 60% of the reviews for training, 40% for testing.
# random_state is fixed so the same split is produced on every run.
data_train, data_test, target_train, target_test = cross_validation.train_test_split(
    tfidf_data, target, test_size=0.4, random_state=43)

# Fit a Bernoulli naive Bayes classifier on the training part and predict
# labels for the held-out part.
classifier = BernoulliNB().fit(data_train, target_train)
predicted = classifier.predict(data_test)

# print(...) with a single argument gives the same output on Python 2 and 3;
# the bare print statement is a syntax error on Python 3.
print(classification_report(target_test, predicted))
print("The accuracy score is {:.2%}".format(accuracy_score(target_test, predicted)))