This assessment is very much like the Text Classification Project we just completed, and the dataset is very similar.
The moviereviews2.tsv dataset contains the text of 6000 movie reviews. 3000 are positive, 3000 are negative, and the text has been preprocessed as a tab-delimited file. As before, labels are given as pos and neg.
We've included 20 reviews that either contain NaN data or consist entirely of whitespace.
For more information on this dataset visit http://ai.stanford.edu/~amaas/data/sentiment/
In [6]:
import spacy as spacy
import numpy as np
import pandas as pd
# Read the tab-separated reviews file into a DataFrame, then preview the first rows.
data = pd.read_csv('../TextFiles/moviereviews2.tsv', delimiter='\t')
data.head()
Out[6]:
In [9]:
# Count the missing (NaN) entries in each column:
data.isna().sum()
Out[9]:
In [10]:
# Check for whitespace strings (it's OK if there aren't any!):
# Collect the index label of every row whose review is a string made up
# solely of whitespace. Each tuple from itertuples() is unpacked as
# (index, label, review); non-string review values are skipped by the
# isinstance() guard so that .isspace() is only called on real strings.
white_spaces = [
    i
    for i, lb, rw in data.itertuples()
    if isinstance(rw, str) and rw.isspace()
]
len(white_spaces)
Out[10]:
In [47]:
# Remove the whitespace-only rows (by index label) collected in white_spaces,
# then discard any rows that contain missing values. Both calls modify
# `data` in place.
data.drop(index=white_spaces, inplace=True)
data.dropna(inplace=True)
In [48]:
# Count the non-null entries of each remaining column, grouped by label.
data.groupby('label').count()
Out[48]:
In [49]:
# Tally how many rows carry each distinct label value.
data['label'].value_counts()
Out[49]:
In [50]:
from sklearn.model_selection import train_test_split

# Features are the review texts; targets are the corresponding labels.
X, y = data['review'], data['label']

# Hold out 33% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
X_train.head()
# Optional sanity check of the split sizes:
#print(X_train.shape, " ", y_train.shape)
#print(X_test.shape, " ", y_test.shape)
Out[50]:
In [51]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Two-stage pipeline: TF-IDF vectorization of the raw text followed by a
# linear support-vector classifier, both with default settings.
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Fit both stages on the training reviews and their labels.
text_clf.fit(X_train, y_train)
Out[51]:
In [53]:
# Form a prediction set
# Run the fitted pipeline on the held-out test reviews; the result is
# compared against y_test in the evaluation cells that follow.
predictions = text_clf.predict(X_test)
In [54]:
# Confusion matrix of true test labels versus predicted labels
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=predictions)
Out[54]:
In [56]:
# Build the text classification report for the test predictions and print it
from sklearn.metrics import classification_report

report = classification_report(y_test, predictions)
print(report)
In [58]:
# Overall accuracy on the test set, printed as a percentage
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
print(accuracy * 100)