For this project, we'll perform the same type of NLTK VADER sentiment analysis, this time on our movie reviews dataset.
The 2,000-record IMDb movie review dataset is accessible directly through NLTK with
from nltk.corpus import movie_reviews
However, since we already have it in a tab-delimited file we'll use that instead.
In [1]:
import numpy as np
import pandas as pd

# Load the movie reviews from a tab-separated file with 'label' and 'review' columns.
df = pd.read_csv('../TextFiles/moviereviews.tsv', delimiter='\t')
df.head()
Out[1]:
In [2]:
# REMOVE NaN VALUES AND EMPTY STRINGS:
# First drop rows with NaN in any column; after this every 'review' value is a string.
df.dropna(inplace=True)

# Vectorized replacement for the original itertuples() loop (which fragilely
# unpacked exactly three fields per row): flag rows whose review text is
# whitespace-only — these display as "empty" reviews but are not NaN.
blanks = df[df['review'].str.isspace()].index.tolist()

df.drop(blanks, inplace=True)
In [3]:
# Class balance check: counts of 'pos' vs 'neg' gold labels remaining after cleaning.
df['label'].value_counts()
Out[3]:
In [5]:
# NOTE(review): VADER needs the 'vader_lexicon' NLTK resource
# (nltk.download('vader_lexicon')) — confirm it is installed before running.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
In [6]:
# Score every review with VADER, extract the compound score, and derive a
# binary pos/neg prediction for comparison against the gold 'label' column.
df['scores'] = df['review'].apply(sid.polarity_scores)
df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])
# NOTE(review): a compound of exactly 0.0 (fully neutral) is labeled 'pos'
# by this cutoff — confirm that is the intended convention.
df['comp_score'] = df['compound'].apply(lambda c: 'neg' if c < 0 else 'pos')
df.head()
Out[6]:
In [7]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
In [8]:
# Overall accuracy of VADER's pos/neg predictions against the gold labels.
accuracy_score(df['label'],df['comp_score'])
Out[8]:
In [9]:
# Per-class precision, recall, and F1 for the VADER predictions.
print(classification_report(df['label'],df['comp_score']))
In [10]:
# Confusion matrix: rows are true labels, columns are predicted labels
# (label order is alphabetical: 'neg', 'pos').
print(confusion_matrix(df['label'],df['comp_score']))