In this notebook, we use a number of conventional feature-engineering approaches for sentiment classification.
In [1]:
import pandas as pd
import numpy as np
import os
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
In [2]:
sentences = pd.read_csv(
    'imdb_labelled.txt', sep=r'\|', header=None, engine='python',
    names=['sentence', 'sentiment_label']
)
In [3]:
sentences.head()
Out[3]:
In [4]:
sentences.shape
Out[4]:
Next we check for label imbalance.
In [5]:
counts = sentences.sentiment_label.value_counts()
ax = counts.plot(kind='bar', rot=0, title='Sentences per label')
So the dataset has an almost equal number of sentences per label.
Next we analyze the distribution of words in the two classes of sentences.
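Since the rendered bar chart is not reproduced here, the class proportions can also be checked numerically. A small optional sketch, not executed above:
In [ ]:
# Proportion of sentences per label; values near 0.5 indicate a balanced dataset.
print(sentences.sentiment_label.value_counts(normalize=True))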
In [6]:
import nltk
nltk.download('punkt')  # word_tokenize relies on the punkt tokenizer models
from nltk.tokenize import word_tokenize
sentences = sentences.assign(word_list=sentences.sentence.apply(word_tokenize))
sentences.head()
Out[6]:
In [7]:
# Create a counter for each label and update the relevant counter as we iterate over the entire dataframe
from collections import Counter
counters = {label: Counter() for label in sentences.sentiment_label.unique()}
for _, row in sentences.iterrows():
    c = counters[row.loc['sentiment_label']]
    c.update(row.loc['word_list'])

def make_df(label, c):
    """Make a DataFrame out of the counter with words in the index."""
    df = pd.DataFrame.from_dict(c, orient='index')
    df.columns = ['sentiment_{}'.format(label)]
    return df

counter_dfs = {label: make_df(label, c) for label, c in counters.items()}
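As a sanity check on the per-label counters, we could peek at the top of each frequency DataFrame before any stop-word filtering. An optional sketch, not part of the run above:
In [ ]:
# Show the five most frequent tokens for each sentiment label.
for label, df in counter_dfs.items():
    top = df.sort_values(by='sentiment_{}'.format(label), ascending=False)
    print(label)
    print(top.head())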
Now we look at the high-frequency and low-frequency words for sentiment label 0.
In [8]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
english_stopwords = set(stopwords.words('english'))
print(english_stopwords)
In [9]:
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
sent_0_counts = counter_dfs[0].sort_values(by='sentiment_0', ascending=False)
sent_0_counts = sent_0_counts.drop(english_stopwords, errors='ignore')
sent_0_counts.head(30).plot(kind='barh', title='30 most frequent words in sentiment-0.', ax=ax[0], fontsize=14)
sent_0_counts.tail(30).plot(kind='barh', title='30 least frequent words in sentiment-0.', ax=ax[1], fontsize=14)
plt.tight_layout()
High-frequency and low-frequency words for sentiment label 1.
In [10]:
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
sent_1_counts = counter_dfs[1].sort_values(by='sentiment_1', ascending=False)
sent_1_counts = sent_1_counts.drop(english_stopwords, errors='ignore')
sent_1_counts.head(30).plot(kind='barh', title='30 most frequent words in sentiment-1.', ax=ax[0], fontsize=14)
sent_1_counts.tail(30).plot(kind='barh', title='30 least frequent words in sentiment-1.', ax=ax[1], fontsize=14)
plt.tight_layout()
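The two sets of bar charts can also be compared directly by joining the per-label frequency tables; words whose frequencies differ most between the labels are the most indicative ones. A rough sketch using the variables defined above, not executed here:
In [ ]:
# Join the per-label counts and use a smoothed frequency ratio as a crude polarity score.
freq = counter_dfs[0].join(counter_dfs[1], how='outer').fillna(0)
freq = freq.drop(english_stopwords, errors='ignore')
freq['ratio'] = (freq['sentiment_1'] + 1) / (freq['sentiment_0'] + 1)
print(freq.sort_values(by='ratio', ascending=False).head(10))  # words skewed towards label 1
print(freq.sort_values(by='ratio', ascending=True).head(10))   # words skewed towards label 0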
In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
model = SGDClassifier(alpha=1E-3, random_state=1234, max_iter=1000)
feature_calc = TfidfVectorizer(stop_words=english_stopwords)
pipeline = make_pipeline(feature_calc, model)
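Before committing to a single train/test split, a cross-validated accuracy estimate of the whole pipeline gives a quick sanity check. An optional sketch, not run above:
In [ ]:
# 5-fold cross-validated accuracy of the TF-IDF + SGDClassifier pipeline.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline, sentences.sentence.values, sentences.sentiment_label.values, cv=5)
print('mean accuracy: {:.3f} (+/- {:.3f})'.format(scores.mean(), scores.std()))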
In [12]:
X_train = sentences.sample(frac=0.7, replace=False, random_state=1234).loc[:, 'sentence']
Y_train = sentences.sentiment_label.loc[X_train.index]
In [13]:
pipeline = pipeline.fit(X_train.values, Y_train.values)
In [14]:
X_test = sentences.drop(X_train.index, axis=0).loc[:, 'sentence']
Y_test = sentences.sentiment_label.loc[X_test.index]
predictions = pipeline.predict(X_test.values)
In [15]:
from sklearn.metrics import classification_report
results = classification_report(Y_test.values, predictions)
print(results)
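Beyond the per-class precision and recall above, a confusion matrix makes the error types explicit. An optional follow-up, not executed here:
In [ ]:
# Rows are true labels, columns are predicted labels.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y_test.values, predictions))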