I built this tutorial and Kaggle InClass competition to help participants learn about, and practice building, a sentiment classifier.
SF Project Night: https://www.meetup.com/sfpython/events/234956048/
Date: October 18th, 2017
This dataset of movie reviews is from the Stanford AI group (http://ai.stanford.edu/~amaas/data/sentiment/).
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing.
In [2]:
import os.path
from glob import glob
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
In [2]:
def extract_reviews_data(file_pattern):
    """Return the review text of every file that matches ``file_pattern``.

    Parameters
    ----------
    file_pattern : str
        Glob pattern (for example ``".../train/pos/*.txt"``) selecting the
        review files to read.

    Returns
    -------
    list of dict
        One ``{"review": <text>}`` record per matching file, ordered by
        file path. An empty list is returned when nothing matches.

    Raises
    ------
    UnicodeDecodeError
        If a matching file does not contain valid UTF-8.
    """
    data = []
    # glob() yields matches in an arbitrary, filesystem-dependent order; sort
    # them so the records come back in the same order on every run.
    for filename in sorted(glob(file_pattern)):
        # Read raw bytes and decode explicitly so the text does not depend on
        # the platform's default encoding or newline translation.
        with open(filename, "rb") as f:
            review = f.read().decode("utf-8")
        data.append({"review": review})
    return data
In [3]:
# Load the raw training reviews: resolve the positive and negative glob
# patterns under the user's home directory, then read every matching file.
train_pos_dir, train_neg_dir = (
    os.path.expanduser(pattern)
    for pattern in (
        "~/datasets/movie-sentiment-analysis/aclImdb/train/pos/*.txt",
        "~/datasets/movie-sentiment-analysis/aclImdb/train/neg/*.txt",
    )
)
# Unpacking forces both reads, positive first and negative second.
train_pos_data, train_neg_data = map(
    extract_reviews_data, (train_pos_dir, train_neg_dir)
)
In [4]:
# Load the raw test reviews: resolve the positive and negative glob patterns
# under the user's home directory, then read every matching file.
test_pos_dir, test_neg_dir = (
    os.path.expanduser(pattern)
    for pattern in (
        "~/datasets/movie-sentiment-analysis/aclImdb/test/pos/*.txt",
        "~/datasets/movie-sentiment-analysis/aclImdb/test/neg/*.txt",
    )
)
# Unpacking forces both reads, positive first and negative second.
test_pos_data, test_neg_data = map(
    extract_reviews_data, (test_pos_dir, test_neg_dir)
)
In [7]:
# Build the train.tsv file. Positive reviews are labelled 1 and negative
# reviews 0. Rows are shuffled before document ids are assigned, so an id
# carries no information about the label.
train_pos_df = pd.DataFrame(train_pos_data)
train_pos_df["sentiment"] = 1
train_neg_df = pd.DataFrame(train_neg_data)
train_neg_df["sentiment"] = 0
train_df = pd.concat([train_pos_df, train_neg_df], axis=0)
# Fixed seed: re-running this cell reproduces the same row order and hence
# the same document_id assignment. reset_index drops the duplicate index
# labels left over from concatenating the two frames.
train_df = shuffle(train_df, random_state=42).reset_index(drop=True)
train_df["document_id"] = np.arange(len(train_df))
# `import os.path` also binds `os`; create the output directory so the write
# does not fail on a fresh checkout.
os.makedirs("data", exist_ok=True)
train_df[["document_id", "sentiment", "review"]].to_csv("data/train.tsv", sep="\t", index=False)
In [8]:
# Build the test.tsv file. Labels (positive = 1, negative = 0) are kept on
# test_df but are NOT written to test.tsv; only document_id and review are.
# Rows are shuffled before document ids are assigned, so an id carries no
# information about the label.
test_pos_df = pd.DataFrame(test_pos_data)
test_pos_df["sentiment"] = 1
test_neg_df = pd.DataFrame(test_neg_data)
test_neg_df["sentiment"] = 0
test_df = pd.concat([test_pos_df, test_neg_df], axis=0)
# Fixed seed: re-running this cell reproduces the same document_id
# assignment, so test.tsv cannot drift out of sync with files derived from
# test_df in an earlier run. reset_index drops the duplicate index labels
# left over from concatenating the two frames.
test_df = shuffle(test_df, random_state=42).reset_index(drop=True)
test_df["document_id"] = np.arange(len(test_df))
# `import os.path` also binds `os`; create the output directory so the write
# does not fail on a fresh checkout.
os.makedirs("data", exist_ok=True)
test_df[["document_id", "review"]].to_csv("data/test.tsv", sep="\t", index=False)
In [ ]:
# Write the ground-truth labels (document_id, sentiment) of the test set;
# this is the solutions file uploaded to the Kaggle competition.
test_df.to_csv(
    "data/solutions_submission.csv",
    columns=["document_id", "sentiment"],
    index=False,
)
In [ ]:
# Create a sample submission file with a trivial baseline guess: labels
# alternate by row, so even document ids get 0 (negative) and odd document
# ids get 1 (positive).
sample_submission_df = pd.DataFrame(
    {"document_id": np.arange(len(test_df))}
).assign(sentiment=lambda frame: frame.index % 2)
sample_submission_df.to_csv(
    "data/sample_submission.csv",
    columns=["document_id", "sentiment"],
    index=False,
)
In [15]:
# Shell escape: print the first lines of the generated sample submission file
# as a quick visual check of its header and format.
!head data/sample_submission.csv