Create the Movie Reviews Dataset to Host Competition

I built this tutorial and Kaggle InClass competition to help participants learn about, and practice building, a sentiment classifier.

Dataset

This Movie Reviews dataset comes from the Stanford AI group (http://ai.stanford.edu/~amaas/data/sentiment/).

From the dataset's own description: "This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing."


In [2]:
import os.path
from glob import glob

import numpy as np
import pandas as pd
from sklearn.utils import shuffle

In [2]:
def extract_reviews_data(file_pattern):
    """Returns the extracted review texts from all files that match the pattern

    Args:
        file_pattern: glob pattern (e.g. ``".../train/pos/*.txt"``) selecting
            the review files to read.

    Returns:
        A list with one ``{"review": <text>}`` dict per matching file, ordered
        by sorted file path. An empty list is returned when nothing matches.

    Raises:
        UnicodeDecodeError: if a matching file is not valid UTF-8.
    """
    data = []
    # glob() yields matches in arbitrary, filesystem-dependent order; sorting
    # makes the order of the returned records stable across runs.
    for filename in sorted(glob(file_pattern)):
        # Read raw bytes and decode explicitly so the text does not depend on
        # the platform's default encoding or newline translation.
        with open(filename, "rb") as f:
            data.append({"review": f.read().decode("utf-8")})
    return data

In [3]:
# Load the training split: positive and negative reviews live in separate folders.
# Note: despite the "_dir" suffix, these two names hold glob patterns (".../*.txt").
train_pos_dir, train_neg_dir = (
    os.path.expanduser(pattern)
    for pattern in (
        "~/datasets/movie-sentiment-analysis/aclImdb/train/pos/*.txt",
        "~/datasets/movie-sentiment-analysis/aclImdb/train/neg/*.txt",
    )
)

# One list of {"review": text} records per polarity
train_pos_data, train_neg_data = map(extract_reviews_data, (train_pos_dir, train_neg_dir))

In [4]:
# Load the test split: positive and negative reviews live in separate folders.
# Note: despite the "_dir" suffix, these two names hold glob patterns (".../*.txt").
test_pos_dir, test_neg_dir = (
    os.path.expanduser(pattern)
    for pattern in (
        "~/datasets/movie-sentiment-analysis/aclImdb/test/pos/*.txt",
        "~/datasets/movie-sentiment-analysis/aclImdb/test/neg/*.txt",
    )
)

# One list of {"review": text} records per polarity
test_pos_data, test_neg_data = map(extract_reviews_data, (test_pos_dir, test_neg_dir))

In [7]:
# Build the train.tsv file.  Positive Reviews are marked as 1 and Negative as 0
# Data is shuffled before saving to file

train_pos_df = pd.DataFrame(train_pos_data)
train_pos_df["sentiment"] = 1
train_neg_df = pd.DataFrame(train_neg_data)
train_neg_df["sentiment"] = 0

train_df = pd.concat([train_pos_df, train_neg_df], axis=0)
# Fixed seed so that re-running this cell on the same input order yields the
# same shuffle (and therefore the same document_id assignment).
train_df = shuffle(train_df, random_state=42)
# document_id is assigned AFTER shuffling so it carries no hint about the label
train_df["document_id"] = np.arange(len(train_df))

# to_csv does not create missing parent directories
os.makedirs("data", exist_ok=True)
train_df[["document_id", "sentiment", "review"]].to_csv("data/train.tsv", sep="\t", index=False)

In [8]:
# Build the test.tsv file.  Labels (Positive = 1, Negative = 0) are kept in test_df
# for the solutions file but are NOT written to test.tsv.
# Data is shuffled before saving to file

test_pos_df = pd.DataFrame(test_pos_data)
test_pos_df["sentiment"] = 1
test_neg_df = pd.DataFrame(test_neg_data)
test_neg_df["sentiment"] = 0

test_df = pd.concat([test_pos_df, test_neg_df], axis=0)

# NOTE(review): this shuffle is unseeded, so every re-run of this cell assigns
# different document_ids. Any file derived from test_df (e.g. the solutions file)
# must be regenerated from the same test_df after re-running this cell.
# A fixed, published seed would make the order reproducible but also recoverable.
test_df = shuffle(test_df)
# document_id is assigned AFTER shuffling so it carries no hint about the label
test_df["document_id"] = np.arange(len(test_df))

# to_csv does not create missing parent directories
os.makedirs("data", exist_ok=True)
test_df[["document_id", "review"]].to_csv("data/test.tsv", sep="\t", index=False)

In [ ]:
# Write the ground-truth labels (document_id -> sentiment) that are uploaded
# to the Kaggle Competition as the solutions file.
test_df.to_csv(
    "data/solutions_submission.csv",
    columns=["document_id", "sentiment"],
    index=False,
)

In [ ]:
# Create a Sample Submission File with a label-free baseline guess:
# sentiment alternates with document_id parity
# (even document_id -> 0 / Negative, odd document_id -> 1 / Positive).
sample_submission_df = pd.DataFrame({
    "document_id": np.arange(len(test_df))
})

# Take the parity from document_id itself rather than from the implicit row index
sample_submission_df["sentiment"] = sample_submission_df["document_id"] % 2
sample_submission_df[["document_id", "sentiment"]].to_csv("data/sample_submission.csv", index=False)

In [15]:
# Preview the first lines of the sample submission file written to data/
!head data/sample_submission.csv


document_id,sentiment
0,0
1,1
2,0
3,1
4,0
5,1
6,0
7,1
8,0