In [ ]:
# Alternative : just use a ready-made Kaggle copy, e.g. https://www.kaggle.com/utathya/imdb-review-dataset
# OR : https://www.kaggle.com/c/sentiment-analysis-on-imdb-movie-reviews/data
#! wget https://www.kaggle.com/c/6337/download/train_data.csv
#! ls -l *.csv
#! more train_data.csv
# BUT downloading from Kaggle needs a LOGIN, so the commands above stay commented out
In [ ]:
# OR :
#if not os.path.isfile('imdb_tr.csv'):
# ! wget https://github.com/SrinidhiRaghavan/AI-Sentiment-Analysis-on-IMDB-Dataset/raw/master/imdb_tr.csv
# ! ls -l *.csv
#! head imdb_tr.csv
# But this CSV already has stop-words removed, so it is not the full review text...
In [ ]:
import os
In [ ]:
# http://ai.stanford.edu/~amaas/data/sentiment/
if not os.path.isfile('aclImdb_v1.tar.gz'):
! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
! ls -l
if not os.path.isfile('aclImdb/README'):
! tar -xzf aclImdb_v1.tar.gz
! ls -l aclImdb
if os.path.isfile('aclImdb/README'):
print("We have the IMDB data!")
! head aclImdb/imdb.vocab
In [ ]:
# List the first few entries of the negative-sentiment training directory.
! ls -l aclImdb/train/neg | head
In [ ]:
# Ok, so let's make a line-based text file with the right data in it...
def get_imdb_marked(sent):  # sent='pos' or 'neg'
    """Flatten one sentiment class of the IMDB training set into a text file.

    Reads every file in ``aclImdb/train/<sent>`` and writes one line per
    file to ``all_<sent>.txt`` in the form ``<label>|<text>``, where the
    label is 1 for ``'pos'`` and 0 for anything else.  ``<br />`` tags are
    dropped and punctuation is padded with spaces, so splitting the text on
    single spaces yields word and punctuation tokens.

    Args:
        sent: Sub-directory of ``aclImdb/train`` to process
            (``'pos'`` or ``'neg'``).

    Raises:
        OSError: If the source directory is missing or a file cannot be
            read or written.
    """
    # One-pass equivalent of the chained single-character replacements.
    pad_punctuation = str.maketrans({
        "`": " '",   # back-ticks are treated as apostrophes
        "'": " '",   # splits contractions : don't -> don 't
        '"': ' " ',
        ',': ' , ',
        '.': ' . ',
        ':': ' : ',
        '!': ' ! ',
        '/': ' / ',
        '(': ' ( ',
        ')': ' ) ',
    })
    label = 1 if sent == 'pos' else 0
    src_dir = os.path.join('aclImdb', 'train', sent)

    with open('all_' + sent + '.txt', 'wt', encoding='utf-8') as fout:
        # os.listdir() order is arbitrary; sort so the output is reproducible.
        for fname in sorted(os.listdir(src_dir)):
            # `with` closes each review file promptly instead of leaking the
            # handle until garbage collection.
            with open(os.path.join(src_dir, fname), 'rt', encoding='utf-8') as fin:
                txt = fin.read()  # Slurps whole file

            txt_clean = txt.replace('<br />', ' ').translate(pad_punctuation)
            txt_clean = txt_clean.replace("--", " - ")
            # Collapse every run of whitespace to one space.  The padding
            # above leaves doubled spaces, and removing stray newlines keeps
            # the output strictly one review per line.
            txt_clean = " ".join(txt_clean.split())

            fout.write("%d|%s\n" % (label, txt_clean))
# Write all_pos.txt and all_neg.txt (one "label|text" line per review file)...
get_imdb_marked('pos')
get_imdb_marked('neg')
# ...and show the first lines of each as a sanity check.
! head all_pos.txt
! head all_neg.txt
In [ ]:
# Unix magic : concatenate both label files and shuffle the lines with `shuf`,
# so positive and negative reviews are interleaved in a single file.
! cat all_pos.txt all_neg.txt | shuf > IMDB_all_reviews.txt
# Show the first few lines of the shuffled result.
! head IMDB_all_reviews.txt
In [ ]:
In [ ]:
# If using colab, download the data to your local machine.
# `google.colab` is only importable inside a Colab runtime, so guard the
# import : anywhere else this cell reports that it is skipping the download
# instead of failing with ImportError.
try:
    from google.colab import files
except ImportError:
    print("Not running on Colab - skipping browser download of IMDB_all_reviews.txt")
else:
    files.download('IMDB_all_reviews.txt')  # Better be using Chrome...
In [ ]: