In [ ]:
# Alternative considered: use a ready-made Kaggle dataset, e.g. https://www.kaggle.com/utathya/imdb-review-dataset
# or the competition data at https://www.kaggle.com/c/sentiment-analysis-on-imdb-movie-reviews/data
#! wget https://www.kaggle.com/c/6337/download/train_data.csv
#! ls -l *.csv
#! more train_data.csv

# ...but downloading from Kaggle requires a login, so this route is not used.

In [ ]:
# Alternative considered: a pre-processed CSV hosted on GitHub.

#if not os.path.isfile('imdb_tr.csv'):
#  ! wget https://github.com/SrinidhiRaghavan/AI-Sentiment-Analysis-on-IMDB-Dataset/raw/master/imdb_tr.csv
#  ! ls -l *.csv
#! head imdb_tr.csv

# ...but this version already has its stop-words removed, so it is not used either.

In [ ]:
import os

In [ ]:
# http://ai.stanford.edu/~amaas/data/sentiment/
if not os.path.isfile('aclImdb_v1.tar.gz'):
  ! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
  ! ls -l 
if not os.path.isfile('aclImdb/README'):
  ! tar -xzf aclImdb_v1.tar.gz
  ! ls -l aclImdb
if os.path.isfile('aclImdb/README'):
  print("We have the IMDB data!")
  ! head aclImdb/imdb.vocab

In [ ]:
# Peek at the first few entries of the negative-review training directory
# (`head` keeps the listing short).
! ls -l aclImdb/train/neg | head

In [ ]:
# Ok, so let's make a line-based text file with the right data in it...

def get_imdb_marked(sent):  # sent='pos' or 'neg'
  """Flatten one sentiment's training reviews into a line-based text file.

  Reads every file in ``aclImdb/train/<sent>`` (relative to the current
  working directory), cleans the text, and writes one line per review to
  ``all_<sent>.txt`` in the form ``<label>|<text>``.

  Args:
    sent: Sub-directory name of the sentiment to process. The label written
      is 1 when ``sent == 'pos'`` and 0 for any other value (e.g. 'neg').

  Raises:
    OSError: if the source directory is missing or a file cannot be
      read / written.
  """
  src_dir = 'aclImdb/train/'+sent
  label = 1 if sent=='pos' else 0

  # Explicit UTF-8 on both sides so the result does not depend on the
  # platform's default locale encoding.
  with open('all_'+sent+'.txt', 'wt', encoding='utf-8') as fout:
    # os.listdir() returns entries in arbitrary order; sort so that the
    # output file is identical from run to run.
    for fname in sorted(os.listdir(src_dir)):
      # Context manager closes each review file promptly instead of leaking
      # one handle per review.
      with open(src_dir+'/'+fname, 'rt', encoding='utf-8') as fin:
        txt = fin.read()  # Slurps whole file

      # HTML line breaks become plain spaces.
      txt_clean = txt.replace('<br />', ' ')
      # Pad punctuation with spaces so it splits off from neighbouring words
      # under whitespace tokenisation. Backticks are normalised to
      # apostrophes first; apostrophes only get a leading space.
      txt_clean = (txt_clean.replace("`", "'")
                            .replace("'", " '") .replace('"', ' " ')
                            .replace(',', ' , ').replace('.', ' . ')
                            .replace(':', ' : ').replace('!', ' ! ')
                            .replace('/', ' / ').replace('(', ' ( ').replace(')', ' ) ')
                  )
      # Two passes of double-space collapsing shorten most runs of spaces
      # introduced by the padding above.
      txt_clean = txt_clean.replace("--", " - ").replace("  ", " ").replace("  ", " ")

      fout.write("%d|%s\n" % (label, txt_clean, ))

get_imdb_marked('pos')
get_imdb_marked('neg')
! head all_pos.txt
! head all_neg.txt

In [ ]:
# Concatenate the positive and negative review files and shuffle the lines
# with `shuf`, so the two classes are interleaved in IMDB_all_reviews.txt.
# NOTE(review): no random source is given to `shuf`, so the line order is
# presumably different on every run - confirm if reproducibility matters.
! cat all_pos.txt all_neg.txt | shuf > IMDB_all_reviews.txt
# Preview the first lines of the shuffled file.
! head IMDB_all_reviews.txt

In [ ]:


In [ ]:
# If using colab, download the data to your local machine.
# The import is guarded so that running the whole notebook outside Colab
# does not stop here with an ImportError.
try:
  from google.colab import files
except ImportError:
  print("Not running on Colab - IMDB_all_reviews.txt is in the current directory.")
else:
  files.download('IMDB_all_reviews.txt')   # Better be using Chrome...

In [ ]: