In [36]:
# import libraries
# --- plotting / analysis stack ---------------------------------------------
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
# Widen pandas console output so long review-text columns stay readable.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
# --- scraping / persistence ------------------------------------------------
from bs4 import BeautifulSoup
import requests
import csv
import os
import random
import sys
import json
# NOTE(review): absolute path '/aclImdb/' looks like it should be the
# relative 'aclImdb/' used everywhere else in this notebook -- confirm;
# nothing visible below imports a module from this path.
sys.path.insert(0, '/aclImdb/')
In [37]:
# function to get name of movie from each URL
def get_movie(url, max_retries=3, backoff=5):
    """Return the movie title scraped from an IMDB review page, or None.

    Parameters
    ----------
    url : str          IMDB review-page URL.
    max_retries : int  how many times to attempt the HTTP GET (default 3).
    backoff : float    seconds to sleep between failed attempts (default 5).

    Returns None when the request keeps failing, or when the page has no
    ``<div id="tn15title">`` / no ``<a>`` inside it.

    Note: the original retry loop compared ``requests.get(url)`` against
    ``None``, but requests.get never returns None (it returns a Response or
    raises), so no retry ever happened and connection errors propagated.
    """
    response = None
    for _ in range(max_retries):
        try:
            response = requests.get(url)
        except requests.RequestException:
            response = None  # network error -- treat like a failed attempt
        if response is not None and response.status_code == 200:
            break
        time.sleep(backoff)  # back off before retrying
    if response is None:
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    title_div = soup.find("div", attrs={"id": "tn15title"})
    if title_div is None:
        return None  # review page missing or layout changed
    link = title_div.find("a")
    # Original chained .find("a").get_text() and raised AttributeError
    # when the title div had no anchor; guard instead.
    return link.get_text() if link is not None else None
In [15]:
# get all urls for train and test, neg and pos
def load_urls(path):
    """Return the raw lines of a urls file (one URL per line).

    readlines() keeps the trailing '\n' on every entry; downstream dicts
    key on these raw strings, so the newline is deliberately preserved.
    """
    with open(path, 'r') as f:
        return f.readlines()

# Same read-and-report step for each of the four url files (was four
# copy-pasted with-blocks).  print(...) works under both Python 2 and 3.
train_pos_urls = load_urls('aclImdb/train/urls_pos.txt')
print(len(train_pos_urls))
train_neg_urls = load_urls('aclImdb/train/urls_neg.txt')
print(len(train_neg_urls))
test_pos_urls = load_urls('aclImdb/test/urls_pos.txt')
print(len(test_pos_urls))
test_neg_urls = load_urls('aclImdb/test/urls_neg.txt')
print(len(test_neg_urls))
In [28]:
# create function for scraping all names from IMDB with given list of urls
def get_names(urls, progress_fraction=0.01, pause=0.001):
    """Scrape the movie name for every review URL.

    Parameters
    ----------
    urls : list of str          review-page URLs.
    progress_fraction : float   probability of printing the loop counter on
                                any iteration (crude progress logging).
    pause : float               seconds to sleep between requests (politeness).

    Returns a dict mapping url -> movie name (None where the lookup failed).

    Note: the original definition was entirely commented out, so the call
    below raised NameError; the commented version also ended with
    ``return ()``, discarding the dictionary it built.
    """
    names = dict.fromkeys(urls)  # url -> None until fetched
    for i, url in enumerate(urls):
        if names[url] is None:
            names[url] = get_movie(url)
        if random.random() < progress_fraction:
            print(i)
        time.sleep(pause)
    return names

bob = get_names(train_neg_urls)
In [33]:
# scrape IMDB for negative review training set contained movies
# (The original `% time` on its own line timed nothing; to time this cell,
# put `%%time` as the very first line of the cell.)
RESUME_AT = 7365  # index where a previous, interrupted run stopped -- TODO confirm
# Map every training-set negative-review URL to its movie name
# (None = not fetched yet).
url_dict = dict.fromkeys(train_neg_urls)
i = 0
for url in train_neg_urls[RESUME_AT:]:
    if url_dict[url] is None:
        url_dict[url] = get_movie(url)
    if random.random() < 0.01:  # print the counter ~1% of the time as progress
        print(i)
    i += 1
    time.sleep(0.001)  # brief pause between requests
In [34]:
i  # display the loop counter left over from the scraping cell above
Out[34]:
In [38]:
# Persist the scraped url -> movie-name mapping so the slow scrape need not
# be repeated.  `with` guarantees the handle is closed and flushed even if
# json.dump raises (the original open/close pair leaked on error).
with open("url_movie_train_neg.json", "w") as fp:
    json.dump(url_dict, fp)
In [ ]:
# NOTE(review): this cell is a verbatim duplicate of In[33] above, including
# the hard-coded resume index 7365.  Re-running it rebuilds url_dict from
# scratch and discards everything already scraped.  Keep only one copy of
# this cell, or parameterise the resume index.
% time
import random
# Make a dictionary of URL: movie name
url_dict = dict(zip(train_neg_urls, [None]*len(train_neg_urls)))
i = 0
# 7365 appears to be where a previous interrupted run stopped -- confirm.
for url in train_neg_urls[7365:]:
if url_dict[url] == None:
url_dict[url] = get_movie(url)
# ~1% of iterations print the counter as crude progress logging.
if random.random() < 0.01:
print i
i += 1
time.sleep(0.001)
In [9]:
# Build a DataFrame of the positive training reviews.
# Filenames look like "<reviewIndex>_<stars>.txt"; the index positions into
# train_pos_urls (one URL per review, in file order).
train_pos_names = list(os.walk('aclImdb/train/pos/'))[0][2]
records = []
for review in train_pos_names:
    movieID = int(review.split("_")[0])
    stars = int(review.split("_")[1].split(".")[0])
    # `with` closes each review file (the original leaked every handle).
    with open('aclImdb/train/pos/%(review)s' % {'review': review}, 'r') as fp:
        text = fp.read()
    url = train_pos_urls[movieID]
    records.append({'movie_id': movieID, 'stars': stars, 'positive': True,
                    'text': text, 'url': url, 'movie_name': url_dict[url]})
# Build the frame once from the record list.  The original grew the frame
# row-by-row (quadratic) with review_df.append(pd.DataFrame(reviewDict),
# index=[0]) -- `index` is not an append() kwarg, and review_df was never
# initialised in this cell, so the loop raised on the first iteration.
review_df = pd.DataFrame(
    records,
    columns=['movie_id', 'stars', 'positive', 'text', 'url', 'movie_name'])
Out[9]:
In [10]:
def data_collect(directory, pos, url_dict, urls=None):
    """Collect every review file under `directory` into one DataFrame.

    Parameters
    ----------
    directory : str   review folder, e.g. 'aclImdb/train/pos/'.
    pos : bool        value recorded in the 'positive' column for all rows.
    url_dict : dict   maps review URL -> movie name (missing -> None).
    urls : list of str, optional
        URL list indexed by the review number encoded in each filename.
        Defaults to the module-level ``train_pos_urls`` (the original,
        hard-coded behaviour -- almost certainly wrong for the neg/test
        directories, so pass the matching list explicitly).

    Returns
    -------
    pandas.DataFrame with columns
    ['movie_id', 'stars', 'positive', 'text', 'url', 'movie_name'].

    Fixes vs. the original: pd.DataFrame(scalar_dict) raised ValueError;
    DataFrame.append has no `index` kwarg; review files were never closed;
    appending row-by-row was quadratic.
    """
    if urls is None:
        urls = train_pos_urls  # original behaviour, kept for compatibility
    filenames = list(os.walk(directory))[0][2]  # files directly in `directory`
    records = []
    for review in filenames:
        # Filenames look like "<reviewIndex>_<stars>.txt".
        movie_id = int(review.split("_")[0])
        stars = int(review.split("_")[1].split(".")[0])
        with open(os.path.join(directory, review), 'r') as fp:
            text = fp.read()
        url = urls[movie_id]
        records.append({'movie_id': movie_id, 'stars': stars, 'positive': pos,
                        'text': text, 'url': url,
                        'movie_name': url_dict.get(url)})
    # Build the frame once from the record list.
    return pd.DataFrame(
        records,
        columns=['movie_id', 'stars', 'positive', 'text', 'url', 'movie_name'])
In [ ]:
# The original calls omitted the required third argument `url_dict`
# (TypeError: data_collect() takes exactly 3 arguments) and discarded all
# four returned DataFrames.  Capture each result.  NOTE(review): url_dict
# at this point holds only the train/neg scrape, so movie names will be
# missing for the other three sets until those URLs are scraped too.
train_pos_df = data_collect('aclImdb/train/pos/', True, url_dict)
train_neg_df = data_collect('aclImdb/train/neg/', False, url_dict)
test_pos_df = data_collect('aclImdb/test/pos/', True, url_dict)
test_neg_df = data_collect('aclImdb/test/neg/', False, url_dict)