In [36]:
# import libraries

%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from bs4 import BeautifulSoup
import requests
import csv
import os
import random
import sys
import json
sys.path.insert(0, '/aclImdb/')

In [37]:
# function to get name of movie from each URL

def get_movie(url):
    pageText = requests.get(url)
    while (pageText==None):
        time.sleep(5)
        pageText = requests.get(url)
    soup = BeautifulSoup(pageText.text,"html.parser")
    if soup == None or soup.find("div",attrs={"id":"tn15title"}) == None:
        return None
    return soup.find("div",attrs={"id":"tn15title"}).find("a").get_text()

In [15]:
# get all urls for train and test, neg and pos
with open('aclImdb/train/urls_pos.txt','r') as f:
    train_pos_urls = f.readlines()
    print len(train_pos_urls)
    
with open('aclImdb/train/urls_neg.txt','r') as f:
    train_neg_urls = f.readlines()
    print len(train_neg_urls)

with open('aclImdb/test/urls_pos.txt','r') as f:
    test_pos_urls = f.readlines()
    print len(test_pos_urls)
    
with open('aclImdb/test/urls_neg.txt','r') as f:
    test_neg_urls = f.readlines()
    print len(test_neg_urls)


12500
12500
12500
12500

In [28]:
# create function for scraping all names from IMDB with given list of urls

# def get_names (urls):
#     url_movie_tr_neg_dict = dict(zip((urls), [None]*len(urls)))
#     i=0
#     for url in url_movie_tr_neg_dict:
#         if url_movie_tr_neg_dict[url] == None:
#             url_movie_tr_neg_dict[url] = get_movie(url)
#         if random.random() < 0.01:
#             print i
#         i+=1
#         time.sleep(0.001)
#     return ()

bob = get_names(train_neg_urls)


24
123
433
593
623
639
743
766
768
813
842
---------------------------------------------------------------------------
ConnectionError                           Traceback (most recent call last)
<ipython-input-28-49c21a3d6be3> in <module>()
     12     time.sleep(0.001)
     13 
---> 14 get_names(train_neg_urls)

<ipython-input-28-49c21a3d6be3> in get_names(urls)
      6     for url in url_movie_tr_neg_dict:
      7         if url_movie_tr_neg_dict[url] == None:
----> 8             url_movie_tr_neg_dict[url] = get_movie(url)
      9         if random.random() < 0.01:
     10             print i

<ipython-input-16-410bf7401aef> in get_movie(url)
      2 
      3 def get_movie(url):
----> 4     pageText = requests.get(url)
      5     while (pageText==None):
      6         time.sleep(10)

/Users/jasondong/anaconda/lib/python2.7/site-packages/requests/api.pyc in get(url, params, **kwargs)
     67 
     68     kwargs.setdefault('allow_redirects', True)
---> 69     return request('get', url, params=params, **kwargs)
     70 
     71 

/Users/jasondong/anaconda/lib/python2.7/site-packages/requests/api.pyc in request(method, url, **kwargs)
     48 
     49     session = sessions.Session()
---> 50     response = session.request(method=method, url=url, **kwargs)
     51     # By explicitly closing the session, we avoid leaving sockets open which
     52     # can trigger a ResourceWarning in some cases, and look like a memory leak

/Users/jasondong/anaconda/lib/python2.7/site-packages/requests/sessions.pyc in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    466         }
    467         send_kwargs.update(settings)
--> 468         resp = self.send(prep, **send_kwargs)
    469 
    470         return resp

/Users/jasondong/anaconda/lib/python2.7/site-packages/requests/sessions.pyc in send(self, request, **kwargs)
    574 
    575         # Send the request
--> 576         r = adapter.send(request, **kwargs)
    577 
    578         # Total elapsed time of the request (approximately)

/Users/jasondong/anaconda/lib/python2.7/site-packages/requests/adapters.pyc in send(self, request, stream, timeout, verify, cert, proxies)
    410 
    411         except (ProtocolError, socket.error) as err:
--> 412             raise ConnectionError(err, request=request)
    413 
    414         except MaxRetryError as e:

ConnectionError: ('Connection aborted.', BadStatusLine("''",))

In [33]:
# scrape IMDB for negative review training set contained movies
% time
import random
# Make a dictionary of URL: movie name
url_dict = dict(zip(train_neg_urls, [None]*len(train_neg_urls)))
i = 0
for url in train_neg_urls[7365:]:
    if url_dict[url] == None:
        url_dict[url] = get_movie(url)
    if random.random() < 0.01:
        print i
    i += 1
    time.sleep(0.001)


CPU times: user 3 µs, sys: 5 µs, total: 8 µs
Wall time: 7.15 µs
46
73
319
478
546
599
701
799
813
972
1233
1288
1306
1480
1534
1660
1708
1757
1909
1945
2011
2014
2081
2430
2439
2453
2535
2554
2754
2877
2912
3012
3062
3146
3169
3173
3194
3305
3318
3422
3464
3702
3819
3922
4009
4053
4101
4215
4316
4322
4327
4369
4406
4500
4550
4818
4882
4959
4992
4993

In [34]:
i


Out[34]:
5135

In [38]:
fp = open("url_movie_train_neg.json","w")
json.dump(url_dict,fp)
fp.close()

In [ ]:
% time
import random
# Make a dictionary of URL: movie name
url_dict = dict(zip(train_neg_urls, [None]*len(train_neg_urls)))
i = 0
for url in train_neg_urls[7365:]:
    if url_dict[url] == None:
        url_dict[url] = get_movie(url)
    if random.random() < 0.01:
        print i
    i += 1
    time.sleep(0.001)

In [9]:
train_pos_names = list(os.walk('aclImdb/train/pos/'))[0][2]
for review in train_pos_names:
    stars = int(review.split("_")[1].split(".")[0])
    movieID = int(review.split("_")[0])
    fp = open('aclImdb/train/pos/%(review)s' % {'review': review}, 'r')
    text = fp.read()
    pos = True
    url = train_pos_urls[movieID]
    movie_name = url_dict[url]
    reviewDict = {'movie_id': movieID, 'stars': stars, 'positive': pos, 'text': text, 'url': url, 'movie_name': movie_name}
    review_df = review_df.append(pd.DataFrame(reviewDict),index=[0])


Out[9]:
'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [10]:
def data_collect (directory,pos,url_dict):
    review_df = pd.DataFrame(columns=['movie_id', 'stars', 'positive', 'text', 'url', 'movie_name'])
    train_pos_names = list(os.walk(directory))[0][2]
    for review in train_pos_names:
        stars = int(review.split("_")[1].split(".")[0])
        movieID = int(review.split("_")[0])
        fp = open('%(dir)s%(review)s' % {'dir':directory, 'review': review}, 'r')
        text = fp.read()
        pos = pos
        url = train_pos_urls[movieID]
        movie_name = url_dict[url]
        reviewDict = {'movie_id': movieID, 'stars': stars, 'positive': pos, 'text': text, 'url': url, 'movie_name': movie_name}
        review_df = review_df.append(pd.DataFrame(reviewDict),index=[0])
    return (review_df)

In [ ]:
data_collect('aclImdb/train/pos/',True)
data_collect('aclImdb/train/neg/',False)
data_collect('aclImdb/test/pos/',True)
data_collect('aclImdb/test/neg/',False)