IMDb Crawler


In [5]:
import urllib
from bs4 import BeautifulSoup as bs

In [6]:
import urllib.request

In [7]:
# IMDb title-search endpoint: s=tt restricts results to titles,
# ttype=ft to feature films, exact=true to exact-title matches.
TITLE_QUERY = (
    'http://www.imdb.com/find'
    '?q={title}&s=tt&ttype=ft&exact=true&ref_=fn_tt_ex'
)

In [8]:
movie_name="Avengers: Age of Ultron"

In [9]:
def convert_title(title):
    """Percent-encode a movie title for use in the IMDb search URL.

    The encoded string is lower-cased afterwards, so hex escapes come
    out lowercase (e.g. ':' -> '%3a' rather than '%3A').
    """
    encoded = urllib.parse.quote(title)
    return encoded.lower()

In [10]:
convert_title(movie_name)


Out[10]:
'avengers%3a%20age%20of%20ultron'

In [11]:
# Fetch the IMDb search-results page for the encoded title and parse it.
query = TITLE_QUERY.format(title=convert_title(movie_name))
search_res = bs(urllib.request.urlopen(query), "html.parser")

In [12]:
res_table = search_res.find_all("table", {"class": "findList"})[0]

In [13]:
# Inspect each search-result row; the '2014' membership check was an
# exploratory year probe (the film turned out to be from 2015 — see output).
for line in res_table.find_all("tr"):
    print('2014' in str(line))
    print(line)


False
<tr class="findResult odd"> <td class="primary_photo"> <a href="/title/tt2395427/?ref_=fn_ft_tt_1"><img src="https://images-na.ssl-images-amazon.com/images/M/MV5BMTM4OGJmNWMtOTM4Ni00NTE3LTg3MDItZmQxYjc4N2JhNmUxXkEyXkFqcGdeQXVyNTgzMDMzMTg@._V1_UX32_CR0,0,32,44_AL_.jpg"/></a> </td> <td class="result_text"> <a href="/title/tt2395427/?ref_=fn_ft_tt_1">Avengers: Age of Ultron</a> (2015) </td> </tr>

In [14]:
first_row = res_table.find_all("tr")[0]

In [15]:
first_row


Out[15]:
<tr class="findResult odd"> <td class="primary_photo"> <a href="/title/tt2395427/?ref_=fn_ft_tt_1"><img src="https://images-na.ssl-images-amazon.com/images/M/MV5BMTM4OGJmNWMtOTM4Ni00NTE3LTg3MDItZmQxYjc4N2JhNmUxXkEyXkFqcGdeQXVyNTgzMDMzMTg@._V1_UX32_CR0,0,32,44_AL_.jpg"/></a> </td> <td class="result_text"> <a href="/title/tt2395427/?ref_=fn_ft_tt_1">Avengers: Age of Ultron</a> (2015) </td> </tr>

Extracting the movie code


In [16]:
import re

In [17]:
MOVIE_CODE_REGEX = r'/title/([a-z0-9]+)/'

In [18]:
movie_code = re.findall(MOVIE_CODE_REGEX, str(first_row))[0]

In [19]:
movie_code


Out[19]:
'tt2395427'

Movie Profile


In [20]:
PROFILE_URL = 'http://www.imdb.com/title/{code}/' #?region=us

In [21]:
cur_profile_url = PROFILE_URL.format(code=movie_code)

In [29]:
prof_page = bs(urllib.request.urlopen(cur_profile_url), "html.parser")

Rating


In [30]:
prof_page.find_all("span", {"itemprop": "ratingValue"})


Out[30]:
[<span itemprop="ratingValue">7.4</span>]

In [31]:
rating = float(prof_page.find_all("span", {"itemprop": "ratingValue"})[0].contents[0])

In [32]:
rating


Out[32]:
7.4

In [33]:
rating_count = int(prof_page.find_all("span", {"itemprop": "ratingCount"})[0].contents[0].replace(',', ''))

In [34]:
rating_count


Out[34]:
501914

Genres


In [35]:
genres = []

In [36]:
for span in prof_page.find_all("span", {"itemprop": "genre"}):
    genres.append(span.contents[0])

In [37]:
genres


Out[37]:
['Action', 'Adventure', 'Sci-Fi']

Review counts


In [38]:
REVIEW_COUNT_REGEX = r'([0-9,]+) ([a-zA-Z]+)'

In [39]:
user_review_count = 0
critic_review_count = 0

In [40]:
# Each reviewCount span reads like "1,128 user" or "650 critic";
# parse the number and dispatch on the kind word.
for span in prof_page.find_all("span", {"itemprop": "reviewCount"}):
    count_str, kind = re.findall(REVIEW_COUNT_REGEX, span.contents[0])[0]
    count = int(count_str.replace(',', ''))
    if kind == 'user':
        user_review_count = count
    elif kind == 'critic':
        critic_review_count = count

In [41]:
user_review_count


Out[41]:
1128

In [42]:
critic_review_count


Out[42]:
650

Metascore


In [43]:
metascore = int(prof_page.find_all("div", {"class": "metacriticScore"})[0].contents[1].contents[0])

In [44]:
metascore


Out[44]:
66

Year


In [45]:
year = int(prof_page.find_all("span", {"id": "titleYear"})[0].contents[1].contents[0])

In [46]:
year


Out[46]:
2015

Duration


In [47]:
MOVIE_DURATION_REGEX = r'PT([0-9]+)M'

In [48]:
duration_str = prof_page.find_all("time", {"itemprop": "duration"})[0]['datetime']

In [49]:
duration_in_minutes = int(re.findall(MOVIE_DURATION_REGEX, duration_str)[0])

In [50]:
duration_in_minutes


Out[50]:
141

Box office section


In [51]:
BOX_CONTENT_REGEX = r"<h3.*>Box Office</h3>([\s\S]+?)<h3"

In [52]:
box_contents = re.findall(BOX_CONTENT_REGEX, str(prof_page))[0]

In [53]:
box_contents


Out[53]:
'\n<div class="txt-block">\n<h4 class="inline">Budget:</h4>        $250,000,000        \n\n      <span class="attribute">(estimated)</span>\n</div>\n<div class="txt-block">\n<h4 class="inline">Opening Weekend:</h4>         $191,271,109        \n\n      (USA)\n      <span class="attribute">(1 May 2015)</span>\n</div>\n<div class="txt-block">\n<h4 class="inline">Gross:</h4>        $458,991,599        \n\n      <span class="attribute">(USA)</span>\n<span class="attribute">(2 October 2015)</span>\n</div>\n<span class="see-more inline">\n<a href="business?ref_=tt_dt_bus" itemprop="url">See more</a>\xa0»\n  </span>\n<hr/>\n'

Budget


In [54]:
BUDGET_REGEX = r"<h4.*>Budget:</h4>\s*\$([0-9,]+)"

In [55]:
budget = int(re.findall(BUDGET_REGEX, box_contents)[0].replace(',', ''))

In [56]:
budget


Out[56]:
250000000

Opening Weekend


In [57]:
from datetime import datetime

In [58]:
OPEN_DATE_REGEX = r"<h4.*>Opening Weekend:</h4>[\s\S]*?\([A-Z]+\)[\s\S]*?\(([0-9a-zA-Z\s]+)\)[\s\S]*?<h4"

In [59]:
open_date_str = re.findall(OPEN_DATE_REGEX, box_contents)[0]

In [60]:
open_date = datetime.strptime(open_date_str, "%d %B %Y").date()

In [61]:
open_date


Out[61]:
datetime.date(2015, 5, 1)

In [62]:
OPEN_PROF_REGEX = r"<h4.*>Opening Weekend:</h4>\s*[\$\£]([0-9,]+)"

In [63]:
opening_weekend_income = int(re.findall(OPEN_PROF_REGEX, box_contents)[0].replace(',', ''))

In [64]:
opening_weekend_income


Out[64]:
191271109

In [65]:
OPEN_PROF_CURRENCY_REGEX = r"<h4.*>Opening Weekend:</h4>\s*([\$\£])[0-9,]+"

In [66]:
opening_weekend_currency = re.findall(OPEN_PROF_CURRENCY_REGEX, box_contents)[0]
opening_weekend_currency


Out[66]:
'$'

Gross


In [67]:
GROSS_DATE_REGEX = r"<h4.*>Gross:</h4>[\s\S]*?\(USA\)[\s\S]*?\(([0-9a-zA-Z\s]+)\)"

In [68]:
gross_date_str = re.findall(GROSS_DATE_REGEX, box_contents)[0]

In [69]:
gross_date = datetime.strptime(gross_date_str, "%d %B %Y").date()

In [70]:
gross_date


Out[70]:
datetime.date(2015, 10, 2)

In [71]:
GROSS_REGEX = r"<h4.*>Gross:</h4>\s*\$([0-9,]+)[\s\S]*?\(USA\)"

In [72]:
gross = int(re.findall(GROSS_REGEX, box_contents)[0].replace(',', ''))

In [73]:
gross


Out[73]:
458991599

Ratings page


In [77]:
RATINGS_URL = 'http://www.imdb.com/title/{code}/ratings'
cur_ratings_url = RATINGS_URL.format(code=movie_code)
ratings_page = bs(urllib.request.urlopen(cur_ratings_url), "html.parser")

In [78]:
tables = ratings_page.find_all("table")

In [79]:
def extract_table(table):
    """Convert an HTML <table> tag into a list of rows, header excluded.

    Each row is a list of the text contents of its <td> cells.
    """
    body_rows = table.find_all("tr")[1:]  # first <tr> is the header row
    return [[cell.get_text() for cell in row.find_all("td")]
            for row in body_rows]

Rating Frequency


In [80]:
hist_table = tables[0]

In [81]:
hist_content = extract_table(hist_table)

In [82]:
# Map each star rating (1-10, third table column) to its vote count
# (first column).
rating_freq = {int(row[2]): int(row[0]) for row in hist_content}
rating_freq


Out[82]:
{1: 7727,
 2: 3058,
 3: 4656,
 4: 8702,
 5: 21639,
 6: 57665,
 7: 124766,
 8: 131974,
 9: 70569,
 10: 71158}

Demographic breakdown


In [83]:
demog_table = tables[1]
demog_content = extract_table(demog_table)
demog_content


Out[83]:
[[' Males ', '\xa0318966', '\xa07.4'],
 [' Females ', '\xa059366', '\xa07.7'],
 [' Aged under 18 ', '\xa04587', '\xa07.9'],
 [' Males under 18 ', '\xa03560', '\xa07.8'],
 [' Females under 18 ', '\xa0993', '\xa08.5'],
 [' Aged 18-29 ', '\xa0191661', '\xa07.5'],
 [' Males Aged 18-29 ', '\xa0156378', '\xa07.5'],
 [' Females Aged 18-29 ', '\xa033357', '\xa07.7'],
 [' Aged 30-44 ', '\xa0128797', '\xa07.3'],
 [' Males Aged 30-44 ', '\xa0110998', '\xa07.2'],
 [' Females Aged 30-44 ', '\xa015930', '\xa07.5'],
 [' Aged 45+ ', '\xa023475', '\xa07.3'],
 [' Males Aged 45+ ', '\xa019628', '\xa07.3'],
 [' Females Aged 45+ ', '\xa03428', '\xa07.5'],
 [' IMDb staff ', '\xa042', '\xa07.1'],
 [' Top 1000 voters ', '\xa0641', '\xa06.7'],
 [' US users ', '\xa058752', '\xa07.5'],
 [' Non-US users ', '\xa0192773', '\xa07.3'],
 ['\xa0'],
 [' IMDb users                         ', '\xa0501914', '\xa07.4']]

In [84]:
votes_per_demo = {}
avg_rating_per_demo = {}

In [85]:
# Fill the per-demographic vote/rating dicts; rows shorter than three
# cells (e.g. the '\xa0' separator row) raise IndexError and are skipped.
for row in demog_content:
    try:
        votes_per_demo[row[0].strip()] = int(row[1])
        avg_rating_per_demo[row[0].strip()] = float(row[2])
    except IndexError:
        pass
print(votes_per_demo)
print(avg_rating_per_demo)


{'Top 1000 voters': 641, 'Females under 18': 993, 'Non-US users': 192773, 'Females Aged 30-44': 15930, 'Males Aged 45+': 19628, 'Males Aged 18-29': 156378, 'US users': 58752, 'Females Aged 45+': 3428, 'IMDb staff': 42, 'Males Aged 30-44': 110998, 'Females': 59366, 'Aged 30-44': 128797, 'IMDb users': 501914, 'Males under 18': 3560, 'Males': 318966, 'Aged under 18': 4587, 'Aged 45+': 23475, 'Aged 18-29': 191661, 'Females Aged 18-29': 33357}
{'Top 1000 voters': 6.7, 'Females under 18': 8.5, 'Non-US users': 7.3, 'Females Aged 30-44': 7.5, 'Males Aged 45+': 7.3, 'Males Aged 18-29': 7.5, 'US users': 7.5, 'Females Aged 45+': 7.5, 'IMDb staff': 7.1, 'Males Aged 30-44': 7.2, 'Females': 7.7, 'Aged 30-44': 7.3, 'IMDb users': 7.4, 'Males under 18': 7.8, 'Males': 7.4, 'Aged under 18': 7.9, 'Aged 45+': 7.3, 'Aged 18-29': 7.5, 'Females Aged 18-29': 7.7}

Business page


In [86]:
BUSINESS_URL = 'http://www.imdb.com/title/{code}/business'
cur_business_url = BUSINESS_URL.format(code=movie_code)
busi_page = bs(urllib.request.urlopen(cur_business_url), "html.parser")
busi_str = str(busi_page)

In [87]:
# #### Budget
# BUDGET_REGEX = r"<h5>Budget</h5>\n\s*\$([0-9,]+)"
# budget_dollar = int(re.findall(BUDGET_REGEX, busi_str)[0].replace(',', ''))

Number of screens (weekends)


In [88]:
WEEKEND_CONTENT_REGEX = r"<h5>Weekend Gross</h5>([\s\S]+?)<h5>"
weekend_contents = re.findall(WEEKEND_CONTENT_REGEX, busi_str)[0]
weekend_contents


Out[88]:
'\n$41,047 (USA) (<a href="/date/10-04/">4 October</a> <a href="/year/2015/">2015</a>) (86 Screens)<br/>$88,796 (USA) (<a href="/date/09-27/">27 September</a> <a href="/year/2015/">2015</a>) (130 Screens)<br/>$142,184 (USA) (<a href="/date/09-20/">20 September</a> <a href="/year/2015/">2015</a>) (170 Screens)<br/>$195,764 (USA) (<a href="/date/09-13/">13 September</a> <a href="/year/2015/">2015</a>) (190 Screens)<br/>$214,741 (USA) (<a href="/date/08-30/">30 August</a> <a href="/year/2015/">2015</a>) (237 Screens)<br/>$80,867 (USA) (<a href="/date/08-23/">23 August</a> <a href="/year/2015/">2015</a>) (91 Screens)<br/>$121,941 (USA) (<a href="/date/08-09/">9 August</a> <a href="/year/2015/">2015</a>) (126 Screens)<br/>$168,683 (USA) (<a href="/date/08-02/">2 August</a> <a href="/year/2015/">2015</a>) (214 Screens)<br/>$468,050 (USA) (<a href="/date/07-19/">19 July</a> <a href="/year/2015/">2015</a>) (292 Screens)<br/>$852,000 (USA) (<a href="/date/07-05/">5 July</a> <a href="/year/2015/">2015</a>) (589 Screens)<br/>$1,688,938 (USA) (<a href="/date/06-28/">28 June</a> <a href="/year/2015/">2015</a>) (1,097 Screens)<br/>$2,847,404 (USA) (<a href="/date/06-21/">21 June</a> <a href="/year/2015/">2015</a>) (1,662 Screens)<br/>$3,641,000 (USA) (<a href="/date/06-14/">14 June</a> <a href="/year/2015/">2015</a>) (2,156 Screens)<br/>$6,339,663 (USA) (<a href="/date/06-07/">7 June</a> <a href="/year/2015/">2015</a>) (2,471 Screens)<br/>$11,401,402 (USA) (<a href="/date/05-31/">31 May</a> <a href="/year/2015/">2015</a>) (3,228 Screens)<br/>$21,691,000 (USA) (<a href="/date/05-24/">24 May</a> <a href="/year/2015/">2015</a>) (3,727 Screens)<br/>$38,837,000 (USA) (<a href="/date/05-17/">17 May</a> <a href="/year/2015/">2015</a>) (4,276 Screens)<br/>$77,746,929 (USA) (<a href="/date/05-10/">10 May</a> <a href="/year/2015/">2015</a>) (4,276 Screens)<br/>$191,271,109 (USA) (<a href="/date/05-03/">3 May</a> <a href="/year/2015/">2015</a>) (4,276 Screens)<br/>\n<br/>\n'

In [89]:
# One '(N Screens)' figure per USA weekend entry in the business page.
US_OPEN_WEEKEND_REGEX = r"\$[\s\S]*?\(USA\)[\s\S]*?\(([0-9,]*) Screens\)"
num_screens_list = []
for screens in re.findall(US_OPEN_WEEKEND_REGEX, weekend_contents):
    num_screens_list.append(int(screens.replace(',', '')))
num_screens_list


Out[89]:
[86,
 130,
 170,
 190,
 237,
 91,
 126,
 214,
 292,
 589,
 1097,
 1662,
 2156,
 2471,
 3228,
 3727,
 4276,
 4276,
 4276]

In [113]:
[v for v in reversed(num_screens_list)]


Out[113]:
[4276,
 4276,
 4276,
 3727,
 3228,
 2471,
 2156,
 1662,
 1097,
 589,
 292,
 214,
 126,
 91,
 237,
 190,
 170,
 130,
 86]

In [90]:
import math

In [91]:
max_screens = max(num_screens_list)
avg_screens = sum(num_screens_list) / len(num_screens_list)
num_weekends = len(num_screens_list)

In [92]:
# ### Gross Earnings
# GROSS_CONTENT_REGEX = r"<h5>Gross</h5>([\s\S]+?)<h5>"
# gross_contents = re.findall(GROSS_CONTENT_REGEX, busi_str)[0]
# GROSS_REGEX = r"<h5>Gross</h5>\n\s*\$([0-9,]+)\s*\(USA\)"
# gross_inc_dollar = int(re.findall(GROSS_REGEX, busi_str)[0].replace(',', ''))

Release Info Page


In [93]:
RELEASE_URL = 'http://www.imdb.com/title/{code}/releaseinfo'
cur_release_url = RELEASE_URL.format(code=movie_code)
release_page = bs(urllib.request.urlopen(cur_release_url), "html.parser")

In [94]:
release_table = release_page.find_all("table", {"id": "release_dates"})[0]

In [95]:
# Keep the stringified release-date rows that mention the USA
# (the first <tr> is the table header).
us_rows = [str(row)
           for row in release_table.find_all("tr")[1:]
           if 'USA' in str(row)]

In [96]:
# Raw string: '\s', '\d' are regex escapes, not Python string escapes.
# The non-raw original relied on Python preserving invalid escape
# sequences, which is deprecated behavior.
# Captures (day, month-name, year) from a USA row whose last cell is
# empty (i.e. a plain release, no attribute like '(premiere)').
USA_ROW_REGEX = r"<tr[\s\S]*?USA[\s\S]*?(\d\d?)\s+([a-zA-Z]+)[\s\S]*?(\d\d\d\d)[\s\S]*?<td></td>[\s\S]*?</tr>"

In [97]:
# Extract day / month-name / year from the last matching USA row.
for row in us_rows:
    m = re.match(USA_ROW_REGEX, row)
    if m:
        release_day = int(m.group(1))
        release_month = m.group(2)
        release_year = int(m.group(3))

In [98]:
release_day


Out[98]:
1

In [99]:
release_month


Out[99]:
'May'

In [100]:
release_year


Out[100]:
2015

Reviews Page


In [101]:
from datetime import datetime

In [105]:
_REVIEWS_URL = 'http://www.imdb.com/title/{code}/reviews-index?start=0;count=9999'
cur_reviews_url = _REVIEWS_URL.format(code=movie_code)
reviews_page = bs(urllib.request.urlopen(cur_reviews_url), "html.parser")

In [106]:
reviews = reviews_page.find_all("td", {"class": "comment-summary"})

In [107]:
# Captures the N of 'alt="N/10' in the star-rating image. For a 10/10
# review the single-digit branch fails on '/10' and the engine
# backtracks into the '10' alternative, so both forms are handled.
_USER_REVIEW_RATING_REGEX = r"alt=\"(\d|10)/10"

In [108]:
user_reviews = []

In [109]:
# Parse each review summary cell into a dict; any review missing an
# expected field (rating image, date, title link, user link) triggers
# an exception and is skipped wholesale (best-effort scrape).
for review in reviews:
    try:
        # rating: the N of the "N/10" alt text on the stars image
        rating = int(re.findall(_USER_REVIEW_RATING_REGEX, str(review))[0])
        # date: "on 13 May 2015" style suffix in the summary text
        date = datetime.strptime(re.findall(r"on (\d{1,2} [a-zA-Z]+ \d{4})", str(review))[0], "%d %B %Y").date()
        contents = review.find_all('a', href=re.compile(r'reviews.+?'))[0].contents[0]
        # second /user/ link is taken as the reviewer name — presumably
        # the first is an avatar/image link; TODO confirm against markup
        user = review.find_all('a', href=re.compile(r'/user/.+?'))[1].contents[0]
        user_reviews.append({'rating': rating, 'date': date, 'contents': contents, 'user': user})
    except Exception:
        pass

In [110]:
len(user_reviews)


Out[110]:
1088

Uniting Dataframes


In [ ]:
import pandas as pd

In [ ]:
df = pd.read_csv('/Users/shaypalachy/clones/rotten_needles/data/movie_profiles.csv')

In [ ]:
df.columns

In [ ]:
import os
from rotten_needles.imdb_crawl.jsondate import (load, dump)

In [ ]:
# Load every per-movie JSON profile into one DataFrame.
# The directory path was duplicated inline twice; hoisted to a single
# constant. TODO: make configurable instead of a hardcoded user path.
PROFILES_DIR = '/Users/shaypalachy/clones/rotten_needles/data/movie_profiles'
profiles = []
for profile_file in os.listdir(PROFILES_DIR):
    print('Reading {}'.format(profile_file))
    file_path = os.path.join(PROFILES_DIR, profile_file)
    with open(file_path, 'r') as json_file:
        profiles.append(load(json_file))
df = pd.DataFrame(profiles)

In [ ]:
# .ix was deprecated and removed in pandas 1.0; use positional .iloc.
df.iloc[0]

In [ ]:
# NOTE(review): dead code — this raw-label list is immediately shadowed
# by the normalized (lowercase/underscore) version in the next cell.
DEMOGRAPHICS = ['Aged under 18', 'Males under 18', 'Males Aged 45+', 'Females', 'Males Aged 18-29', 'IMDb staff', 'IMDb users', 'Males', 'Aged 30-44', 'Females Aged 45+', 'Aged 18-29', 'Females Aged 18-29', 'Aged 45+', 'Males Aged 30-44', 'Top 1000 voters', 'Females under 18', 'Females Aged 30-44', 'US users', 'Non-US users']

In [ ]:
DEMOGRAPHICS = ['aged_under_18',
 'males_under_18',
 'males_aged_45+',
 'females',
 'males_aged_18-29',
 'imdb_staff',
 'imdb_users',
 'males',
 'aged_30-44',
 'females_aged_45+',
 'aged_18-29',
 'females_aged_18-29',
 'aged_45+',
 'males_aged_30-44',
 'top_1000_voters',
 'females_under_18',
 'females_aged_30-44',
 'us_users',
 'non-us_users']

In [ ]:
def _parse_string(string):
    return string.lower().strip().replace(' ', '_')

In [ ]:
def decompose_dict_column(df, colname, allowed_cols):
    """Expand a column of dicts into one column per allowed key.

    Keys not listed in `allowed_cols` are discarded. The new columns are
    named '<colname>.<key>' and the original dict column is dropped from
    the returned frame.
    """
    expanded = df[colname].apply(pd.Series)
    disallowed = [col for col in expanded.columns if col not in allowed_cols]
    expanded = expanded.drop(disallowed, axis=1)
    expanded.columns = ['{}.{}'.format(colname, col) for col in expanded.columns]
    remainder = df.drop([colname], axis=1)
    return pd.concat([remainder, expanded], axis=1)

In [ ]:
decompose_dict_column(df, 'avg_rating_per_demo', DEMOGRAPHICS);

In [ ]:
decompose_dict_column(df, 'votes_per_demo', DEMOGRAPHICS);

In [ ]:
decompose_dict_column(df, 'rating_freq', [str(i) for i in range(1,11)])

In [ ]:
genre_set = set([genre for genre_list in df.genres.dropna() for genre in genre_list])
genre_set

In [ ]:
def dummy_list_column(df, colname):
    """One-hot encode a column whose cells are lists of values.

    Every distinct value found across all lists becomes an indicator
    column (via decompose_dict_column). Cells that are not iterable
    (e.g. NaN from dropna'd rows) get all-zero indicators.

    NOTE: mutates df[colname] in place before decomposing.
    """
    distinct_values = {v for values in df[colname].dropna() for v in values}

    def as_indicator_dict(values):
        try:
            return {v: int(v in values) for v in distinct_values}
        except TypeError:  # non-iterable cell (e.g. NaN)
            return {v: 0 for v in distinct_values}

    df[colname] = df[colname].apply(as_indicator_dict)
    return decompose_dict_column(df, colname, list(distinct_values))

In [ ]:
dummy_list_column(df, 'genres')

In [ ]:
df.replace?

In [ ]:
# NOTE(review): genre_list_to_dict is not defined at this scope (a
# similarly-named helper exists only inside dummy_list_column), so this
# cell raises NameError as written.
df['genres'] = df['genres'].apply(genre_list_to_dict)

In [ ]: