In [5]:
import urllib
from bs4 import BeautifulSoup as bs
In [6]:
import urllib.request
In [7]:
# IMDb "find" endpoint template; {title} is filled with a URL-quoted movie title.
# Query params: s=tt restricts to titles, ttype=ft to feature films, exact=true
# to exact-title matches.
TITLE_QUERY = (
'http://www.imdb.com/find'
'?q={title}&s=tt&ttype=ft&exact=true&ref_=fn_tt_ex'
)
In [8]:
# Example movie used throughout this notebook.
movie_name="Avengers: Age of Ultron"
In [9]:
def convert_title(title):
    """Percent-encode a movie title for use in an IMDb search URL.

    Note: the trailing .lower() also lowercases percent escapes
    (e.g. '%3A' becomes '%3a'), matching the original behavior.
    """
    quoted = urllib.parse.quote(title)
    return quoted.lower()
In [10]:
# Preview the URL-encoded title.
convert_title(movie_name)
Out[10]:
In [11]:
# Fetch the IMDb search-results page and parse it with BeautifulSoup.
query = TITLE_QUERY.format(title=convert_title(movie_name))
search_res = bs(urllib.request.urlopen(query), "html.parser")
In [12]:
# The first table with class "findList" holds the title matches.
res_table = search_res.find_all("table", {"class": "findList"})[0]
In [13]:
# Eyeball each result row (checking which rows mention a given year).
for line in res_table.find_all("tr"):
print('2014' in str(line))
print(line)
In [14]:
# Assume the top search result is the desired movie.
first_row = res_table.find_all("tr")[0]
In [15]:
first_row
Out[15]:
In [16]:
import re
In [17]:
# Captures the IMDb title code (e.g. "tt2395427") from a /title/<code>/ link.
MOVIE_CODE_REGEX = r'/title/([a-z0-9]+)/'
In [18]:
# Take the first code found in the top result row's HTML.
movie_code = re.findall(MOVIE_CODE_REGEX, str(first_row))[0]
In [19]:
movie_code
Out[19]:
In [20]:
PROFILE_URL = 'http://www.imdb.com/title/{code}/' #?region=us
In [21]:
cur_profile_url = PROFILE_URL.format(code=movie_code)
In [29]:
# Fetch and parse the movie's main profile page.
prof_page = bs(urllib.request.urlopen(cur_profile_url), "html.parser")
In [30]:
# Inspect the rating element(s) before extracting.
prof_page.find_all("span", {"itemprop": "ratingValue"})
Out[30]:
In [31]:
# Average user rating (0-10 scale) from the schema.org markup.
rating = float(prof_page.find_all("span", {"itemprop": "ratingValue"})[0].contents[0])
In [32]:
rating
Out[32]:
In [33]:
# Number of user votes; thousands separators stripped before int().
rating_count = int(prof_page.find_all("span", {"itemprop": "ratingCount"})[0].contents[0].replace(',', ''))
In [34]:
rating_count
Out[34]:
In [35]:
# Collect the movie's genre labels from the profile page.
genres = []
In [36]:
for span in prof_page.find_all("span", {"itemprop": "genre"}):
genres.append(span.contents[0])
In [37]:
genres
Out[37]:
In [38]:
# Matches strings like "1,234 user" / "567 critic" inside reviewCount spans.
REVIEW_COUNT_REGEX = r'([0-9,]+) ([a-zA-Z]+)'
In [39]:
# Defaults of 0 in case either span is missing from the page.
user_review_count = 0
critic_review_count = 0
In [40]:
for span in prof_page.find_all("span", {"itemprop": "reviewCount"}):
span_str = span.contents[0]
res = re.findall(REVIEW_COUNT_REGEX, span_str)[0]
if res[1] == 'user':
user_review_count = int(res[0].replace(',', ''))
elif res[1] == 'critic':
critic_review_count = int(res[0].replace(',', ''))
In [41]:
user_review_count
Out[41]:
In [42]:
critic_review_count
Out[42]:
In [43]:
# Metacritic score; .contents[1].contents[0] digs into the nested score markup.
metascore = int(prof_page.find_all("div", {"class": "metacriticScore"})[0].contents[1].contents[0])
In [44]:
metascore
Out[44]:
In [45]:
# Release year from the titleYear span (second child is the year link).
year = int(prof_page.find_all("span", {"id": "titleYear"})[0].contents[1].contents[0])
In [46]:
year
Out[46]:
In [47]:
# ISO-8601 duration, e.g. "PT141M".
# NOTE(review): only matches minutes-only durations; "PT2H21M" forms would
# capture the wrong group — confirm IMDb always emits PT<minutes>M here.
MOVIE_DURATION_REGEX = r'PT([0-9]+)M'
In [48]:
duration_str = prof_page.find_all("time", {"itemprop": "duration"})[0]['datetime']
In [49]:
duration_in_minutes = int(re.findall(MOVIE_DURATION_REGEX, duration_str)[0])
In [50]:
duration_in_minutes
Out[50]:
In [51]:
# Grab the raw HTML between the "Box Office" header and the next <h3>.
# [\s\S] matches any char including newlines (no re.DOTALL needed); +? is lazy.
BOX_CONTENT_REGEX = r"<h3.*>Box Office</h3>([\s\S]+?)<h3"
In [52]:
box_contents = re.findall(BOX_CONTENT_REGEX, str(prof_page))[0]
In [53]:
box_contents
Out[53]:
In [54]:
# Dollar budget figure following the "Budget:" header.
BUDGET_REGEX = r"<h4.*>Budget:</h4>\s*\$([0-9,]+)"
In [55]:
budget = int(re.findall(BUDGET_REGEX, box_contents)[0].replace(',', ''))
In [56]:
budget
In [57]:
from datetime import datetime
In [58]:
# Opening-weekend date: skips the first parenthesized token (a currency/region
# tag like "(USA)") and captures the second, which holds the date text.
OPEN_DATE_REGEX = r"<h4.*>Opening Weekend:</h4>[\s\S]*?\([A-Z]+\)[\s\S]*?\(([0-9a-zA-Z\s]+)\)[\s\S]*?<h4"
In [59]:
open_date_str = re.findall(OPEN_DATE_REGEX, box_contents)[0]
In [60]:
# Parse e.g. "1 May 2015" into a datetime.date.
open_date = datetime.strptime(open_date_str, "%d %B %Y").date()
In [61]:
open_date
Out[61]:
In [62]:
# Opening-weekend gross; accepts either $ or £ before the amount.
OPEN_PROF_REGEX = r"<h4.*>Opening Weekend:</h4>\s*[\$\£]([0-9,]+)"
In [63]:
opening_weekend_income = int(re.findall(OPEN_PROF_REGEX, box_contents)[0].replace(',', ''))
In [64]:
opening_weekend_income
Out[64]:
In [65]:
# Same pattern, but capturing the currency symbol instead of the amount.
OPEN_PROF_CURRENCY_REGEX = r"<h4.*>Opening Weekend:</h4>\s*([\$\£])[0-9,]+"
In [66]:
opening_weekend_currency = re.findall(OPEN_PROF_CURRENCY_REGEX, box_contents)[0]
opening_weekend_currency
Out[66]:
In [67]:
# Date associated with the USA gross figure (parenthesized after "(USA)").
GROSS_DATE_REGEX = r"<h4.*>Gross:</h4>[\s\S]*?\(USA\)[\s\S]*?\(([0-9a-zA-Z\s]+)\)"
In [68]:
gross_date_str = re.findall(GROSS_DATE_REGEX, box_contents)[0]
In [69]:
gross_date = datetime.strptime(gross_date_str, "%d %B %Y").date()
In [70]:
gross_date
Out[70]:
In [71]:
# USA gross earnings in dollars.
GROSS_REGEX = r"<h4.*>Gross:</h4>\s*\$([0-9,]+)[\s\S]*?\(USA\)"
In [72]:
gross = int(re.findall(GROSS_REGEX, box_contents)[0].replace(',', ''))
In [73]:
gross
Out[73]:
In [77]:
# Fetch the dedicated ratings page for histogram and demographic breakdowns.
RATINGS_URL = 'http://www.imdb.com/title/{code}/ratings'
cur_ratings_url = RATINGS_URL.format(code=movie_code)
ratings_page = bs(urllib.request.urlopen(cur_ratings_url), "html.parser")
In [78]:
# All tables on the ratings page; below, [0] is the rating histogram and [1]
# the per-demographic breakdown.
tables = ratings_page.find_all("table")
In [79]:
def extract_table(table):
    """Convert a BeautifulSoup HTML table into a list of rows.

    Each row is a list of the text contents of its <td> cells.
    The first <tr> (header row) is skipped.
    """
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = [cell.get_text() for cell in tr.find_all("td")]
        rows.append(cells)
    return rows
In [80]:
# Rating histogram: votes per star value (1-10).
hist_table = tables[0]
In [81]:
hist_content = extract_table(hist_table)
In [82]:
# Map star value (row[2]) -> vote count (row[0]).
rating_freq = {}
for row in hist_content:
rating_freq[int(row[2])] = int(row[0])
rating_freq
Out[82]:
In [83]:
# Demographic breakdown: votes and average rating per demographic group.
demog_table = tables[1]
demog_content = extract_table(demog_table)
demog_content
Out[83]:
In [84]:
votes_per_demo = {}
avg_rating_per_demo = {}
In [85]:
# Rows without enough cells (e.g. spacer rows) are skipped via IndexError.
for row in demog_content:
try:
votes_per_demo[row[0].strip()] = int(row[1])
avg_rating_per_demo[row[0].strip()] = float(row[2])
except IndexError:
pass
print(votes_per_demo)
print(avg_rating_per_demo)
In [86]:
# Fetch the business page (weekend grosses, screens, etc.).
BUSINESS_URL = 'http://www.imdb.com/title/{code}/business'
cur_business_url = BUSINESS_URL.format(code=movie_code)
busi_page = bs(urllib.request.urlopen(cur_business_url), "html.parser")
busi_str = str(busi_page)
In [87]:
# #### Budget
# BUDGET_REGEX = r"<h5>Budget</h5>\n\s*\$([0-9,]+)"
# budget_dollar = int(re.findall(BUDGET_REGEX, busi_str)[0].replace(',', ''))
In [88]:
# Raw HTML between the "Weekend Gross" header and the next <h5>.
WEEKEND_CONTENT_REGEX = r"<h5>Weekend Gross</h5>([\s\S]+?)<h5>"
weekend_contents = re.findall(WEEKEND_CONTENT_REGEX, busi_str)[0]
weekend_contents
Out[88]:
In [89]:
# Number of US screens per weekend, e.g. "$... (USA) (4,276 Screens)".
US_OPEN_WEEKEND_REGEX = r"\$[\s\S]*?\(USA\)[\s\S]*?\(([0-9,]*) Screens\)"
num_screens_list = [int(match.replace(',','')) for match in re.findall(US_OPEN_WEEKEND_REGEX, weekend_contents)]
num_screens_list
Out[89]:
In [113]:
# Screens in chronological order (page lists most-recent weekend first).
[v for v in reversed(num_screens_list)]
Out[113]:
In [90]:
# NOTE(review): math is imported but not used in this notebook as shown.
import math
In [91]:
# Simple per-movie screen statistics.
max_screens = max(num_screens_list)
avg_screens = sum(num_screens_list) / len(num_screens_list)
num_weekends = len(num_screens_list)
In [92]:
# ### Gross Earnings
# GROSS_CONTENT_REGEX = r"<h5>Gross</h5>([\s\S]+?)<h5>"
# gross_contents = re.findall(GROSS_CONTENT_REGEX, busi_str)[0]
# GROSS_REGEX = r"<h5>Gross</h5>\n\s*\$([0-9,]+)\s*\(USA\)"
# gross_inc_dollar = int(re.findall(GROSS_REGEX, busi_str)[0].replace(',', ''))
In [93]:
# Fetch the release-info page to find the USA theatrical release date.
RELEASE_URL = 'http://www.imdb.com/title/{code}/releaseinfo'
cur_release_url = RELEASE_URL.format(code=movie_code)
release_page = bs(urllib.request.urlopen(cur_release_url), "html.parser")
In [94]:
release_table = release_page.find_all("table", {"id": "release_dates"})[0]
In [95]:
# Keep only rows mentioning USA (header row skipped).
us_rows = []
for row in release_table.find_all("tr")[1:]:
row_str = str(row)
if 'USA' in row_str:
us_rows.append(row_str)
In [96]:
# Captures (day, month-name, year) from a USA row whose last cell is empty
# (i.e. no qualifier like "premiere" or "limited").
USA_ROW_REGEX = "<tr[\s\S]*?USA[\s\S]*?(\d\d?)\s+([a-zA-Z]+)[\s\S]*?(\d\d\d\d)[\s\S]*?<td></td>[\s\S]*?</tr>"
In [97]:
# NOTE(review): if several rows match, the LAST match wins — confirm that is
# the intended release record.
for row in us_rows:
if re.match(USA_ROW_REGEX, row):
release = re.findall(USA_ROW_REGEX, row)[0]
release_day = int(release[0])
release_month = release[1]
release_year = int(release[2])
In [98]:
release_day
Out[98]:
In [99]:
release_month
Out[99]:
In [100]:
release_year
In [101]:
from datetime import datetime
In [105]:
# Fetch the full user-reviews index (count=9999 requests all reviews at once).
_REVIEWS_URL = 'http://www.imdb.com/title/{code}/reviews-index?start=0;count=9999'
cur_reviews_url = _REVIEWS_URL.format(code=movie_code)
reviews_page = bs(urllib.request.urlopen(cur_reviews_url), "html.parser")
In [106]:
reviews = reviews_page.find_all("td", {"class": "comment-summary"})
In [107]:
# Star rating from the image alt text, e.g. alt="7/10"; matches 1-10.
_USER_REVIEW_RATING_REGEX = r"alt=\"(\d|10)/10"
In [108]:
user_reviews = []
In [109]:
# Build one dict per review: rating, date, summary text, and username.
# NOTE(review): the bare `except Exception: pass` silently drops reviews that
# miss any field (e.g. unrated reviews) — consider logging skipped ones.
for review in reviews:
try:
rating = int(re.findall(_USER_REVIEW_RATING_REGEX, str(review))[0])
date = datetime.strptime(re.findall(r"on (\d{1,2} [a-zA-Z]+ \d{4})", str(review))[0], "%d %B %Y").date()
contents = review.find_all('a', href=re.compile(r'reviews.+?'))[0].contents[0]
user = review.find_all('a', href=re.compile(r'/user/.+?'))[1].contents[0]
user_reviews.append({'rating': rating, 'date': date, 'contents': contents, 'user': user})
except Exception:
pass
In [110]:
len(user_reviews)
Out[110]:
In [ ]:
import pandas as pd
In [ ]:
# NOTE(review): hardcoded absolute local path — breaks on any other machine;
# prefer a configurable DATA_DIR.
df = pd.read_csv('/Users/shaypalachy/clones/rotten_needles/data/movie_profiles.csv')
In [ ]:
df.columns
In [ ]:
import os
from rotten_needles.imdb_crawl.jsondate import (load, dump)
In [ ]:
# Load every per-movie JSON profile into a list of dicts, then a DataFrame.
profiles = []
for profile_file in os.listdir('/Users/shaypalachy/clones/rotten_needles/data/movie_profiles'):
print('Reading {}'.format(profile_file))
file_path = os.path.join('/Users/shaypalachy/clones/rotten_needles/data/movie_profiles', profile_file)
with open(file_path, 'r') as json_file:
profiles.append(load(json_file))
df = pd.DataFrame(profiles)
In [ ]:
# NOTE(review): DataFrame.ix is deprecated/removed in modern pandas; use
# df.iloc[0] (positional) or df.loc[0] (label) instead.
df.ix[0]
In [ ]:
# Demographic group labels as they appear on the IMDb ratings page.
# NOTE(review): this definition is immediately shadowed by the normalized
# (lowercase/underscore) version in the next cell — keep only one.
DEMOGRAPHICS = ['Aged under 18', 'Males under 18', 'Males Aged 45+', 'Females', 'Males Aged 18-29', 'IMDb staff', 'IMDb users', 'Males', 'Aged 30-44', 'Females Aged 45+', 'Aged 18-29', 'Females Aged 18-29', 'Aged 45+', 'Males Aged 30-44', 'Top 1000 voters', 'Females under 18', 'Females Aged 30-44', 'US users', 'Non-US users']
In [ ]:
# Normalized demographic keys (lowercase, spaces -> underscores), matching
# the output of _parse_string below.
DEMOGRAPHICS = ['aged_under_18',
'males_under_18',
'males_aged_45+',
'females',
'males_aged_18-29',
'imdb_staff',
'imdb_users',
'males',
'aged_30-44',
'females_aged_45+',
'aged_18-29',
'females_aged_18-29',
'aged_45+',
'males_aged_30-44',
'top_1000_voters',
'females_under_18',
'females_aged_30-44',
'us_users',
'non-us_users']
In [ ]:
def _parse_string(string):
return string.lower().strip().replace(' ', '_')
In [ ]:
def decompose_dict_column(df, colname, allowed_cols):
    """Expand a dict-valued column into separate prefixed columns.

    Each key of the dicts in df[colname] becomes a column named
    '<colname>.<key>'; keys not in allowed_cols are dropped. Returns a
    new DataFrame with the original column replaced by the expansion.
    """
    expanded = df[colname].apply(pd.Series)
    unwanted = [c for c in expanded.columns if c not in allowed_cols]
    expanded = expanded.drop(unwanted, axis=1)
    expanded.columns = ['{}.{}'.format(colname, c) for c in expanded.columns]
    remainder = df.drop([colname], axis=1)
    return pd.concat([remainder, expanded], axis=1)
In [ ]:
# Expand the per-demographic dict columns (trailing ';' suppresses display).
# NOTE(review): the returned frames are discarded — assign them back to df
# if the expansion is meant to persist.
decompose_dict_column(df, 'avg_rating_per_demo', DEMOGRAPHICS);
In [ ]:
decompose_dict_column(df, 'votes_per_demo', DEMOGRAPHICS);
In [ ]:
# rating_freq keys are the star values 1..10 (as strings after JSON round-trip).
decompose_dict_column(df, 'rating_freq', [str(i) for i in range(1,11)])
In [ ]:
# All distinct genres appearing in any movie's genre list.
genre_set = set([genre for genre_list in df.genres.dropna() for genre in genre_list])
genre_set
In [ ]:
def dummy_list_column(df, colname):
    """One-hot encode a list-valued column in place, then expand it.

    Builds the set of all values across rows, maps each row's list to a
    {value: 0/1} indicator dict (all zeros for missing/non-iterable rows),
    and decomposes the dict column into one column per value.
    """
    value_set = set()
    for value_list in df[colname].dropna():
        value_set.update(value_list)

    def value_list_to_dict(value_list):
        try:
            return {value: (1 if value in value_list else 0) for value in value_set}
        except TypeError:
            # Row value is not a container (e.g. NaN) — all indicators zero.
            return {value: 0 for value in value_set}

    df[colname] = df[colname].apply(value_list_to_dict)
    return decompose_dict_column(df, colname, list(value_set))
In [ ]:
# One-hot encode the genres column into genres.<name> indicator columns.
dummy_list_column(df, 'genres')
In [ ]:
# IPython help lookup on DataFrame.replace (interactive-only; remove for
# a clean Restart-and-Run-All).
df.replace?
In [ ]:
# NOTE(review): genre_list_to_dict is not defined anywhere in this notebook
# as shown — this cell raises NameError on a fresh kernel.
df['genres'] = df['genres'].apply(genre_list_to_dict)
In [ ]: