In [76]:
import re
import urllib
import urllib.request
from datetime import datetime
from bs4 import BeautifulSoup as bs
In [120]:
# Title to look up; it is turned into the search query and compared
# (case-insensitively, after stripping) against the titles of the search results.
movie_name = "black or white"
In [121]:
# Character class of punctuation stripped from a title before it is used in a
# search URL. Written as a raw string so the backslashes reach the regex engine
# verbatim instead of being (invalid) string-literal escape sequences.
CHARS_TO_REMOVE = r"[\:\;,\.'/\!]"
In [122]:
def _parse_name_for_search(movie_name):
    """Turn a movie title into the form used inside the search URL.

    Every character matched by CHARS_TO_REMOVE is dropped, then each space
    is replaced by a '+'.
    """
    without_punctuation = re.sub(CHARS_TO_REMOVE, '', movie_name)
    return '+'.join(without_punctuation.split(' '))
In [123]:
# Search URL template; the {movie_name} placeholder is filled in with str.format.
SEARCH_URL = "http://www.metacritic.com/search/all/{movie_name}/results?cats%5Bmovie%5D=1&search_type=advanced"
# Headers passed to every urllib Request built in the visible cells.
_HEADERS = {'User-Agent': 'Mozilla/5.0'}
# Site root, prepended to the hrefs extracted from result pages.
METACRITIC_URL = "http://www.metacritic.com"
In [124]:
# Run the search for `movie_name` and parse the returned HTML.
query = SEARCH_URL.format(movie_name=_parse_name_for_search(movie_name))
request = urllib.request.Request(query, headers=_HEADERS)
# The context manager closes the HTTP response once parsing is done.
with urllib.request.urlopen(request) as response:
    search_res = bs(response, "html.parser")
In [125]:
# All <li> elements returned for the "result" class filter on the search page.
results = search_res.find_all("li", {"class": "result"})
In [126]:
# Will hold the search result matching both title and year; stays None if none does.
correct_result = None
In [127]:
# Pick the search result whose title equals `movie_name` (case-insensitive,
# surrounding whitespace ignored) and whose markup mentions the release year.
# If several results qualify, the last one wins because there is no break.
release_year = str(2015)  # year the wanted movie is expected to show in its result
wanted_title = movie_name.strip().lower()  # loop-invariant, computed once
for result in results:
    title = result.find_all("h3", {"class": "product_title"})[0].contents[0].contents[0]
    print(title)
    title_match = title.strip().lower() == wanted_title
    print(title_match)
    # Crude check: the year only has to appear somewhere in the result's HTML.
    year_match = release_year in str(result)
    print(year_match)
    if title_match and year_match:
        correct_result = result
In [119]:
# first_res = search_res.find_all("li", {"class": "result first_result"})[0]?
In [129]:
# Take the href of the first link inside the matched search result.
# (The original cell ended with a `return`, which is a SyntaxError outside a
# function; the full URL is shown by the expression in the following cell.)
if correct_result is None:
    raise LookupError(
        "No search result matched the title and year for {!r}".format(movie_name))
movie_url_suffix = correct_result.find_all("a")[0]['href']
In [130]:
# Show the absolute URL of the matched movie page.
METACRITIC_URL + movie_url_suffix
Out[130]:
In [43]:
def _get_movie_url_by_name(movie_name):
    """Return the Metacritic URL of the top search hit for a movie title.

    Args:
        movie_name: Title to search for; it is normalised with
            _parse_name_for_search before being placed in SEARCH_URL.

    Returns:
        METACRITIC_URL joined with the href of the first link found inside
        the element returned for the "result first_result" class filter.

    Raises:
        IndexError: If the search page has no such result element, or that
            element contains no link.
    """
    query = SEARCH_URL.format(movie_name=_parse_name_for_search(movie_name))
    request = urllib.request.Request(query, headers=_HEADERS)
    # Close the HTTP response as soon as the page has been parsed.
    with urllib.request.urlopen(request) as response:
        search_res = bs(response, "html.parser")
    first_results = search_res.find_all("li", {"class": "result first_result"})
    if not first_results:
        # Same exception type as the bare [0] lookup, but with a usable message.
        raise IndexError("No search results found for {!r}".format(movie_name))
    movie_url_suffix = first_results[0].find_all("a")[0]['href']
    return METACRITIC_URL + movie_url_suffix
In [45]:
# Resolve the movie page URL through the helper (top search hit) and display it.
movie_url = _get_movie_url_by_name(movie_name)
movie_url
Out[45]:
In [38]:
# Path suffix appended to a movie URL to form the critic reviews URL.
CRITICS_REVIEWS_URL_SUFFIX = "/critic-reviews"
In [46]:
# URL of the critic reviews page for the selected movie.
critics_url = movie_url + CRITICS_REVIEWS_URL_SUFFIX
In [47]:
# Download and parse the critic reviews page.
critics_request = urllib.request.Request(critics_url, headers=_HEADERS)
# The context manager closes the HTTP response once parsing is done.
with urllib.request.urlopen(critics_request) as response:
    critics_page = bs(response, "html.parser")
In [48]:
# Class strings used to locate the metascore <span>; one per score verdict.
SCORE_CLASSES = [
    "metascore_w larger movie {}".format(verdict)
    for verdict in ("positive", "mixed", "negative")
]
In [54]:
# Metascore: first child of the first <span> returned for the SCORE_CLASSES
# class filter, converted to int; the bare name on the last line displays it.
metascore = int(critics_page.find_all("span", {"class": SCORE_CLASSES})[0].contents[0])
metascore
Out[54]:
In [117]:
# Three-letter month abbreviation -> full English month name.
MONTH_SHORTHAND_MAP = dict(
    Jan="January",
    Feb="February",
    Mar="March",
    Apr="April",
    May="May",
    Jun="June",
    Jul="July",
    Aug="August",
    Sep="September",
    Oct="October",
    Nov="November",
    Dec="December",
)
In [111]:
def _parse_date_str(date_str):
    """Expand abbreviated month names in a date string to their full names.

    Only whole-word abbreviations listed in MONTH_SHORTHAND_MAP are expanded,
    so a string that already spells the month out ("January 5, 2015") is left
    intact instead of being turned into "Januaryuary 5, 2015".

    Args:
        date_str: Date text such as "Jan 5, 2015".

    Returns:
        The string with month abbreviations replaced by full month names.
        If no abbreviation is present the input is returned unchanged
        (the previous implementation returned None in that case).
    """
    # \b on both sides keeps "Jan" from matching inside "January".
    shorthand_pattern = r"\b(" + "|".join(map(re.escape, MONTH_SHORTHAND_MAP)) + r")\b"
    return re.sub(
        shorthand_pattern,
        lambda match: MONTH_SHORTHAND_MAP[match.group(1)],
        date_str,
    )
In [137]:
def _get_critic_review_props(review):
    """Extract the properties of one critic review element.

    Args:
        review: Parsed element of a single review (the callers pass the
            <div> elements found with the "review" class filter).

    Returns:
        dict with keys 'review_date' (datetime.date), 'score' (int),
        'summary' (str), 'publication' and 'critic' (link text, or None
        when no matching link is found).

    Raises:
        IndexError, ValueError and similar if an expected element is missing
        or its text cannot be converted.
    """
    review_props = {}
    date_str = review.find_all("span", {"class": "date"})[0].contents[0]
    date_str = _parse_date_str(date_str)
    review_props['review_date'] = datetime.strptime(date_str, "%B %d, %Y").date()
    review_props['score'] = int(review.find_all("div", {"class": "metascore_w"})[0].contents[0])
    review_props['summary'] = review.find_all('a', {'class': 'no_hover'})[0].contents[0].strip()
    review_props['publication'] = None
    review_props['critic'] = None
    for link in review.find_all("a"):
        # An anchor without an href must not abort the whole review
        # (link['href'] would raise KeyError), so fall back to ''.
        href = link.get('href') or ''
        if 'publication' in href:
            review_props['publication'] = link.contents[0]
        if 'critic' in href:
            review_props['critic'] = link.contents[0]
    return review_props
In [124]:
# Parse every critic review element on the page. Reviews that fail to parse
# are skipped on purpose (best effort), whatever the error was.
reviews = []
for review in critics_page.find_all("div", {"class": "review"}):
    try:
        props = _get_critic_review_props(review)
    except Exception:
        pass
    else:
        reviews.append(props)
In [128]:
# Number of critic reviews that were parsed without raising.
len(reviews)
Out[128]:
In [ ]:
def _get_user_review_props(review):
    """Extract the properties of one user review element.

    Returns a dict with 'review_date', 'score', 'text', 'user',
    'total_reactions', 'pos_reactions' and 'neg_reactions' (total minus
    positive). Any missing element or unconvertible text raises.
    """
    def first(tag, css_class):
        # First element returned for the given tag / class filter.
        return review.find_all(tag, {"class": css_class})[0]

    full_date = _parse_date_str(first("span", "date").contents[0])
    review_date = datetime.strptime(full_date, "%B %d, %Y").date()
    score = int(first("div", "metascore_w").contents[0])
    try:
        # Preferred source: the expanded blurb span.
        text = first('span', 'blurb blurb_expanded').contents[0].strip()
    except IndexError:
        # Fallback: second child of the review body.
        text = first('div', 'review_body').contents[1].contents[0].strip()
    user = first('span', 'author').contents[0].contents[0]
    total_reactions = int(first('span', 'total_count').contents[0])
    pos_reactions = int(first('span', 'yes_count').contents[0])
    return {
        'review_date': review_date,
        'score': score,
        'text': text,
        'user': user,
        'total_reactions': total_reactions,
        'pos_reactions': pos_reactions,
        'neg_reactions': total_reactions - pos_reactions,
    }
In [129]:
# Path suffix (including the page query parameter) appended to a movie URL
# by _get_user_reviews_props to form the user reviews URL.
USERS_REVIEWS_URL_SUFFIX = "/user-reviews?page=0"
In [ ]:
# Class strings used to locate the user-score <span>; one per score verdict.
USER_SCORE_CLASSES = [
    "metascore_w user larger movie {}".format(verdict)
    for verdict in ("positive", "mixed", "negative")
]
In [213]:
def _get_user_rating_freq(users_page, rating):
return int(users_page.find_all("div", {"class": "chart {}".format(rating)})[0].find_all(
"div", {"class": "count fr"})[0].contents[0].replace(',', ''))
In [214]:
def _get_user_reviews_from_page(users_page):
review_elements = users_page.find_all("div", {"class": "review"})
user_reviews = []
for review in review_elements:
try:
user_reviews.append(_get_user_review_props(review))
except Exception:
continue
print("Extracted {} reviews.".format(len(user_reviews)))
nexts = users_page.find_all("a", {"class": "action", "rel": "next"})
if len(nexts) > 0:
next_url = METACRITIC_URL + nexts[0]['href']
next_request = urllib.request.Request(next_url, headers=_HEADERS)
next_page = bs(urllib.request.urlopen(next_request), "html.parser")
user_reviews += _get_user_reviews_from_page(next_page)
return user_reviews
In [215]:
def _get_user_reviews_props(movie_url):
    """Scrape the user-review data of a movie.

    Args:
        movie_url: URL of the movie page; USERS_REVIEWS_URL_SUFFIX is
            appended to it.

    Returns:
        dict with 'user_score' (float), 'positive_rating_frequency',
        'mixed_rating_frequency', 'negative_rating_frequency' (ints) and
        'user_reviews' (list of review property dicts from all pages).
    """
    users_url = movie_url + USERS_REVIEWS_URL_SUFFIX
    users_request = urllib.request.Request(users_url, headers=_HEADERS)
    # Close the HTTP response as soon as the page has been parsed.
    with urllib.request.urlopen(users_request) as response:
        users_page = bs(response, "html.parser")
    users_props = {}
    score_span = users_page.find_all("span", {"class": USER_SCORE_CLASSES})[0]
    users_props['user_score'] = float(score_span.contents[0])
    for rating in ['positive', 'mixed', 'negative']:
        users_props['{}_rating_frequency'.format(rating)] = _get_user_rating_freq(users_page, rating)
    users_props['user_reviews'] = _get_user_reviews_from_page(users_page)
    return users_props
In [216]:
# Scrape the user score, rating frequencies and user reviews for the selected movie.
users_props = _get_user_reviews_props(movie_url)
In [ ]:
# Example of the tag that holds the movie title on the page:
# <meta property="og:title" content="Kingsman: The Secret Service">
In [223]:
# Read the movie title from the content attribute of the og:title <meta> tag.
# NOTE(review): in the visible cells `users_page` is only assigned inside
# _get_user_reviews_props, so this presumably relies on a leftover kernel
# variable and would fail on a fresh kernel — confirm.
users_page.find_all("meta", {"property": "og:title"})[0]['content']
Out[223]:
In [218]:
# Inspect which fields were collected for the movie's user reviews.
users_props.keys()
Out[218]:
In [187]:
In [11]:
import os
import morejson as json
import pandas as pd
In [12]:
# Directory whose files are listed and loaded as Metacritic profiles.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
PROF_DIR_PATH = '/Users/shaypalachy/clones/rotten_needles/data/metacritic_profiles/'
In [25]:
# Presumably the directory of IMDb profile files; not referenced in the visible cells.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
IMDB_PROF_DIR_PATH = '/Users/shaypalachy/clones/rotten_needles/data/imdb_profiles/'
In [13]:
# Load every .json file in PROF_DIR_PATH and build one DataFrame row per profile.
profiles = []
# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for profile_file in sorted(os.listdir(PROF_DIR_PATH)):
    # Filter first so that "Reading ..." is only logged for files actually read.
    if os.path.splitext(profile_file)[1] != '.json':
        continue
    print('Reading {}'.format(profile_file))
    file_path = os.path.join(PROF_DIR_PATH, profile_file)
    with open(file_path, 'r') as json_file:
        profiles.append(json.load(json_file))
df = pd.DataFrame(profiles)