Metacritic Crawler


In [76]:
import re
import urllib
import urllib.request
from datetime import datetime

from bs4 import BeautifulSoup as bs

In [120]:
movie_name = "black or white"

In [121]:
CHARS_TO_REMOVE = "[\:\;,\.'/\!]"

In [122]:
def _parse_name_for_search(movie_name):
    parsed = re.sub(CHARS_TO_REMOVE, '', movie_name)
    return parsed.replace(' ', '+')

In [123]:
SEARCH_URL = "http://www.metacritic.com/search/all/{movie_name}/results?cats%5Bmovie%5D=1&search_type=advanced"
_HEADERS = {'User-Agent': 'Mozilla/5.0'}
METACRITIC_URL = "http://www.metacritic.com"

In [124]:
query = SEARCH_URL.format(movie_name=_parse_name_for_search(movie_name))
request = urllib.request.Request(query, headers=_HEADERS)
search_res = bs(urllib.request.urlopen(request), "html.parser")

In [125]:
results = search_res.find_all("li", {"class": "result"})

In [126]:
correct_result = None

In [127]:
for result in results:
    title = result.find_all("h3", {"class": "product_title"})[0].contents[0].contents[0]
    print(title)
    title_match = title.strip().lower() == movie_name.strip().lower()
    print(title_match)
    year_match = str(2015) in str(result)
    print(year_match)
    if title_match and year_match:
        correct_result = result


Black Cat, White Cat
False
False
Black or White
True
True
Black and White
False
False
White King, Red Rubber, Black Death
False
False
Family Portrait in Black and White
False
False
Herblock: The Black & the White
False
False

In [119]:
# first_res = search_res.find_all("li", {"class": "result first_result"})[0]?

In [129]:
movie_url_suffix = correct_result.find_all("a")[0]['href']
return METACRITIC_URL + movie_url_suffix


  File "<ipython-input-129-e8679a2e6bec>", line 2
    return METACRITIC_URL + movie_url_suffix
                                            ^
SyntaxError: 'return' outside function

In [130]:
METACRITIC_URL + movie_url_suffix


Out[130]:
'http://www.metacritic.com/movie/black-or-white'

In [43]:
def _get_movie_url_by_name(movie_name):
    query = SEARCH_URL.format(movie_name=_parse_name_for_search(movie_name))
    request = urllib.request.Request(query, headers=_HEADERS)
    search_res = bs(urllib.request.urlopen(request), "html.parser")
    first_res = search_res.find_all("li", {"class": "result first_result"})[0]
    movie_url_suffix = first_res.find_all("a")[0]['href']
    return METACRITIC_URL + movie_url_suffix

In [45]:
movie_url = _get_movie_url_by_name(movie_name)
movie_url


Out[45]:
'http://www.metacritic.com/movie/kingsman-the-secret-service'

Critics Reviews Page


In [38]:
CRITICS_REVIEWS_URL_SUFFIX = "/critic-reviews"

In [46]:
critics_url = movie_url + CRITICS_REVIEWS_URL_SUFFIX

In [47]:
critics_request = urllib.request.Request(critics_url, headers=_HEADERS)
critics_page = bs(urllib.request.urlopen(critics_request), "html.parser")

In [48]:
SCORE_CLASSES = [
    "metascore_w larger movie positive",
    "metascore_w larger movie mixed",
    "metascore_w larger movie negative"
]

In [54]:
metascore = int(critics_page.find_all("span", {"class": SCORE_CLASSES})[0].contents[0])
metascore


Out[54]:
58

In [117]:
MONTH_SHORTHAND_MAP = {
    "Jan": "January", "Feb": "February", "Mar": "March", "Apr": "April",
    "May": "May", "Jun": "June", "Jul": "July", "Aug": "August",
    "Sep": "September", "Oct": "October", "Nov": "November", "Dec": "December"
}

In [111]:
def _parse_date_str(date_str):
    for month in MONTH_SHORTHAND_MAP:
        if month in date_str:
            return date_str.replace(month, MONTH_SHORTHAND_MAP[month])

In [137]:
def _get_critic_review_props(review):
    review_props = {}
    date_str = review.find_all("span", {"class": "date"})[0].contents[0]
    date_str = _parse_date_str(date_str)
    review_props['review_date'] = datetime.strptime(date_str, "%B %d, %Y").date()
    review_props['score'] = int(review.find_all("div", {"class": "metascore_w"})[0].contents[0])
    review_props['summary'] = review.find_all('a', {'class': 'no_hover'})[0].contents[0].strip()
    review_props['publication'] = None
    review_props['critic'] = None
    for link in review.find_all("a"):
        if 'publication' in link['href']:
            review_props['publication'] = link.contents[0]
        if 'critic' in link['href']:
            review_props['critic'] = link.contents[0]
    return review_props

In [124]:
reviews = []
for review in critics_page.find_all("div", {"class": "review"}):
    try:
        reviews.append(_get_critic_review_props(review))
    except Exception:
        continue

In [128]:
len(reviews)


Out[128]:
39

User Reviews Page


In [ ]:
def _get_user_review_props(review):
    review_props = {}
    date_str = review.find_all("span", {"class": "date"})[0].contents[0]
    date_str = _parse_date_str(date_str)
    review_props['review_date'] = datetime.strptime(date_str, "%B %d, %Y").date()
    review_props['score'] = int(review.find_all("div", {"class": "metascore_w"})[0].contents[0])
    try:
        review_props['text'] = review.find_all('span', {'class': 'blurb blurb_expanded'})[0].contents[0].strip()
    except IndexError:
        review_props['text'] = review.find_all('div', {'class': 'review_body'})[0].contents[1].contents[0].strip()
    review_props['user'] = review.find_all('span', {'class': 'author'})[0].contents[0].contents[0]
    review_props['total_reactions'] = int(review.find_all('span', {'class': 'total_count'})[0].contents[0])
    review_props['pos_reactions'] = int(review.find_all('span', {'class': 'yes_count'})[0].contents[0])
    review_props['neg_reactions'] = review_props['total_reactions'] - review_props['pos_reactions']
    return review_props

In [129]:
USERS_REVIEWS_URL_SUFFIX = "/user-reviews?page=0"

In [ ]:
USER_SCORE_CLASSES = [
    "metascore_w user larger movie positive",
    "metascore_w user larger movie mixed",
    "metascore_w user larger movie negative"
]

In [213]:
def _get_user_rating_freq(users_page, rating):
    return int(users_page.find_all("div", {"class": "chart {}".format(rating)})[0].find_all(
            "div", {"class": "count fr"})[0].contents[0].replace(',', ''))

In [214]:
def _get_user_reviews_from_page(users_page):
    review_elements = users_page.find_all("div", {"class": "review"})
    user_reviews = []
    for review in review_elements:
        try:
            user_reviews.append(_get_user_review_props(review))
        except Exception:
            continue
    print("Extracted {} reviews.".format(len(user_reviews)))
    nexts = users_page.find_all("a", {"class": "action", "rel": "next"})
    if len(nexts) > 0: 
        next_url = METACRITIC_URL + nexts[0]['href']
        next_request = urllib.request.Request(next_url, headers=_HEADERS)
        next_page = bs(urllib.request.urlopen(next_request), "html.parser")
        user_reviews += _get_user_reviews_from_page(next_page)
    return user_reviews

In [215]:
def _get_user_reviews_props(movie_url):
    users_url = movie_url + USERS_REVIEWS_URL_SUFFIX
    users_request = urllib.request.Request(users_url, headers=_HEADERS)
    users_page = bs(urllib.request.urlopen(users_request), "html.parser")
    users_props = {}
    user_score = float(users_page.find_all("span", {"class": USER_SCORE_CLASSES})[0].contents[0])
    users_props['user_score'] = user_score
    for rating in ['positive', 'mixed', 'negative']:
        users_props['{}_rating_frequency'.format(rating)] = _get_user_rating_freq(users_page, rating)
    users_props['user_reviews'] = _get_user_reviews_from_page(users_page)
    return users_props

In [216]:
users_props = _get_user_reviews_props(movie_url)


Extracted 100 reviews.
Extracted 100 reviews.
Extracted 20 reviews.

In [ ]:
<meta property="og:title" content="Kingsman: The Secret Service">

In [223]:
users_page.find_all("meta", {"property": "og:title"})[0]['content']


Out[223]:
'Kingsman: The Secret Service'

In [218]:
users_props.keys()


Out[218]:
dict_keys(['user_reviews', 'mixed_rating_frequency', 'user_score', 'negative_rating_frequency', 'positive_rating_frequency'])

In [187]:



Extracted 100 reviews.
Extracted 100 reviews.
Extracted 20 reviews.

Uniting profiles to csv


In [11]:
import os
import morejson as json
import pandas as pd

In [12]:
PROF_DIR_PATH = '/Users/shaypalachy/clones/rotten_needles/data/metacritic_profiles/'

In [25]:
IMDB_PROF_DIR_PATH = '/Users/shaypalachy/clones/rotten_needles/data/imdb_profiles/'

In [13]:
profiles = []
for profile_file in os.listdir(PROF_DIR_PATH):
    print('Reading {}'.format(profile_file))
    file_path = os.path.join(PROF_DIR_PATH, profile_file)
    file_name, ext = os.path.splitext(file_path)
    if ext != '.json':
        continue
    with open(file_path, 'r') as json_file:
        profiles.append(json.load(json_file))
df = pd.DataFrame(profiles)


Reading .DS_Store
Reading 13_sins.json
Reading 1915.json
Reading 22_jump_street.json
Reading 300_rise_of_an_empire.json
Reading 3_days_to_kill.json
Reading 50_to_1.json
Reading a_haunted_house_2.json
Reading a_merry_friggin_christmas.json
Reading a_million_ways_to_die_in_the_west.json
Reading a_walk_among_the_tombstones.json
Reading about_last_night.json
Reading accidental_love.json
Reading addicted.json
Reading adult_beginners.json
Reading adult_world.json
Reading after_the_dark.json
Reading agent_47.json
Reading alexander_and_the_terrible_horrible_no_good_very_bad_day.json
Reading alien_abduction.json
Reading aloha.json
Reading america_imagine_the_world_without_her.json
Reading and_so_it_goes.json
Reading android_cop.json
Reading annabelle.json
Reading annie.json
Reading ant-man.json
Reading apocalypse_pompeii.json
Reading as_above_so_below.json
Reading avengers_age_of_ultron.json
Reading back_in_the_day.json
Reading bad_johnson.json
Reading barefoot.json
Reading better_living_through_chemistry.json
Reading beyond_the_lights.json
Reading beyond_the_reach.json
Reading big_eyes.json
Reading big_hero_6.json
Reading black_mass.json
Reading black_or_white.json
Reading black_water_vampire.json
Reading blackhat.json
Reading blended.json
Reading brick_mansions.json
Reading cake.json
Reading camp_takota.json
Reading camp_x-ray.json
Reading captain_america_the_winter_soldier.json
Reading chappie.json
Reading cheap_thrills_(film).json
Reading child_44.json
Reading cinderella.json
Reading cold_comes_the_night.json
Reading cold_in_july.json
Reading comet.json
Reading concussion.json
Reading crimson_peak.json
Reading cymbeline.json
Reading da_sweet_blood_of_jesus.json
Reading daddys_home.json
Reading danny_collins.json
Reading date_and_switch.json
Reading dawn_of_the_planet_of_the_apes.json
Reading dear_white_people.json
Reading decoding_annie_parker.json
Reading deliver_us_from_evil.json
Reading devils_due.json
Reading devils_knot.json
Reading do_you_believe.json
Reading dolphin_tale_2.json
Reading dope.json
Reading dracula_untold.json
Reading draft_day.json
Reading dumb_and_dumber_to.json
Reading dumbbells.json
Reading earth_to_echo.json
Reading endless_love.json
Reading entourage.json
Reading everest.json
Reading everly.json
Reading ex_machina.json
Reading exodus_gods_and_kings.json
Reading far_from_the_madding_crowd.json
Reading fifty_shades_of_grey.json
Reading focus.json
Reading foxcatcher.json
Reading furious_7.json
Reading fury.json
Reading get_hard.json
Reading get_on_up.json
Reading gods_not_dead.json
Reading gods_pocket.json
Reading gone_girl.json
Reading goodbye_to_all_that.json
Reading guardians_of_the_galaxy.json
Reading happy_christmas.json
Reading heaven_is_for_real.json
Reading hellion.json
Reading hercules.json
Reading home.json
Reading home_sweet_hell.json
Reading honeymoon.json
Reading horrible_bosses_2.json
Reading hot_pursuit.json
Reading hot_tub_time_machine_2.json
Reading how_to_train_your_dragon_2.json
Reading i_frankenstein.json
Reading i_origins.json
Reading if_i_stay.json
Reading in_the_blood.json
Reading in_the_heart_of_the_sea.json
Reading inherent_vice.json
Reading inside_out.json
Reading insidious_chapter_3.json
Reading interstellar.json
Reading into_the_storm.json
Reading into_the_woods.json
Reading it_follows.json
Reading jack_ryan_shadow_recruit.json
Reading jamesy_boy.json
Reading jersey_boys.json
Reading jessabelle.json
Reading jinn.json
Reading joy.json
Reading jupiter_ascending.json
Reading jurassic_world.json
Reading just_before_i_go.json
Reading kid_cannabis.json
Reading kingsman_the_secret_service.json
Reading labor_day.json
Reading laggies.json
Reading left_behind.json
Reading leprechaun_origins.json
Reading lets_be_cops.json
Reading life_after_beth.json
Reading listen_up_philip.json
Reading little_boy.json
Reading love_&_mercy.json
Reading love_is_strange.json
Reading love_rosie.json
Reading mad_max_fury_road.json
Reading maggie.json
Reading magic_in_the_moonlight.json
Reading maleficent.json
Reading maps_to_the_stars.json
Reading match.json
Reading max.json
Reading mcfarland_usa.json
Reading me_and_earl_and_the_dying_girl.json
Reading million_dollar_arm.json
Reading minions.json
Reading moms_night_out.json
Reading monkey_kingdom.json
Reading mortdecai.json
Reading mr_peabody_&_sherman.json
Reading muppets_most_wanted.json
Reading murder_101.json
Reading my_man_is_a_loser.json
Reading need_for_speed.json
Reading night_at_the_museum_secret_of_the_tomb.json
Reading nightcrawler.json
Reading no_escape.json
Reading no_good_deed.json
Reading noah.json
Reading non-stop.json
Reading nurse_3d.json
Reading obvious_child.json
Reading oculus.json
Reading open_grave.json
Reading ouija.json
Reading out_of_the_dark.json
Reading paddington.json
Reading paranormal_activity_the_marked_ones.json
Reading paul_blart_mall_cop_2.json
Reading penguins_of_madagascar.json
Reading persecuted.json
Reading ping_pong_summer.json
Reading pitch_perfect_2.json
Reading pixels.json
Reading planes_fire_&_rescue.json
Reading poltergeist.json
Reading project_almanac.json
Reading rage.json
Reading raze.json
Reading rebound.json
Reading ride_along.json
Reading rio_2.json
Reading road_hard.json
Reading road_to_paloma.json
Reading robocop.json
Reading run_all_night.json
Reading sabotage.json
Reading san_andreas.json
Reading saving_christmas.json
Reading serena.json
Reading seventh_son.json
Reading sex_tape.json
Reading snowden.json
Reading someone_marry_barry.json
Reading son_of_god.json
Reading song_one.json
Reading spare_parts.json
Reading spy.json
Reading stage_fright.json
Reading step_up_all_in.json
Reading strange_magic.json
Reading tammy.json
Reading ted_2.json
Reading terminator_genisys.json
Reading that_awkward_moment.json
Reading the_age_of_adaline.json
Reading the_amazing_spider-man_2.json
Reading the_angriest_man_in_brooklyn.json
Reading the_bag_man.json
Reading the_better_angels.json
Reading the_boxtrolls.json
Reading the_boy_next_door.json
Reading the_cobbler.json
Reading the_d_train.json
Reading the_divergent_series_insurgent.json
Reading the_duff.json
Reading the_equalizer.json
Reading the_expendables_3.json
Reading the_fault_in_our_stars.json
Reading the_giver.json
Reading the_good_dinosaur.json
Reading the_good_lie.json
Reading the_guest.json
Reading the_gunman.json
Reading the_homesman.json
Reading the_humbling.json
Reading the_hundred-foot_journey.json
Reading the_interview.json
Reading the_judge.json
Reading the_lazarus_effect.json
Reading the_loft.json
Reading the_longest_ride.json
Reading the_martian.json
Reading the_maze_runner.json
Reading the_monuments_men.json
Reading the_nut_job.json
Reading the_one_i_love.json
Reading the_other_woman.json
Reading the_outsider.json
Reading the_pretty_one.json
Reading the_prince.json
Reading the_purge_anarchy.json
Reading the_revenant.json
Reading the_road_within.json
Reading the_scribbler.json
Reading the_second_best_exotic_marigold_hotel.json
Reading the_signal.json
Reading the_skeleton_twins.json
Reading the_song.json
Reading the_spongebob_movie_sponge_out_of_water.json
Reading the_two_faces_of_january.json
Reading the_vatican_tapes.json
Reading the_voices.json
Reading the_walking_deceased.json
Reading the_wedding_ringer.json
Reading the_woman_in_black_2_angel_of_death.json
Reading they_came_together.json
Reading this_is_where_i_leave_you.json
Reading tiger_orange.json
Reading tomorrowland.json
Reading top_five.json
Reading tracers.json
Reading trainwreck.json
Reading transcendence.json
Reading true_story.json
Reading tusk.json
Reading unbroken.json
Reading unfinished_business.json
Reading unfriended.json
Reading vampire_academy.json
Reading veronica_mars.json
Reading vice.json
Reading victor_frankenstein.json
Reading walk_of_shame.json
Reading when_marnie_was_there.json
Reading while_were_young.json
Reading whiplash.json
Reading wild.json
Reading wild_card.json
Reading wish_i_was_here.json
Reading witching_hour.json
Reading woman_in_gold.json
Reading x-men_days_of_future_past.json
Reading young_ones.json
Reading youre_not_you.json