In [17]:
from multiprocessing import Pool #witness the power
import wikipedia
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline
from fuzzywuzzy import fuzz
from collections import defaultdict
from helper_functions import *

This notebook contains the code that was used to scrape the raw movie data from the Rotten Tomatoes web pages.

Thankfully, Rotten Tomatoes has a consistent and well-structured HTML/CSS codebase. As such, I was able to harness the full power of Beautiful Soup to get everything I needed!

The key insight in this notebook is the use of the `multiprocessing` library. This library let me scrape websites in parallel, saving me countless hours!

The code for the function that does this can be found in `helper_functions.py`. I was incredibly proud that I was able to use it. It is a library I wish to use more often - I hope to learn a lot more about it in future projects.

Towards the end of this notebook, you will see some fragments of code I was using to experiment with collecting movie information from Wikipedia. Most notable are the Wikipedia API and the WPtools library (essentially a wrapper around the Wikipedia API).

In the end, I made use of the Wikipedia API in order to obtain the unique HTML addresses for each movie - I then went back to the bs4 library to scrape the infoboxes. I really wish the Wikipedia API had an .infobox() method; it would have greatly simplified my codebase. Then again, if it did have that method, I would have learned a lot less!


In [18]:
def extract_rotton_info_v2(webpage, timeout=30):
    """Scrape one Rotten Tomatoes "top 100" genre page and each listed movie's own page.

    The first element with class ``table`` on ``webpage`` is read as a flat
    list of ``<td>`` cells, consumed four at a time (one movie per group):
    cell 0 holds the rank, cell 1 the Tomatometer score and cell 2 the title
    link. For every title not already collected, the movie's page is fetched
    to read the audience rating and the critic score statistics.

    Parameters
    ----------
    webpage : str
        URL of the genre list page to scrape.
    timeout : float or tuple, optional
        Timeout passed to every ``requests.get`` call so that a stalled
        connection cannot block the calling worker forever. Defaults to 30.

    Returns
    -------
    dict
        Maps ``"<title> (film)"`` to the list
        ``[rank, rotton_average_rating, rotton_reviews_counted,
        tomato_percentage, audience_rating]``. Every value is the string
        scraped from the page; nothing is converted to a number.

    Raises
    ------
    IndexError
        If the genre page has no element with class ``table``, or a movie
        page's score text has fewer whitespace-separated tokens than expected.
    AttributeError
        If an expected element (score span, title link, audience block,
        ``scoreStats`` div) is missing, because ``find`` then returns ``None``.
    requests.exceptions.RequestException
        On connection failures or when ``timeout`` is exceeded.
    """
    cells_per_row = 4  # every movie occupies four consecutive <td> cells
    rank_offset, tomato_offset, title_offset = 0, 1, 2

    print("-------------","Processing: ",webpage,"---------------")

    genre_page = BeautifulSoup(requests.get(webpage, timeout=timeout).text, 'lxml')
    cells = genre_page.find_all(class_='table')[0].find_all('td')

    master_dict = {}
    complete_rows = len(cells) // cells_per_row  # a trailing partial row is ignored

    for row_start in range(0, complete_rows * cells_per_row, cells_per_row):
        title_cell = cells[row_start + title_offset]
        movie_name = title_cell.text.strip() + " (film)"

        # The dict is local to this call, so this only guards against a title
        # repeated within this one genre page. Checking before the request
        # avoids downloading a movie page whose data would be discarded.
        if movie_name in master_dict:
            continue

        rank = cells[row_start + rank_offset].text.strip()
        tomato_percentage = cells[row_start + tomato_offset].find(class_='tMeterScore').text.strip()

        # `base_url` is not defined in this function; it has to be provided
        # by the surrounding notebook namespace.
        movie_url = base_url + title_cell.find('a').get('href')
        movie_page = BeautifulSoup(requests.get(movie_url, timeout=timeout).text, 'lxml')

        # audience rating is out of 5
        audience_rating = movie_page.find(class_="audience-info hidden-xs superPageFontColor").text.split()[2]

        score_tokens = movie_page.find("div", {"id": "scoreStats"}).text.split()
        rotton_average_rating = score_tokens[2].split('/')[0]  # out of 10
        rotton_reviews_counted = score_tokens[5]

        master_dict[movie_name] = [
            rank,
            rotton_average_rating,
            rotton_reviews_counted,
            tomato_percentage,
            audience_rating,
        ]

    return master_dict

In [14]:
# def extract_movie_names(array):
#     movie_names = []

#     for index, val in enumerate(array):
#         genre_list = array[index][list(array[index].keys())[0]]
#         for row in genre_list:
#             clean = row[0].split('(')
#             name = clean[0].strip()
#             year = clean[1].strip(')')
#             movie_names.append((name, year))
    
#     return movie_names

In [142]:
# def extract_movie_names(array): #movie names will now be the key
#     movie_names = []

#     for index, val in enumerate(array):
#         genre_list = array[index][list(array[index].keys())[0]]
        
#         for row in genre_list:
#             movie_names.append(row[0])
    
#     return movie_names

In [19]:
# Build the list of genre pages to scrape, starting from `starting_url`.
# NOTE(review): neither `extract_sub_genre_links` nor `starting_url` is defined in this
# notebook; presumably both come from the `helper_functions` star import -- verify there.
genre_urls_to_scrape = extract_sub_genre_links(starting_url)

In [20]:
# Run `extract_rotton_info_v2` over the genre URLs through `witness_the_power`, a helper
# that is not defined in this notebook. The introduction says the scraping is done in
# parallel with multiprocessing; the unordered "Processing:" lines printed below are
# consistent with that. NOTE(review): the exact return shape is not visible here.
# This cell issues live HTTP requests for every genre page and every listed movie page.
all_rotton_data = witness_the_power(extract_rotton_info_v2, genre_urls_to_scrape)


------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_action__adventure_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_animation_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_art_house__international_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_classics_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_science_fiction__fantasy_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_sports__fitness_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_comedy_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_drama_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_documentary_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_kids__family_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_musical__performing_arts_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_horror_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_romance_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_mystery__suspense_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_television_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_special_interest_movies/ ---------------
------------- Processing:  https://www.rottentomatoes.com/top/bestofrt/top_100_western_movies/ ---------------

In [23]:
# Combine the per-genre scrape results into `movie_database`.
# NOTE(review): `extract_unique_movies_across_genre` is not defined in this notebook;
# judging by its name it de-duplicates movies listed under several genres -- confirm
# in helper_functions.py.
movie_database = extract_unique_movies_across_genre(all_rotton_data)

In [24]:
# Count the keys in `movie_database` as a quick sanity check on the scrape.
len(movie_database.keys())


Out[24]:
932

In [27]:
# Persist the raw per-genre scrape so the slow scraping cell need not be re-run.
# NOTE(review): `pickle_object` is not defined in this notebook; presumably the second
# argument names the output file -- confirm the path/extension in helper_functions.py.
pickle_object(all_rotton_data,"all_rotton_data")

In [28]:
# Persist the combined `movie_database` as well.
# NOTE(review): `pickle_object` is not defined in this notebook; presumably the second
# argument names the output file -- confirm in helper_functions.py.
pickle_object(movie_database,"movie_database")

In [16]:
# Display every key of `movie_database`.
# NOTE(review): this prints the complete key list; slicing it (for example the first
# ten entries) would keep the notebook output short.
list(movie_database.keys())


Out[16]:
['Mad Max: Fury Road (2015)',
 'Metropolis (1927)',
 'King Kong (1933)',
 'The Adventures of Robin Hood (1938)',
 'Zootopia (2016)',
 'Seven Samurai (Shichinin no Samurai) (1956)',
 'The Treasure of the Sierra Madre (1948)',
 'Up (2009)',
 'Logan (2017)',
 'Wonder Woman (2017)',
 'Baby Driver (2017)',
 'The Dark Knight (2008)',
 'Star Wars: Episode VII - The Force Awakens (2015)',
 'The 39 Steps (1935)',
 'The Hurt Locker (2009)',
 'Skyfall (2012)',
 'The Jungle Book (2016)',
 'Star Trek (2009)',
 'Harry Potter and the Deathly Hallows - Part 2 (2011)',
 'Jaws (1975)',
 'Lawrence of Arabia (1962)',
 'WALL-E (2008)',
 'The LEGO Movie (2014)',
 'The Searchers (1956)',
 'The Terminator (1984)',
 'Moana (2016)',
 'Apocalypse Now (1979)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'The French Connection (1971)',
 'Iron Man (2008)',
 'Spider-Man: Homecoming (2017)',
 'Aliens (1986)',
 'Kubo and the Two Strings (2016)',
 "Marvel's The Avengers (2012)",
 'Throne of Blood (1957)',
 'Once Upon a Time in the West (1968)',
 'Mission: Impossible Rogue Nation (2015)',
 'True Grit (2010)',
 'Badlands (1974)',
 'Aguirre, the Wrath of God (Aguirre, der Zorn Gottes) (1972)',
 'Casino Royale (2006)',
 'Captain America: Civil War (2016)',
 'Hunt for the Wilderpeople (2016)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Spartacus (1960)',
 'The Lord of the Rings: The Two Towers (2002)',
 'Chicken Run (2000)',
 'Sicario (2015)',
 'Ghostbusters (1984 Original) (1984)',
 'Guardians of the Galaxy (2014)',
 'X-Men: Days of Future Past (2014)',
 'The Lord of the Rings: The Return of the King (2003)',
 'The Wild Bunch (1969)',
 'Ran (1985)',
 'Back to the Future (1985)',
 'The Princess Bride (1987)',
 'All Is Lost (2013)',
 'Looper (2012)',
 'A Fistful of Dollars (Per un Pugno di Dollari) (1964)',
 'No Country for Old Men (2007)',
 'Gojira (1956)',
 'Who Framed Roger Rabbit (1988)',
 'The Right Stuff (1983)',
 'Mad Max 2: The Road Warrior (1982)',
 'Babe (1995)',
 'Paths of Glory (1957)',
 'Jurassic Park (1993)',
 'Crouching Tiger, Hidden Dragon (2001)',
 'Diva (1981)',
 'The Bridge on the River Kwai (1957)',
 'Embrace Of The Serpent (El Abrazo De La Serpiente) (2016)',
 'Goldfinger (1964)',
 'The Bourne Ultimatum (2007)',
 'The Lego Batman Movie (2017)',
 'Spider-Man 2 (2004)',
 'Dawn Of The Planet Of The Apes (2014)',
 'Catch Me If You Can (2002)',
 'Full Metal Jacket (1987)',
 'Close Encounters of the Third Kind (1977)',
 'Doctor Strange (2016)',
 'Dr. No (1962)',
 'The Fugitive (1993)',
 'Apollo 13 (1995)',
 'From Russia With Love (1964)',
 'One False Move (1992)',
 'Mission: Impossible Ghost Protocol (2011)',
 'Yellow Submarine (1968)',
 'Wallace & Gromit: The Curse of the Were-Rabbit (2005)',
 'The Secret of Roan Inish (1995)',
 'The Lion King (1994)',
 'Hero (2004)',
 "'71 (2015)",
 'The Twilight Samurai (Tasogare Seibei) (2004)',
 'Raiders of the Lost Ark (1981)',
 'The Departed (2006)',
 'Superman (1978)',
 'The Straight Story (1999)',
 'Three Kings (1999)',
 'Captain America: The Winter Soldier (2014)',
 'Enter the Dragon (1973)',
 'Inside Out (2015)',
 'Snow White and the Seven Dwarfs (1937)',
 'Toy Story 3 (2010)',
 'Toy Story 2 (1999)',
 'Finding Nemo (2003)',
 'Pinocchio (1940)',
 'Toy Story (1995)',
 'Shaun the Sheep Movie (2015)',
 'Finding Dory (2016)',
 'My Life as a Zucchini (Ma vie de courgette) (2017)',
 'Ratatouille (2007)',
 'How to Train Your Dragon (2010)',
 'The Incredibles (2004)',
 '101 Dalmatians (1961)',
 'Fantasia (1940)',
 'Tower (2016)',
 'Monsters, Inc. (2001)',
 'The Iron Giant (1999)',
 'Beauty and the Beast (1991)',
 'Spirited Away (2002)',
 'Anomalisa (2015)',
 'Song Of The Sea (2014)',
 'Your Name. (Kimi No Na Wa.) (2017)',
 'Only Yesterday (2016)',
 'Waltz with Bashir (2008)',
 'Antz (1998)',
 'Persepolis (2007)',
 'The Nightmare Before Christmas (1993)',
 'The Red Turtle (La tortue rouge) (2017)',
 'The Secret World of Arrietty (2012)',
 'Ernest & Célestine (2014)',
 'Long Way North (Tout en haut du monde) (2016)',
 'April and the Extraordinary World (Avril et le monde truqué) (2016)',
 'Fantastic Mr. Fox (2009)',
 'Aladdin (1992)',
 'Ghost in the Shell (1996)',
 'How to Train Your Dragon 2 (2014)',
 'Coraline (2009)',
 'The Triplets of Belleville (2003)',
 'Bambi (1942)',
 'Frozen (2013)',
 'James and the Giant Peach (1996)',
 'The Little Mermaid (1989)',
 'Big Hero 6 (2014)',
 "A Bug's Life (1998)",
 'Ponyo (2009)',
 'Princess Mononoke (Mononoke-hime) (1999)',
 'Arthur Christmas (2011)',
 'Mary and Max (2009)',
 'My Neighbor Totoro (1988)',
 'The Little Prince (2016)',
 'Bolt (2008)',
 'Boy and the World (O Menino e o Mundo) (2015)',
 'Tangled (2010)',
 'The Simpsons Movie (2007)',
 'Miss Hokusai (Sarusuberi: Miss Hokusai) (2016)',
 'When Marnie Was There (2015)',
 'Shrek (2001)',
 'Shrek 2 (2004)',
 'The Wind Rises (2014)',
 'Frankenweenie (2012)',
 'Kung Fu Panda (2008)',
 'Winnie the Pooh (2011)',
 'Rango (2011)',
 "The Illusionist (L'illusionniste) (2010)",
 'Millennium Actress (Sennen joyû) (2001)',
 'ParaNorman (2012)',
 'The Secret of Kells (2010)',
 'Tarzan (1999)',
 'Kung Fu Panda 3 (2016)',
 "Howl's Moving Castle (2005)",
 'The Peanuts Movie (2015)',
 'Wreck-it Ralph (2012)',
 'The Boy And The Beast (Bakemono No Ko) (2016)',
 'My Dog Tulip (2010)',
 'Cloudy With a Chance of Meatballs (2009)',
 'The Pirates! Band of Misfits (2012)',
 'Akira (1988)',
 'Tokyo Godfathers (2003)',
 'Mulan (1998)',
 'Lilo & Stitch (2002)',
 'The Princess and the Frog (2009)',
 "The Emperor's New Groove (2000)",
 'Sausage Party (2016)',
 "Tim Burton's Corpse Bride (2005)",
 'Phantom Boy (2016)',
 'Anastasia (1997)',
 'Puss in Boots (2011)',
 'The Cabinet of Dr. Caligari (Das Cabinet des Dr. Caligari) (1920)',
 'Nosferatu, a Symphony of Horror (Nosferatu, eine Symphonie des Grauens) (Nosferatu the Vampire) (1922)',
 'La Grande illusion (Grand Illusion) (1938)',
 'The Battle of Algiers (La Battaglia di Algeri) (1967)',
 'Rashômon (1951)',
 'M (1931)',
 'The 400 Blows (Les Quatre cents coups) (1959)',
 "Army of Shadows (L'Armée des ombres) (1969)",
 'The Conformist (1970)',
 'Tokyo Story (Tôkyô monogatari) (1953)',
 'Open City (1946)',
 'The Wages of Fear (1953)',
 'Battleship Potemkin (1925)',
 'The Leopard (1963)',
 'La Dolce Vita (1960)',
 'Let the Right One In (2008)',
 'Playtime (1973)',
 'The Rules of the Game (La règle du jeu) (1939)',
 'The Discreet Charm Of The Bourgeoisie (Le Charme Discret de la Bourgeoisie) (1972)',
 'The Salesman (Forushande) (2017)',
 "Things to Come (L'avenir) (2016)",
 'Gloria (2014)',
 'Tampopo (1985)',
 'Pather Panchali (1955)',
 '8 1/2 (1963)',
 'A Separation (2011)',
 'Eyes Without a Face (1962)',
 'The Umbrellas of Cherbourg (Les Parapluies de Cherbourg) (1964)',
 'Three Colors: Red (Trois couleurs: Rouge) (1994)',
 'The Tale of the Princess Kaguya (2014)',
 'Timbuktu (2015)',
 'Le Cercle Rouge (1970)',
 'Three Colors: Blue (Trois Couleurs: Bleu) (1993)',
 'Sing Street (2016)',
 'GETT: The Trial of Viviane Amsalem (2015)',
 'Amy (2015)',
 'Truman (2017)',
 'Phoenix (2015)',
 'Solaris (1976)',
 'Jiro Dreams of Sushi (2012)',
 'Aruitemo Aruitemo (Still Walking) (2008)',
 'Project Nim (2011)',
 'Wings of Desire (1987)',
 'Le goût des autres (The Taste of Others) (2000)',
 'Todo sobre mi madre (All About My Mother) (1999)',
 'The Happiest Day in the Life of Olli Mäki (Hymyilevä mies) (2017)',
 'Waste Land (2010)',
 "The Band's Visit (2007)",
 'The Vanishing (Spoorloos) (1988)',
 'Afghan Star (2009)',
 'Poetry (2011)',
 'Sex, Lies, and Videotape (1989)',
 'The Handmaiden (Ah-ga-ssi) (2016)',
 'The Wailing (Goksung) (2016)',
 'About Elly (2015)',
 'A Prophet (Un prophete) (2010)',
 'Under The Shadow (2016)',
 "Elevator to the Gallows (Ascenseur pour l'échafaud) (1958)",
 'Last Train Home (2010)',
 'Mafioso (1964)',
 'Fireworks Wednesday (Chaharshanbe-soori) (2016)',
 'This Is Not a Film (2012)',
 'Monsieur Lazhar (2012)',
 'After the Storm (Umi yori mo mada fukaku) (2017)',
 'Graduation (Bacalaureat) (2017)',
 "L'Avventura (1960)",
 'Nostalgia for the Light (2011)',
 'Bus 174 (Ônibus 174) (2003)',
 'Maria Full of Grace (2004)',
 'Moolaadé (2004)',
 'Rivers and Tides: Andy Goldsworthy Working With Time (2002)',
 'Band of Outsiders (Bande à part) (1964)',
 'The Class (2008)',
 'The Man Without a Past (2002)',
 'Aquarius (2016)',
 'Amour (2012)',
 'The Look of Silence (2015)',
 'The Kid with a Bike (2012)',
 'Rififi (Du Rififi Chez les Hommes) (1956)',
 'The Seventh Seal (Det Sjunde inseglet) (1957)',
 'Oslo, August 31st (2012)',
 'An Education (2009)',
 'The Wizard of Oz (1939)',
 'Citizen Kane (1941)',
 'The Third Man (1949)',
 'All About Eve (1950)',
 'Modern Times (1936)',
 'It Happened One Night (1934)',
 "Singin' in the Rain (1952)",
 'Casablanca (1942)',
 'Psycho (1960)',
 'Laura (1944)',
 "A Hard Day's Night (1964)",
 'North by Northwest (1959)',
 'Repulsion (1965)',
 'Sunset Boulevard (1950)',
 'Rear Window (1954)',
 'The Bride of Frankenstein (1935)',
 'The Philadelphia Story (1940)',
 'All Quiet on the Western Front (1930)',
 '12 Angry Men (Twelve Angry Men) (1957)',
 'A Streetcar Named Desire (1951)',
 'Dr. Strangelove Or How I Learned to Stop Worrying and Love the Bomb (1964)',
 'Frankenstein (1931)',
 'Vertigo (1958)',
 'Rebecca (1940)',
 "Rosemary's Baby (1968)",
 'Touch of Evil (1958)',
 'Gone With the Wind (1939)',
 'The Last Picture Show (1971)',
 'The Grapes of Wrath (1940)',
 'Roman Holiday (1953)',
 'On the Waterfront (1954)',
 'Chinatown (1974)',
 'Anatomy of a Murder (1959)',
 'The Lady Vanishes (1938)',
 'Cool Hand Luke (1967)',
 'An American in Paris (1951)',
 "It's a Wonderful Life (1946)",
 'The Gold Rush (1925)',
 'The Red Shoes (1948)',
 'Sweet Smell of Success (1957)',
 'To Be or Not to Be (1942)',
 'The Big Sleep (1946)',
 'Mary Poppins (1964)',
 'City Lights (1931)',
 'Invasion of the Body Snatchers (1956)',
 'Barry Lyndon (1975)',
 'Miracle on 34th Street (1947)',
 'His Girl Friday (1940)',
 'Freaks (1932)',
 'Gentlemen Prefer Blondes (1953)',
 'Mean Streets (1973)',
 'The Manchurian Candidate (1962)',
 'The Best Years of Our Lives (1946)',
 'Forbidden Planet (1956)',
 'The Day the Earth Stood Still (1951)',
 'Duck Soup (1933)',
 'Some Like It Hot (1959)',
 '2001: A Space Odyssey (1968)',
 'Bringing Up Baby (1938)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'Sunrise: A Song of Two Humans (1927)',
 'Peeping Tom (1960)',
 'The Birds (1963)',
 'Rebel Without a Cause (1955)',
 'Night of the Living Dead (1968)',
 "Don't Look Now (1973)",
 'The Apartment (1960)',
 'To Catch a Thief (1955)',
 'Get Out (2017)',
 'La La Land (2016)',
 'Monty Python and the Holy Grail (1975)',
 'Before Midnight (2013)',
 'Love & Friendship (2016)',
 'The Artist (2011)',
 'Annie Hall (1977)',
 'Paterson (2016)',
 "Don't Think Twice (2016)",
 'The Big Sick (2017)',
 'Airplane! (1980)',
 'Big (1988)',
 'Paddington (2015)',
 'Birdman (2014)',
 'The Grand Budapest Hotel (2014)',
 'Spy (2015)',
 'Before Sunrise (1995)',
 'Moonrise Kingdom (2012)',
 'Sideways (2004)',
 'Enough Said (2013)',
 'The Nice Guys (2016)',
 'The Muppets (2011)',
 'Amadeus (1984)',
 'Bull Durham (1988)',
 'The Player (1992)',
 'Tangerine (2015)',
 'What We Do In The Shadows (2015)',
 'Sense and Sensibility (1995)',
 'Brazil (1985)',
 'Repo Man (1984)',
 'We Are the Best! (2014)',
 'Lost In Translation (2003)',
 'The Truman Show (1998)',
 "Monty Python's Life of Brian (1979)",
 'Manhattan (1979)',
 'Broadcast News (1987)',
 'Say Anything... (1989)',
 'Le Havre (2011)',
 'The Edge of Seventeen (2016)',
 'Groundhog Day (1993)',
 'Hairspray (1988)',
 'Juno (2007)',
 'My Fair Lady (1964)',
 'Silver Linings Playbook (2012)',
 'Man on Wire (2008)',
 'I Am Not Your Negro (2017)',
 'Life Itself (2014)',
 'The Last Waltz (1978)',
 '20 Feet From Stardom (2013)',
 'Taxi to the Dark Side (2007)',
 'Weiner (2016)',
 'Blackfish (2013)',
 'Murderball (2005)',
 "Jodorowsky's Dune (2014)",
 'Deliver Us from Evil (2006)',
 'Inside Job (2010)',
 'Cameraperson (2016)',
 'Anvil! The Story of Anvil (2009)',
 'Seymour: An Introduction (2015)',
 'The Missing Picture (2014)',
 'The Square (Al Midan) (2013)',
 'The Fog of War: Eleven Lessons from the Life of Robert S. McNamara (2003)',
 'Spellbound (2002)',
 'Kedi (2017)',
 'Bright Lights: Starring Carrie Fisher and Debbie Reynolds (2017)',
 'Ingrid Bergman in Her Own Words (Jag är Ingrid) (2015)',
 'Mr. Death: The Rise and Fall of Fred A. Leuchter, Jr. (1999)',
 'City of Ghosts (2017)',
 'Hoop Dreams (1994)',
 'Citizenfour (2014)',
 'O.J.: Made in America (2016)',
 'The Interrupters (2011)',
 'How to Survive a Plague (2012)',
 'Sound City (2013)',
 'Iris (2015)',
 'The Act Of Killing (2013)',
 'Capturing the Friedmans (2003)',
 'The Invisible War (2012)',
 '3 And 1/2 Minutes, 10 Bullets (2015)',
 'Kurt Cobain: Montage Of Heck (2015)',
 'Everyday Sunshine: The Story of Fishbone (2011)',
 'We Were Here (2011)',
 'Enron: The Smartest Guys in the Room (2005)',
 'Bill Cunningham New York (2011)',
 'When We Were Kings (1996)',
 'Bowling for Columbine (2002)',
 'Listen To Me Marlon (2015)',
 'Red Army (2015)',
 'The Overnighters (2014)',
 'Elaine Stritch: Shoot Me (2014)',
 '56 Up (2013)',
 'Hitchcock/Truffaut (2015)',
 'The King of Kong: A Fistful of Quarters (2007)',
 'March of the Penguins (2005)',
 'West of Memphis (2012)',
 'The War Tapes (2006)',
 'Stories We Tell (2013)',
 'Cave of Forgotten Dreams (2011)',
 'Heart of a Dog (2015)',
 'Winged Migration (2003)',
 'Ai Weiwei: Never Sorry (2012)',
 'Mea Maxima Culpa: Silence In The House Of God (2012)',
 'Beware Of Mr. Baker (2012)',
 'Undefeated (2012)',
 'Marwencol (2010)',
 'The Devil Came on Horseback (2007)',
 'Dark Horse (2016)',
 'The Island President (2012)',
 'Blindsight (2006)',
 'Finders Keepers (2015)',
 'Call Me Kuchu (2013)',
 'For the Bible Tells Me So (2007)',
 'Almost Holy (Crocodile Gennadiy) (2016)',
 "Jafar Panahi's Taxi (2015)",
 'Let The Fire Burn (2013)',
 'Food, Inc. (2009)',
 'Welcome To Leith (2015)',
 'Restrepo (2010)',
 'Born To Be Wild (2011)',
 'Surfwise (2007)',
 'The Life and Times of Hank Greenberg (2000)',
 'Chavez: Inside the Coup (2003)',
 'The Life of Reilly (2007)',
 'Exit Through The Gift Shop (2010)',
 'Wordplay (2006)',
 'Burma VJ: Reporter i et Lukket Land (Burma VJ: Reporting from a Closed Country) (2008)',
 'Control Room (2004)',
 'No End in Sight (2007)',
 'Born Into Brothels (2004)',
 'The Imposter (2012)',
 'Muscle Shoals (2013)',
 'The Godfather (1972)',
 'Boyhood (2014)',
 'Moonlight (2016)',
 'The Maltese Falcon (1941)',
 '12 Years a Slave (2013)',
 'Gravity (2013)',
 'Spotlight (2015)',
 'Taxi Driver (1976)',
 'Selma (2015)',
 'Argo (2012)',
 'Alien (1979)',
 'Bicycle Thieves (Ladri di biciclette) (1949)',
 'Arrival (2016)',
 'The Night of the Hunter (1955)',
 'Manchester by the Sea (2016)',
 'The Babadook (2014)',
 'The Wrestler (2008)',
 'L.A. Confidential (1997)',
 'Brooklyn (2015)',
 'Hell or High Water (2016)',
 'The Godfather, Part II (1974)',
 'Creed (2015)',
 'The Social Network (2010)',
 'Whiplash (2014)',
 'Short Term 12 (2013)',
 'Mud (2013)',
 'Her (2013)',
 'Nightcrawler (2014)',
 'Room (2015)',
 'Mr. Turner (2014)',
 'Double Indemnity (1944)',
 'Beauty and The Beast (La Belle et la bête) (1946)',
 'It Follows (2015)',
 "Pan's Labyrinth (2006)",
 'Evil Dead 2: Dead by Dawn (1987)',
 'The Witch (2016)',
 'The Cabin in the Woods (2012)',
 'The Silence of the Lambs (1991)',
 'The Innocents (1961)',
 'Cat People (1942)',
 'Drag Me to Hell (2009)',
 'The Evil Dead (1981)',
 'Invasion of the Body Snatchers (1978)',
 'Carrie (1976)',
 'Young Frankenstein (1974)',
 'The Love Witch (2016)',
 'A Girl Walks Home Alone at Night (2014)',
 'The Loved Ones (2012)',
 'Halloween (1978)',
 'Nosferatu: Phantom der Nacht (Nosferatu the Vampyre) (1979)',
 'Room 237 (2013)',
 'Re-Animator (1985)',
 'A Nightmare on Elm Street (1984)',
 'Green Room (2016)',
 'Dracula (1931)',
 'Train to Busan (Busanhaeng) (2016)',
 'The Host (2007)',
 'Shaun of the Dead (2004)',
 'Zombieland (2009)',
 'Suspiria (1977)',
 'What Ever Happened to Baby Jane? (1962)',
 'Raw (2017)',
 'Eraserhead (1977)',
 'Phantom Of The Opera (1925)',
 'The Fly (1986)',
 'We Are Still Here (2015)',
 'It Comes At Night (2017)',
 'The Wicker Man (1973)',
 'The Texas Chainsaw Massacre (1974)',
 'Little Shop of Horrors (1986)',
 'Let Me In (2010)',
 "Don't Breathe (2016)",
 'The Shining (1980)',
 'The Exorcist (1973)',
 'The Dead Zone (1983)',
 'The Conjuring (2013)',
 'Misery (1990)',
 'Poltergeist (1982)',
 'The Blair Witch Project (1999)',
 '28 Days Later (2003)',
 'An American Werewolf in London (1981)',
 'Bone Tomahawk (2015)',
 'The Orphanage (2007)',
 'Cronos (1994)',
 'Near Dark (1987)',
 "Donnie Darko: The Director's Cut (2004)",
 'Henry: Portrait of a Serial Killer (1986)',
 'Russian Ark (2002)',
 'Spring (2015)',
 'This Is the End (2013)',
 'Chronicle (2012)',
 'The Descent (2006)',
 'The Omen (1976)',
 'Goodnight Mommy (2015)',
 "Exorcist: The Version You've Never Seen (2000)",
 'Slither (2006)',
 'Ginger Snaps (2001)',
 'Altered States (1980)',
 'Gremlins (1984)',
 'Grindhouse (2007)',
 'Backcountry (2015)',
 'A Field in England (2014)',
 'Paranormal Activity (2009)',
 'The Autopsy of Jane Doe (2016)',
 'We Are What We Are (2013)',
 'The House of the Devil (2009)',
 "Dracula: Pages From a Virgin's Diary (2003)",
 'Dressed to Kill (1980)',
 'The Conjuring 2 (2016)',
 'They Live (1988)',
 'The Others (2001)',
 'E.T. The Extra-Terrestrial (1982)',
 'Hugo (2011)',
 'Enchanted (2007)',
 'Queen of Katwe (2016)',
 'Harry Potter and the Prisoner of Azkaban (2004)',
 'Jason and the Argonauts (1963)',
 'A Monster Calls (2017)',
 'Spy Kids (2001)',
 'How Green Was My Valley (1941)',
 'Harry Potter and the Goblet of Fire (2005)',
 'That Thing You Do! (1996)',
 "L'Heure d'été (Summer Hours) (2009)",
 'First Position (2012)',
 'Lassie (2006)',
 "Pete's Dragon (2016)",
 'Duma (2005)',
 'A Christmas Story (1983)',
 'The Sound of Music (1965)',
 'The Karate Kid (1984)',
 'Harry Potter and the Half-Blood Prince (2009)',
 "Pee-wee's Big Adventure (1985)",
 'Millions (2005)',
 'Freaky Friday (2003)',
 'The Muppet Movie (1979)',
 'Cinderella (2015)',
 'Disneynature Bears (2014)',
 'Field of Dreams (1989)',
 'Charlie and the Chocolate Factory (2005)',
 'Bridge to Terabithia (2007)',
 'Once (2007)',
 'West Side Story (1961)',
 "What's Love Got To Do With It? (1993)",
 'Nashville (1975)',
 'Marley (2012)',
 'Searching for Sugar Man (2012)',
 'Hairspray (2007)',
 'Love & Mercy (2015)',
 "Les Plages d'Agnès (The Beaches of Agnes) (2008)",
 'Festival Express (2003)',
 '20,000 Days on Earth (2014)',
 'Crumb (1995)',
 'The Full Monty (1997)',
 'Pina (2011)',
 'Crazy Heart (2009)',
 'Funny Girl (1968)',
 '49 Up (2006)',
 'Lagaan: Once Upon a Time in India (2001)',
 'Lost in La Mancha (2003)',
 'My Kid Could Paint That (2007)',
 'End of the Century: The Story of the Ramones (2004)',
 'Hedwig and the Angry Inch (2001)',
 'The Wrecking Crew (2015)',
 'Marina Abramovic: The Artist Is Present (2012)',
 "Dave Chappelle's Block Party (2006)",
 'The Filth and the Fury (2000)',
 'The Sapphires (2013)',
 'Los Angeles Plays Itself (2003)',
 'U2 3D (2007)',
 'A Band Called Death (2013)',
 'My Architect (2004)',
 'Straight Outta Compton (2015)',
 'Sid and Nancy (1986)',
 'Every Little Step (2009)',
 'Florence Foster Jenkins (2016)',
 'Standing in the Shadows of Motown (2002)',
 'Scratch (2002)',
 'F for Fake (1974)',
 'Ballets Russes (2005)',
 'Sweeney Todd: The Demon Barber of Fleet Street (2007)',
 'Shine (1996)',
 'Chicago (2002)',
 'Neil Young: Heart of Gold (2006)',
 'Mistaken for Strangers (2014)',
 'Five Easy Pieces (1970)',
 'Buena Vista Social Club (1999)',
 'Born To Be Blue (2016)',
 'Topsy-Turvy (1999)',
 'The Devil and Daniel Johnston (2006)',
 'Caesar Must Die (2013)',
 'Metallica: Some Kind of Monster (2004)',
 'Saturday Night Fever (1977)',
 'Dig! (2004)',
 'Shine a Light (2008)',
 'Beats Rhymes & Life: The Travels of a Tribe Called Quest (2011)',
 'Who Killed the Electric Car? (2006)',
 'New York Doll (2005)',
 'The Commitments (1991)',
 'Young@Heart (2007)',
 'High Noon (1952)',
 'Strangers on a Train (1951)',
 'The Conversation (1974)',
 'Ex Machina (2015)',
 'Eye In The Sky (2016)',
 'Blood Simple (1984)',
 'The Crying Game (1992)',
 'Bridge of Spies (2015)',
 'Wake in Fright (2012)',
 'Blue Ruin (2014)',
 'Gone Girl (2014)',
 '10 Cloverfield Lane (2016)',
 'Blue Velvet (1986)',
 'Diabolique (Les Diaboliques) (1955)',
 'Children of Men (2006)',
 'The Imitation Game (2014)',
 'Wild Tales (2015)',
 'In the Heat of the Night (1967)',
 'In the Line of Fire (1993)',
 'Gone Baby Gone (2007)',
 'Dirty Harry (1971)',
 'Carol (2015)',
 'The Hustler (1961)',
 'Slumdog Millionaire (2008)',
 'Titanic (1997)',
 'Shakespeare in Love (1998)',
 'Before Sunset (2004)',
 'Eternal Sunshine Of The Spotless Mind (2004)',
 'The Graduate (1967)',
 'Up in the Air (2009)',
 'The Lunchbox (2014)',
 'The Town (2010)',
 'Four Weddings and a Funeral (1994)',
 "A Summer's Tale (2014)",
 'Midnight in Paris (2011)',
 'From Here to Eternity (1953)',
 'Big Night (1996)',
 'Million Dollar Baby (2004)',
 'About a Boy (2002)',
 'Revanche (2009)',
 'Hamlet (1996)',
 'Monsoon Wedding (2002)',
 'Knocked Up (2007)',
 'Hannah and Her Sisters (1986)',
 'Out of Sight (1998)',
 'Dave (1993)',
 'The Spectacular Now (2013)',
 'Open Hearts (Elsker Dig For Evigt) (2002)',
 'Mrs. Brown (1997)',
 'The General (1927)',
 'The Best of Youth (La meglio gioventù) (2003)',
 'Moonstruck (1987)',
 'Y Tu Mama Tambien (2001)',
 'Kung Fu Hustle (2005)',
 'Southside With You (2016)',
 'The Duke Of Burgundy (2015)',
 'Last Resort (2001)',
 'Eat Drink Man Woman (Yin shi nan nu) (1994)',
 "I'll See You in My Dreams (2015)",
 'Cinema Paradiso (Nuovo Cinema Paradiso) (1988)',
 'Howards End (1992)',
 'Lone Star (1996)',
 'Talk to Her (2002)',
 'Heavenly Creatures (1994)',
 'Brokeback Mountain (2005)',
 'Samson and Delilah (2010)',
 'Obvious Child (2014)',
 'Amélie (2001)',
 'Snowpiercer (2014)',
 'The Martian (2015)',
 'Live Die Repeat: Edge of Tomorrow (2014)',
 'Blade Runner (1982)',
 "L'année dernière à Marienbad (Last Year at Marienbad) (1961)",
 'District 9 (2009)',
 'Source Code (2011)',
 'The Hunger Games: Catching Fire (2013)',
 'Being John Malkovich (1999)',
 'Rogue One: A Star Wars Story (2016)',
 'Love Is Strange (2014)',
 'Finding Vivian Maier (2014)',
 'A Film Unfinished (2010)',
 'Bigger, Stronger, Faster* (2008)',
 'Sweetgrass (2009)',
 'Trouble the Water (2008)',
 'Cutie And The Boxer (2013)',
 'An Inconvenient Truth (2006)',
 'To Be and to Have (Etre et Avoir) (2003)',
 'Chasing Ice (2012)',
 'Sicko (2007)',
 'The Wild Parrots of Telegraph Hill (2005)',
 'The Salt of the Earth (2015)',
 'Super Size Me (2004)',
 'The Queen of Versailles (2012)',
 'I Am Divine (2013)',
 'Brooklyn Castle (2012)',
 'Deep Water (2006)',
 'Manakamana (2014)',
 'The Last of the Unjust (2014)',
 'Up the Yangtze (2007)',
 'Good Hair (2009)',
 'National Gallery (2014)',
 'Last Days in Vietnam (2014)',
 'The Arbor (2011)',
 'Plagues & Pleasures on the Salton Sea (2004)',
 'Promises (2002)',
 'The Gatekeepers (2013)',
 'The Endurance (2001)',
 'Touching the Void (2004)',
 'After Tiller (2013)',
 'The Story of the Weeping Camel (2004)',
 'Crude (2009)',
 'Rocky (1976)',
 'The Crash Reel (2013)',
 'The Fighter (2010)',
 'The Damned United (2009)',
 'Sugar (2008)',
 'Riding Giants (2004)',
 'Senna (2011)',
 'Dogtown and Z-Boys (2001)',
 'Tristram Shandy: A Cock & Bull Story (2005)',
 'Boxing Gym (2010)',
 'The Color of Money (1986)',
 'Up for Grabs (2005)',
 'Hoosiers (1986)',
 'Girlfight (2000)',
 'Tyson (2009)',
 'The Heart of the Game (2005)',
 'Chariots of Fire (1981)',
 'The Hurricane (1999)',
 'Looking for Eric (2010)',
 'The Armstrong Lie (2013)',
 'Major League (1989)',
 'Goon (2012)',
 'Beyond the Mat (1999)',
 'The Boxer (1997)',
 'Step Into Liquid (2003)',
 'Once in a Lifetime: The Extraordinary Story of the New York Cosmos (2006)',
 'Invictus (2009)',
 'Fed Up (2014)',
 'A League of Their Own (1992)',
 'Stoked: The Rise and Fall of Gator (2003)',
 'Talladega Nights: The Ballad of Ricky Bobby (2006)',
 "Gunnin' for That #1 Spot (2008)",
 'Invincible (2006)',
 'Bleed For This (2016)',
 'Ultimate X: The Movie (2002)',
 'The Blind Side (2009)',
 'More Than a Game (2009)',
 'Secretariat (2010)',
 'Dust to Glory (2005)',
 'NASCAR: The IMAX Experience (2004)',
 'The Sandlot (1993)',
 'Glory Road (2006)',
 'The Perfect Game (2010)',
 'Because of Winn-Dixie (2005)',
 'Steep (2007)',
 'First Descent (2005)',
 'Soul Surfer (2011)',
 'Goal! The Dream Begins (Goal!: The Impossible Dream) (2005)',
 'The Replacements (2000)',
 'Beerfest (2006)',
 'Grudge Match (2013)',
 'Jiminy Glick in Lalawood (2005)',
 'Playing for Keeps (2012)',
 'Behind the Candelabra (2013)',
 'The Return (2003)',
 "Being Elmo: A Puppeteer's Journey (2011)",
 'Best Worst Movie (2010)',
 'Not Quite Hollywood: The Wild, Untold Story of Ozploitation! (2008)',
 'American Movie (1999)',
 'Carlos (2010)',
 'The Normal Heart (2014)',
 "Alien: The Director's Cut (2003)",
 'The Kid Stays in the Picture (2002)',
 'Yoo-hoo, Mrs. Goldberg (2009)',
 'Saraband (2003)',
 'Electric Boogaloo: The Wild, Untold Story of Cannon Films (2012)',
 'Side by Side (2012)',
 'Joan Rivers: A Piece Of Work (2010)',
 "Corman's World: Exploits Of A Hollywood Rebel (2011)",
 'Tell Them Who You Are (2005)',
 'The Five Obstructions (2003)',
 'Doctor Zhivago (1965)',
 'This Film Is Not Yet Rated (2006)',
 'Trekkies (1999)',
 'Inside Deep Throat (2005)',
 "Outfoxed: Rupert Murdoch's War on Journalism (2004)",
 'Trumbo (2007)',
 'Confirmation (2016)',
 'Tupac: Resurrection (2003)',
 'Overnight (2004)',
 "Teacher's Pet (2004)",
 'POM Wonderful Presents: The Greatest Movie Ever Sold (2011)',
 "Fellini: I'm a Born Liar (2003)",
 'Only Human (Seres queridos) (2006)',
 'My Date With Drew (2004)',
 'Waking Sleeping Beauty (2010)',
 'Porn Star: The Legend of Ron Jeremy (2001)',
 'The Rugrats Movie (1998)',
 "I'm Still Here (2010)",
 'No Strings Attached (2011)',
 'Bamboozled (2000)',
 'The Real Cancun (2003)',
 'I Am (2011)',
 '15 Minutes (2001)',
 'Man of the Year (2006)',
 'Pokemon 3: The Movie (2001)',
 'The Honeymooners (2005)',
 'From Justin To Kelly (2003)',
 'The Good, the Bad and the Ugly (1966)',
 'Unforgiven (1992)',
 'Django Unchained (2012)',
 'The Man Who Shot Liberty Valance (1962)',
 'True Grit (1969)',
 '3:10 to Yuma (2007)',
 'Blazing Saddles (1974)',
 'The Magnificent Seven (1960)',
 'Butch Cassidy and the Sundance Kid (1969)',
 'McCabe & Mrs. Miller (1971)',
 'The Proposition (2005)',
 'The Three Burials of Melquiades Estrada (2006)',
 "Meek's Cutoff (2011)",
 'Dances With Wolves (1990)',
 'The Homesman (2014)',
 'The Hateful Eight (2015)',
 'The Good, the Bad, the Weird (Joheun-nom, Nabbeun-nom, Isanghan-nom) (2010)',
 'Open Range (2003)',
 'Shanghai Noon (2000)',
 'Appaloosa (2008)',
 'The Assassination of Jesse James by the Coward Robert Ford (2007)',
 'Red Hill (2010)',
 'In a Valley of Violence (2016)',
 'The Horse Whisperer (1998)',
 'Blackthorn (2011)',
 'Tombstone (1993)',
 'Fah talai jone (Tears of the Black Tiger) (2007)',
 'The Keeping Room (2015)',
 'The Salvation (2015)',
 'The Magnificent Seven (2016)',
 'The Rover (2014)',
 'Maverick (1994)',
 'The Claim (2000)',
 'The Missing (2003)',
 "Heaven's Gate (1980)",
 'Legends of the Fall (1994)',
 'The Quick and the Dead (1995)',
 'The Killer Inside Me (2010)',
 'Sukiyaki Western Django (2008)',
 'Ned Kelly (2003)',
 'Seraphim Falls (2007)',
 'Down in the Valley (2006)',
 'Cowboys & Aliens (2011)',
 'Forsaken (2016)',
 'Jane Got a Gun (2016)',
 'A Million Ways to Die in the West (2014)',
 'The Lone Ranger (2013)',
 'Brimstone (2017)',
 'The Alamo (2004)',
 'Wild Wild West (1999)',
 'Priest (2011)',
 'American Outlaws (2001)',
 'Jonah Hex (2010)',
 'September Dawn (2007)',
 'Texas Rangers (2001)']

In [68]:
# Fetch the Wikipedia page for the first title, all_movie_names[0][0]
# (the recorded run resolved it to 'Mad Max: Fury Road').
v = wikipedia.page(all_movie_names[0][0])

In [58]:
# Echo the page object to confirm which article the lookup resolved to.
v


Out[58]:
<WikipediaPage 'Mad Max: Fury Road'>

In [70]:
# List the public attributes and methods of the page object.
# Filtering on the leading underscore replaces the hard-coded dir(v)[39:]
# slice, whose offset silently goes wrong as soon as the class gains or
# loses a private/dunder name.
for attribute_name in dir(v):
    if not attribute_name.startswith('_'):
        print(attribute_name)


categories
content
coordinates
html
images
links
original_title
pageid
parent_id
references
revision_id
section
sections
summary
title
url

In [69]:
# Parse the page's rendered HTML (v.html()) with the lxml parser so the
# infobox table can be searched for.
soup = BeautifulSoup(v.html(), 'lxml')

In [72]:
# Grab the first <table> with class "infobox vevent" (the film infobox);
# find() gives None when nothing matches.
wikipedia_api_info = soup.find("table",{"class":"infobox vevent"})

In [84]:
# Build a lookup from each infobox row's header text to its data cell.
# Rows without a <th> header are skipped.
result = {}
for row in wikipedia_api_info.find_all('tr'):
    header_cell = row.find('th')
    if header_cell:
        result[header_cell.text] = row.find('td')

In [98]:
# Show which infobox row labels were captured as keys.
result.keys()


Out[98]:
dict_keys(['Mad Max: Fury Road', 'Directed by', 'Produced by', 'Written by', 'Starring', 'Music by', 'Cinematography', 'Edited by', 'Productioncompany ', 'Distributed by', 'Release date', 'Running time', 'Country', 'Language', 'Budget', 'Box office'])

In [97]:
# Director: the cell's text with surrounding whitespace removed.
result['Directed by'].text.strip()


Out[97]:
'George Miller'

In [114]:
# Month of the first listed release date: the first <li>'s text is split on
# non-breaking spaces ("\xa0") and the second piece (the month name) is kept.
result['Release date'].li.text.split("\xa0")[1]


Out[114]:
'May'

In [119]:
# Running time: keep whatever precedes " minutes" in the cell text.
result['Running time'].text.strip().split(" minutes")[0]


Out[119]:
'120'

In [126]:
# Box office: cut the text at the first "[" to drop the citation marker(s).
result['Box office'].text.strip().split('[')[0]


Out[126]:
'$378.9 million'

In [131]:
# Budget: cut the text at the first "[" to drop the citation marker(s).
result['Budget'].text.strip().split("[")[0]


Out[131]:
'$150 million'

In [134]:
# Language: the cell's text with surrounding whitespace removed.
result['Language'].text.strip()


Out[134]:
'English'

In [28]:
# Split the infobox's text on newlines for a quick look at its contents.
# wikipedia_api_info is the bs4 Tag returned by soup.find(...); a Tag has no
# str methods (attribute access such as `.strip` is treated as a child-tag
# lookup and yields None), so extract the text before stripping.
# Very messy - let's try wptools instead!
wikipedia_api_info.get_text().strip().split("\n")


Out[28]:
['Mad Max: Fury Road',
 'Theatrical release posterDirected by',
 'George MillerProduced by',
 '',
 ' Doug Mitchell',
 ' George Miller',
 ' PJ Voeten',
 'Written by',
 '',
 ' George Miller',
 ' Brendan McCarthy',
 ' Nico Lathouris',
 'Starring',
 '',
 ' Tom Hardy',
 ' Charlize Theron',
 ' Nicholas Hoult',
 ' Hugh Keays-Byrne',
 ' Rosie Huntington-Whiteley',
 ' Riley Keough',
 ' Zoë Kravitz',
 ' Abbey Lee',
 ' Courtney Eaton',
 'Music by',
 'Junkie XLCinematography',
 'John SealeEdited by',
 'Margaret SixelProductioncompany ',
 '',
 ' Village Roadshow Pictures',
 ' Kennedy Miller Mitchell',
 ' RatPac-Dune Entertainment',
 'Distributed by',
 '',
 ' Warner Bros. Pictures (United States/International)',
 ' Roadshow Films (Australia)',
 'Release date',
 ' 7\xa0May\xa02015\xa0(2015-05-07) (TCL Chinese Theatre)',
 ' 14\xa0May\xa02015\xa0(2015-05-14) (Australia)',
 ' 15\xa0May\xa02015\xa0(2015-05-15) (United States)',
 ' ',
 ' ',
 'Running time',
 '120 minutes[1]Country',
 '',
 ' Australia[2]',
 ' United States[2][3]',
 'Language',
 'EnglishBudget',
 '$150 million[4][5]Box office',
 '$378.9 million[6]']

In [29]:
# wptools is imported here, at its first use, rather than with the other imports.
import wptools
# Fetch the wptools page for the same first title (Mad Max: Fury Road in the
# recorded output); .get() prints the request log and page summary shown below.
x = wptools.page(all_movie_names[0][0]).get()


Mad_Max:_Fury_Road (en)
{
  lang: en
  title: Mad_Max:_Fury_Road
}
en.wikipedia.org (query) Mad_Max:_Fury_Road
en.wikipedia.org (parse) 36426373
www.wikidata.org (wikidata) Q1757288
www.wikidata.org (claims) Q11424|Q229390|Q446960|Q1341051|Q188473|Q31922...
en.wikipedia.org (imageinfo) File:Mad Max Fury Road.jpg|File:Mad Max Fur...
Mad_Max:_Fury_Road (en)
{
  cache: <dict(5)> {claims, imageinfo, parse, query, wikidata}
  claims: <dict(29)> {Q11424, Q1341051, Q16193207, Q16728739, Q17100...
  description: 2015 Australian post-apocalyptic action film
  extext: <str(2346)> _**Mad Max: Fury Road**_ is a 2015 action film...
  extract: <str(2426)> <p><i><b>Mad Max: Fury Road</b></i> is a 2015...
  images: <list(2)>
  infobox: <dict(17)> {alt, caption, cinematography, country, direct...
  label: Mad Max: Fury Road
  lang: en
  modified: <dict(2)> {page, wikidata}
  pageid: 36426373
  parsetree: <str(106757)> <root><template><title>Use Australian Eng...
  props: <dict(9)> {P136, P161, P18, P31, P345, P57, P577, P856, P86...
  random: Live Phish Volume 7
  title: Mad_Max:_Fury_Road
  url: https://en.wikipedia.org/wiki/Mad_Max:_Fury_Road
  url_raw: https://en.wikipedia.org/wiki/Mad_Max:_Fury_Road?action=raw
  what: <list(2)>
  wikibase: Q1757288
  wikidata: <dict(9)> {IMDB, cast, composer, director, genre, image,...
  wikidata_url: https://www.wikidata.org/wiki/Q1757288
  wikitext: <str(88968)> {{Use Australian English|date=June 2011}}{{...
}

In [32]:
# wikidata is a plain dict whose contents overlap with the infobox.
# TODO: use it to extract the director name and the release date.
x.wikidata


Out[32]:
{'IMDB': 'tt1392190',
 'cast': ['Tom Hardy',
  'Charlize Theron',
  'Nicholas Buenote Hoult',
  'Josh Helman',
  'Nathan Jones',
  'Zoë Kravitz',
  'Rosie Huntington-Whiteley',
  'Riley Keough',
  'Hugh Keays-Byrne',
  'Abbey Lee Kershaw',
  'Courtney Eaton',
  'John Howard',
  'Richard Carter',
  'Angus Sampson',
  'Megan Gale',
  'Melissa Jaffer',
  'Gillian Jones',
  'Joy Smithers',
  'Richard Norton',
  'Lee Perry'],
 'composer': 'Junkie XL',
 'director': 'George Miller',
 'genre': ['post-apocalyptic film',
  'action film',
  'adventure film',
  'science fiction film',
  'thriller film'],
 'image': 'Mad Max Fury Road film Logo.png',
 'instance': ['film', '3D film'],
 'pubdate': ['+2015-05-14T00:00:00Z',
  '+2015-05-15T00:00:00Z',
  '+2015-05-13T00:00:00Z',
  '+2015-05-22T00:00:00Z',
  '+2015-05-21T00:00:00Z'],
 'website': 'http://www.madmaxmovie.com/'}

In [34]:
# Director name taken straight from the wikidata dict, then echoed to check it.
director = x.wikidata['director']
director


Out[34]:
'George Miller'

In [40]:
# 'pubdate' holds timestamp strings such as '+2015-05-14T00:00:00Z'.
month_released = x.wikidata['pubdate']
# Take the first listed timestamp (not necessarily the earliest), drop the
# '+' padding and the time part, then read the month number off the date.
first_date_text = month_released[0].strip('+').partition('T')[0]
datetime.strptime(first_date_text, "%Y-%m-%d").month


Out[40]:
5

In [66]:
# Feed the raw wikitext to BeautifulSoup (lxml parser) to test whether the
# infobox table can be located in it.
soup_new = BeautifulSoup(x.wikitext, 'lxml')

In [65]:
# Look for the infobox table in the parsed wikitext. No output was recorded
# for this cell. NOTE(review): presumably find() returned None because
# wikitext is wiki markup rather than HTML - confirm.
soup_new.find('table', {"class":"infobox vevent"})

In [125]:
# Inspect the infobox dict parsed by wptools; the values are still raw wiki
# markup (e.g. '[[John Seale]]', '{{plainlist|...}}').
x.infobox


Out[125]:
{'alt': 'Theatrical release poster',
 'caption': 'Theatrical release poster',
 'cinematography': '[[John Seale]]',
 'country': '{{plainlist|\n* Australia|ref| name="Mad Max Fury Road"|{{cite web | url=http://www.bfi.org.uk/films-tv-people/5553f6745dee3 | title=\'\'Mad Max Fury Road\'\' | work=[[British Film Institute]] | date=2015 |access-date=27 November 2016}}|</ref>|\n* United States|ref| name="Mad Max Fury Road"|{{cite web | url=http://www.bfi.org.uk/films-tv-people/5553f6745dee3 | title=\'\'Mad Max Fury Road\'\' | work=[[British Film Institute]] | date=2015 |access-date=27 November 2016}}|</ref>|ref|{{cite web|url=https://www.nytimes.com/movies/movie/439675/Mad-Max-Fury-Road/overview|title=Mad Max: Fury Road (2015)|work=[[The New York Times]]|access-date=21 June 2015}}|</ref>|\n}}',
 'director': '[[George Miller (director)|George Miller]]',
 'distributor': '{{Plainlist|\n* [[Warner Bros. Pictures]] |small|(United States/International)|\n* [[Village Roadshow Pictures|Roadshow Films]] |small|(Australia)|\n}}',
 'editing': '[[Margaret Sixel]]',
 'errors': [b"<part><name> runtime        </name><equals>=</equals><value> 120 minutes<comment>&lt;!--Theatrical runtime: 120:03--&gt;</comment><ext><name>ref</name><attr/><inner>{{cite web | url=http://www.bbfc.co.uk/releases/mad-max-fury-road-film-0 | title=''MAD MAX: FURY ROAD'' (15) | work=[[British Board of Film Classification]] | date=5 May 2015 | access-date=5 May 2015}}</inner><close>&lt;/ref&gt;</close></ext>\n</value></part>",
  b"<part><name> budget         </name><equals>=</equals><value> $150 million<ext><name>ref</name><attr/><inner>{{cite web|url=http://www.ew.com/ew/article/0,,20610393_20830220,00.html|title=Drive Like Hell|last=Sperling|first=Nicole|work=[[Entertainment Weekly]]|date=7 July 2014|access-date=9 August 2014}}</inner><close>&lt;/ref&gt;</close></ext><ext><name>ref</name><attr/><inner>{{cite web|url=http://variety.com/2015/film/news/box-office-mad-max-fury-road-pitch-perfect-2-eye-40-million-openings-1201490072/|title=Box Office: 'Mad Max: Fury Road,' 'Pitch Perfect 2' Eye $40 Million Openings|first=Brent|last=Lang|work=[[Variety (magazine)|Variety]] |publisher=([[Penske Media Corporation]])|date=7 May 2015|access-date=8 May 2015}}</inner><close>&lt;/ref&gt;</close></ext>\n</value></part>",
  b'<part><name> gross          </name><equals>=</equals><value> $378.9 million<ext><name>ref</name><attr> name="BOM"</attr></ext>\n</value></part>'],
 'image': 'Mad Max Fury Road.jpg',
 'language': 'English',
 'music': '[[Junkie XL]]',
 'name': 'Mad Max: Fury Road',
 'producer': '{{plainlist|\n* [[Doug Mitchell (film producer)|Doug Mitchell]]\n* George Miller\n* PJ Voeten\n}}',
 'released': '{{Film date|2015|05|7|[[TCL Chinese Theatre]]|2015|05|14|Australia|2015|05|15|United States|df|=|y}}',
 'starring': '{{plainlist|\n* [[Tom Hardy]]\n* [[Charlize Theron]]\n* [[Nicholas Hoult]]\n* [[Hugh Keays-Byrne]]\n* [[Rosie Huntington-Whiteley]]\n* [[Riley Keough]]\n* [[Zoë Kravitz]]\n* [[Abbey Lee Kershaw|Abbey Lee]]\n* [[Courtney Eaton]]\n}}',
 'studio': '{{plainlist|\n* [[Village Roadshow Pictures]]\n* [[Kennedy Miller Mitchell]]\n* [[RatPac-Dune Entertainment]]\n}}',
 'writer': '{{plainlist|\n* George Miller\n* [[Brendan McCarthy]]\n* [[Nico Lathouris]]\n}}'}

In [126]:
# Short alias for the wptools infobox dict.
d = x.infobox

In [127]:
# Print every infobox field followed by a blank line for readability.
# The loop variables are deliberately not called `k`/`v`: `v` already holds
# the WikipediaPage object, and rebinding it here would break any earlier
# cell that is re-run afterwards.
for field_name, field_value in d.items():
    print(field_name, field_value)
    print()


name Mad Max: Fury Road

image Mad Max Fury Road.jpg

caption Theatrical release poster

alt Theatrical release poster

director [[George Miller (director)|George Miller]]

producer {{plainlist|
* [[Doug Mitchell (film producer)|Doug Mitchell]]
* George Miller
* PJ Voeten
}}

writer {{plainlist|
* George Miller
* [[Brendan McCarthy]]
* [[Nico Lathouris]]
}}

starring {{plainlist|
* [[Tom Hardy]]
* [[Charlize Theron]]
* [[Nicholas Hoult]]
* [[Hugh Keays-Byrne]]
* [[Rosie Huntington-Whiteley]]
* [[Riley Keough]]
* [[Zoë Kravitz]]
* [[Abbey Lee Kershaw|Abbey Lee]]
* [[Courtney Eaton]]
}}

music [[Junkie XL]]

cinematography [[John Seale]]

editing [[Margaret Sixel]]

studio {{plainlist|
* [[Village Roadshow Pictures]]
* [[Kennedy Miller Mitchell]]
* [[RatPac-Dune Entertainment]]
}}

distributor {{Plainlist|
* [[Warner Bros. Pictures]] |small|(United States/International)|
* [[Village Roadshow Pictures|Roadshow Films]] |small|(Australia)|
}}

released {{Film date|2015|05|7|[[TCL Chinese Theatre]]|2015|05|14|Australia|2015|05|15|United States|df|=|y}}

country {{plainlist|
* Australia|ref| name="Mad Max Fury Road"|{{cite web | url=http://www.bfi.org.uk/films-tv-people/5553f6745dee3 | title=''Mad Max Fury Road'' | work=[[British Film Institute]] | date=2015 |access-date=27 November 2016}}|</ref>|
* United States|ref| name="Mad Max Fury Road"|{{cite web | url=http://www.bfi.org.uk/films-tv-people/5553f6745dee3 | title=''Mad Max Fury Road'' | work=[[British Film Institute]] | date=2015 |access-date=27 November 2016}}|</ref>|ref|{{cite web|url=https://www.nytimes.com/movies/movie/439675/Mad-Max-Fury-Road/overview|title=Mad Max: Fury Road (2015)|work=[[The New York Times]]|access-date=21 June 2015}}|</ref>|
}}

language English

errors [b"<part><name> runtime        </name><equals>=</equals><value> 120 minutes<comment>&lt;!--Theatrical runtime: 120:03--&gt;</comment><ext><name>ref</name><attr/><inner>{{cite web | url=http://www.bbfc.co.uk/releases/mad-max-fury-road-film-0 | title=''MAD MAX: FURY ROAD'' (15) | work=[[British Board of Film Classification]] | date=5 May 2015 | access-date=5 May 2015}}</inner><close>&lt;/ref&gt;</close></ext>\n</value></part>", b"<part><name> budget         </name><equals>=</equals><value> $150 million<ext><name>ref</name><attr/><inner>{{cite web|url=http://www.ew.com/ew/article/0,,20610393_20830220,00.html|title=Drive Like Hell|last=Sperling|first=Nicole|work=[[Entertainment Weekly]]|date=7 July 2014|access-date=9 August 2014}}</inner><close>&lt;/ref&gt;</close></ext><ext><name>ref</name><attr/><inner>{{cite web|url=http://variety.com/2015/film/news/box-office-mad-max-fury-road-pitch-perfect-2-eye-40-million-openings-1201490072/|title=Box Office: 'Mad Max: Fury Road,' 'Pitch Perfect 2' Eye $40 Million Openings|first=Brent|last=Lang|work=[[Variety (magazine)|Variety]] |publisher=([[Penske Media Corporation]])|date=7 May 2015|access-date=8 May 2015}}</inner><close>&lt;/ref&gt;</close></ext>\n</value></part>", b'<part><name> gross          </name><equals>=</equals><value> $378.9 million<ext><name>ref</name><attr> name="BOM"</attr></ext>\n</value></part>']


In [114]:
# Break the '{{Film date|Y|M|D|place|...}}' template into its pipe-separated
# fields, without the braces and without the leading template name.
# str.strip() takes a *set of characters*, not a prefix, so the previous
# .strip('Film date|') could also eat matching letters at the end of the
# value (e.g. a trailing '|Australia' would become '|Austr'). The wrapper and
# the template name are therefore removed explicitly.
released_template = d['released'].strip()
if released_template.startswith('{{') and released_template.endswith('}}'):
    released_template = released_template[2:-2]
h = released_template.split("|")
if h and h[0].strip().lower() == 'film date':
    h = h[1:]

In [115]:
# Show the pipe-separated pieces of the release-date template.
h


Out[115]:
['2015',
 '05',
 '7',
 '[[TCL Chinese Theatre]]',
 '2015',
 '05',
 '14',
 'Australia',
 '2015',
 '05',
 '15',
 'United States',
 'df',
 '=',
 'y']

In [120]:
# The template fields repeat as year, month, day, place; for each
# 'United States' entry the month therefore sits two positions earlier.
for position, token in enumerate(h):
    if token != 'United States':
        continue
    month = h[position - 2]
    print(month)


05

In [ ]: