In [51]:
from multiprocessing import Pool #witness the power
import wikipedia
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import json
import re
import time
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from datetime import datetime
from helper_functions import *
%matplotlib inline

This notebook contains the code used to extract the Wikipedia infobox for every movie in my movie dictionary.

This was an extremely iterative process; however, it was worth the time it took, as it successfully obtained over 95% of the information I required. This notebook was critical to the success of this project.

I hope the code below serves as some inspiration to others scraping Wikipedia infoboxes - it is no easy task!

At this point, we have the Wikipedia page objects for all of our associated movies. For those movies that had no Wikipedia object, the information was entered manually (yes, manually - I know you're sighing. I was sighing too).

This notebook took approximately 6 hours to design, and it was an iterative process - many things broke in the extract_wiki_infobox function at the start.

As such, in order to save time and ensure some semblance of data integrity, I had to remove around 10-15 movies from the movie database.

In the end, we have 900 movies in our database. That does NOT mean, however, that every field for every movie has a valid value. Wikipedia and Rotten Tomatoes do not hold all the answers with regard to budget, box office takings, etc. As such, many movies will have a range of NaN values.

For simplicity, we will impute these missing values - yes, this is NOT best practice, as the movies span many years and the effects of inflation are not accounted for.
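As a rough sketch of what that imputation might look like once the data is in a DataFrame - the column names 'budget', 'boxoffice' and 'runtime' are placeholders of mine, and the median fill is just one simple choice:

In [ ]:
# Hypothetical sketch: fill missing numeric fields with the column median.
# Column names are placeholders and may differ from the final DataFrame.
def impute_numeric_columns(df, columns=("budget", "boxoffice", "runtime")):
    for col in columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
            df[col] = df[col].fillna(df[col].median())  # ignores inflation entirely
    return df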

Bear in mind, I was given a week and a half to put this together!

The extract_wiki_infobox function does its best to coerce all fields into a standard form. I do this to minimize the cleaning required once I get the movie database dictionary into a pandas DataFrame.


In [44]:
movie_db = unpickle_object("movie_database.pkl")

In [45]:
len(sorted(movie_db.keys()))


Out[45]:
900
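The extractor below only processes entries of length 6, i.e. those whose last element is a wikipedia page object. A quick, hypothetical sanity check of how many movies qualify:

In [ ]:
# Hypothetical check: count the entries that carry a wikipedia page object,
# since extract_wiki_infobox only processes lists of length 6.
sum(1 for key in movie_db if len(movie_db[key]) == 6)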

In [47]:
def extract_wiki_infobox():
    """Parse the infobox of each movie's Wikipedia page and append the cleaned
    date, runtime, box office, budget, country and language fields to that
    movie's entry in movie_db."""

    regex = r" *\[[^\]]*.*"        # citation brackets like "[1]" and everything after
    regex2 = r" *\([^\)].*"        # parenthetical notes and everything after
    regex3 = r" *\/[^\)]*.*"       # slash-separated alternatives and everything after
    regex4 = r" *\,[^\)].*"        # comma-separated extras and everything after
    regex5 = r".*(?=\$)"           # everything before a dollar sign
    regex6 = r".*(?=\£)"           # everything before a pound sign
    regex7 = r"\–.*$"              # en dash and everything after (truncates ranges)
    regex_date = r"^[^\(]*"        # everything before the first opening parenthesis
    regex_date_2 = r" *\)[^\)].*"  # closing parenthesis and trailing text
    subset = ''                    # re.sub replacement string: delete the match

    for key in sorted(movie_db.keys()):
        if len(movie_db[key]) == 6:  # only entries that carry a wikipedia page object
            soup = BeautifulSoup(movie_db[key][5].html(), 'lxml')
            wikipedia_api_info = soup.find("table", {"class": "infobox vevent"})

            # map each infobox row header (th) to its value cell (td)
            info_box_dictionary = {}
            for tr in wikipedia_api_info.find_all('tr'):
                if tr.find('th'):
                    info_box_dictionary[tr.find('th').text] = tr.find('td')

            try:  # Release date: keep the ISO-style date inside the first parentheses
                date = info_box_dictionary['Release date'].text
                date = re.sub(regex_date, subset, date)
                try:
                    date = date.split()[0].strip("(").strip(")")
                    date = re.sub(regex_date_2,subset, date)
                except IndexError:
                    date = info_box_dictionary['Release date'].text
                    date = re.sub(regex_date, subset, date)
            except KeyError:
                date = np.nan

            try:  # Running time: strip citation brackets and parenthetical notes
                runtime = info_box_dictionary['Running time'].text
                runtime = re.sub(regex, subset, runtime)
                runtime = re.sub(regex2, subset, runtime)
            except KeyError:
                runtime = np.nan

            try:  # Box office: strip brackets and anything before the currency symbol
                boxoffice = info_box_dictionary['Box office'].text
                boxoffice = re.sub(regex, subset, boxoffice)
                boxoffice = re.sub(regex6, subset, boxoffice)
                boxoffice = re.sub(regex5, subset, boxoffice)
                if "billion" not in boxoffice:
                    boxoffice = re.sub(regex7, subset, boxoffice)
                    boxoffice = re.sub(regex2, subset, boxoffice)
            except KeyError:
                boxoffice = np.nan

            try:  # Budget: strip brackets, ranges, and anything before the currency symbol
                budget = info_box_dictionary['Budget'].text
                budget = re.sub(regex, subset, budget)
                budget = re.sub(regex7, subset, budget)
                if "$" in budget:
                    budget = re.sub(regex5, subset, budget)
                    budget = re.sub(regex2, subset, budget)
                if "£" in budget:
                    budget = re.sub(regex6, subset, budget)
                    budget = re.sub(regex2, subset, budget)
                budget = re.sub(regex5, subset, budget)
            except KeyError:
                budget = np.nan

            try:  # Country: keep only the first country listed
                country = info_box_dictionary['Country'].text.strip().lower()
                country = re.sub(regex, subset, country) #cleans out a lot of gunk
                country = re.sub(regex2, subset, country)
                country = re.sub(regex3, subset, country)
                country = re.sub(regex4, subset, country)
                country = country.split()
                if country[0] == "united" and country[1] == "states":
                    country = country[0]+" "+country[1]
                elif country[0] =="united" and country[1] == "kingdom":
                    country = country[0] +" "+ country[1]
                else:
                    country = country[0]
            except KeyError:
                country = np.nan

            try:  # Language: keep only the first language listed
                language = info_box_dictionary['Language'].text.strip().split()[0]
                language = re.sub(regex, subset, language)
            except KeyError:
                language = np.nan

            # append the six cleaned fields in a fixed order
            movie_db[key].append(date)
            movie_db[key].append(runtime)
            movie_db[key].append(boxoffice)
            movie_db[key].append(budget)
            movie_db[key].append(country)
            movie_db[key].append(language)
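
To make the cleaning logic a little more concrete, here is a hypothetical illustration of what the two workhorse patterns do to typical infobox strings (the sample strings below are made up, not pulled from the database):

In [ ]:
# Hypothetical demo of the two most-used patterns above.
sample_runtime = "107 minutes[1]"
sample_budget = "$55 million (estimated)[4]"

print(re.sub(r" *\[[^\]]*.*", "", sample_runtime))  # "107 minutes"
print(re.sub(r" *\([^\)].*", "",
             re.sub(r" *\[[^\]]*.*", "", sample_budget)))  # "$55 million"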

In [48]:
extract_wiki_infobox()
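
With the six infobox fields appended, the dictionary is ready to be flattened into a pandas DataFrame. A minimal sketch of that conversion, assuming each fully processed entry now holds its original six elements plus the six scraped fields in the order extract_wiki_infobox appends them (the column names are my own placeholders):

In [ ]:
# Hypothetical sketch: flatten movie_db into a DataFrame using the six
# fields appended by extract_wiki_infobox (date, runtime, boxoffice,
# budget, country, language). Column names are placeholders.
rows = []
for title in sorted(movie_db.keys()):
    values = movie_db[title]
    if len(values) == 12:  # original 6 elements + 6 scraped fields
        rows.append([title] + values[-6:])

infobox_df = pd.DataFrame(
    rows,
    columns=["title", "date", "runtime", "boxoffice",
             "budget", "country", "language"],
)
infobox_df.head()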
