A simple news robot that gets a movie's title, director, rating, actors, etc. from the Internet Movie Database (IMDb) and automatically writes a very short review.
In [ ]:
# Install the required libraries. If the pip command fails, try pip3 instead.
!pip install BeautifulSoup4
!pip install lxml
In [ ]:
# Import libraries.
from urllib import request
from lxml import html
from bs4 import BeautifulSoup
import ssl
# Function to scrape HTML from web page.
def scrapewebpage(url):
    """Download a web page and return its raw content.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as bytes when the status code is 200. For any
        other status an error message is printed and None is returned.
    """
    UseSSL = True # If you get SSLError, change this to False.
    # With UseSSL off, certificate verification is skipped via an unverified
    # SSL context; context=None keeps urlopen's default behaviour.
    context = None if UseSSL else ssl._create_unverified_context()
    # The with-statement closes the response even if reading fails.
    with request.urlopen(url, context=context) as web:
        status = web.getcode()
        if status == 200:
            return web.read()
    # Both values must be passed as one tuple; formatting with a single
    # argument raises TypeError for a two-placeholder format string.
    print("Error %s reading %s" % (status, url))
    return None
# Helper function that scrape web page and return BeautifulSoup object.
def makesoup(url):
    """Fetch *url* with scrapewebpage and parse the result with BeautifulSoup.

    Returns the BeautifulSoup object built with the "lxml-xml" parser.
    """
    markup = scrapewebpage(url)
    soup = BeautifulSoup(markup, "lxml-xml")
    return soup
In [ ]:
# Scrape the IMDb page of the movie Interstellar (2014) and keep the parsed
# result in `soup`; the cells below read every field from it.
soup = makesoup('http://www.imdb.com/title/tt0816692/')
In [ ]:
# Movie ID, taken from the "content" attribute of the tag with property="pageId".
movieid = soup.find(property="pageId").get("content")

# Movie URL, rebuilt from the ID.
movielink = "".join(('http://www.imdb.com/title/', movieid, '/'))

# Title, with surrounding white space removed.
title = soup.find(itemprop="name").get_text().strip()

# Year: characters 1-4 of the text drop the parentheses, so "(2014)" becomes
# "2014", which int() then turns into a number.
year = int(soup.find(id="titleYear").get_text()[1:5])

# Title without the "(year)" part.
titleclean = title.replace("(" + str(year) + ")", "").strip()

# Length of the movie, with surrounding white space removed.
duration = soup.find(itemprop="duration").get_text().strip()

# Director: the name tag nested inside the director tag.
director = soup.find(itemprop="director").find(itemprop="name").get_text().strip()

# Rating as a decimal number, like 6.8.
rating = float(soup.find(itemprop="ratingValue").get_text())

# Names of the main actors, collected from every table with class "cast_list".
actors = [
    person.get_text().strip()
    for table in soup.find_all("table", "cast_list")
    for person in table.find_all(itemprop="name")
]
In [ ]:
# Show every scraped field on its own line, then the cast as a bulleted list.
print(
    "Title: " + titleclean,
    "Year: " + str(year),
    "Director: " + director,
    "Duration: " + duration,
    "Rating: " + str(rating),
    "Actors:",
    sep="\n",
)
for actor in actors:
    print("- " + actor)
In [ ]:
# Choose the headline by rating band: the better the score, the more
# spectacular the wording.
if rating == 10:
    newstitle = "The master piece {0} ({1}) by {2}".format(titleclean, year, director)
elif 7 < rating < 10:
    newstitle = "Impressive {0} ({1}) by {2}".format(titleclean, year, director)
elif 4 <= rating <= 7:
    newstitle = "{0} ({1}) by {2}".format(titleclean, year, director)
elif rating < 4:
    newstitle = "{0} by {2} is the worst movie of {1}".format(titleclean, year, director)
In [ ]:
# Bare expression: presumably here so the notebook displays the headline.
newstitle
In [ ]:
# How many years lie between the current year and the movie's year.
import datetime
now = datetime.datetime.today()
yearsago = now.year - year
In [ ]:
# Bare expression: presumably here so the notebook displays the computed value.
yearsago
In [ ]:
# Body text of the news article.
# Opening sentence with title, director, age and rating.
newsintro = "{0} by {1} was produced {2} years ago and has gotten the rating {3} on the movie site IMDb.".format(
    titleclean, director, yearsago, rating)

# Cast sentence: built from however many actors were scraped, so a page with
# fewer than two cast entries no longer raises IndexError.
if len(actors) >= 2:
    newscast = "{0} and {1} are in the leading cast.".format(actors[0], actors[1])
elif actors:
    newscast = "{0} is in the leading cast.".format(actors[0])
else:
    newscast = ""

# One line per part, skipping the cast sentence when there is no cast, and
# ending with a newline as before.
newsbody = "\n".join(part for part in (newsintro, newscast, movielink) if part) + "\n"
In [ ]:
# Print the upper-cased headline, a blank line, then the article body.
print(newstitle.upper(), "", newsbody, sep="\n")