Scraping Tutorial https://www.youtube.com/watch?v=XjNm9bazxn8&index=5&list=WL
In [88]:
import requests
from bs4 import BeautifulSoup
import json
import datetime
In [213]:
def the_numbers_spider(max_pages):
    # Scrapes the weekly box-office chart from the-numbers.com, one page per week,
    # and dumps each week's table into its own JSON file.
    page = 1
    # Starting date. This will eventually be changed to the first weekly report, back in 1977.
    date = datetime.date(1991, 12, 27)
    # A datetime.date object is not sliceable the way a string is, so keep a string
    # copy (ISO format, 'YYYY-MM-DD') for building the URL.
    # (date.strftime('%Y/%m/%d') would build the same path; here we slice instead.)
    dateString = str(date)
    while page <= max_pages:
        url = "http://www.the-numbers.com/box-office-chart/weekly/" + dateString[0:4] + "/" + dateString[5:7] + "/" + dateString[8:10]
        # print(url)
        # print(dateString)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'lxml')
        # Store all the table rows of the second table as a BS4 result set.
        tableRows = soup.find_all('table')[1].find_all('tr')
        # Reset the dict each week so earlier weeks don't leak into later files.
        weeklyData = {}
        # Save the data into the dictionary, skipping the header row.
        for row in tableRows[1:]:
            title = row.a.get_text()
            weeklyData[title] = {}
            weeklyData[title]['weekEnded'] = dateString
            weeklyData[title]['currentRank'] = row.td.get_text()
            weeklyData[title]['prevRank'] = row.select('td')[1].get_text(strip=True)[1:-1]  # Strip the surrounding parens.
            weeklyData[title]['distributor'] = row.select('a')[1].get_text(strip=True)
            # Gross revenue: strip lead/trailing whitespace, drop the leading dollar sign,
            # remove the commas, and convert to int. Whew.
            weeklyData[title]['gross'] = int(row.select('td')[4].get_text(strip=True)[1:].replace(",", ""))
            weeklyData[title]['change'] = row.select('td')[5].get_text(strip=True)
            weeklyData[title]['theaters'] = int(row.select('td')[6].get_text(strip=True).replace(",", ""))
            weeklyData[title]['perThtr'] = int(row.select('td')[7].get_text(strip=True)[1:].replace(",", ""))
            weeklyData[title]['totalGross'] = int(row.select('td')[8].get_text(strip=True)[1:].replace(",", ""))
            weeklyData[title]['grossDays'] = int(row.select('td')[9].get_text(strip=True).replace(",", ""))
        # # Debug print.
        # print(weeklyData)
        # Dump the dict into a JSON file, one file per week.
        # Will think about writing it all to one giant file (see the sketch after this cell).
        with open('the-numbers_weekly ' + dateString + ".json", mode='wt', encoding='utf-8') as file:
            json.dump(weeklyData, file)
        # Advance one week and move on to the next page.
        date += datetime.timedelta(days=7)
        dateString = str(date)
        page += 1
    print("done!")
In [214]:
the_numbers_spider(3)
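With max_pages=3 and the start date above, this writes three files, for the weeks ending 1991-12-27, 1992-01-03, and 1992-01-10. A quick sketch to verify that one file round-trips (the filename matches what the function writes for the first week):
In [ ]:
# Load the first week's file back and count the films it captured.
with open('the-numbers_weekly 1991-12-27.json', encoding='utf-8') as f:
    week = json.load(f)
print(len(week), "films scraped for the week ending 1991-12-27")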