Scraping Tutorial https://www.youtube.com/watch?v=XjNm9bazxn8&index=5&list=WL
In [1]:
import requests
from bs4 import BeautifulSoup
In [ ]:
def the_numbers_spider(max_pages):
    """Print the <td> cells found under the second 'page_filling_chart' div.

    Basic BS4 test to find our table data. Returns too much stuff.

    Args:
        max_pages: Number of fetch-and-print passes to run. The URL is
            fixed, so every pass downloads the same weekly chart page.
    """
    url = "http://www.the-numbers.com/box-office-chart/weekly/1991/12/27"
    page = 1
    while page <= max_pages:
        # A timeout keeps an unresponsive server from hanging the notebook.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')
        # [1] picks the second div with that id; this raises IndexError when
        # the page contains fewer than two such divs.
        chart_div = soup.find_all('div', {'id': 'page_filling_chart'})[1]
        for child in chart_div:
            # Iterating a tag yields parsed child nodes, not markup text, so
            # serialise each child before building a new soup from it.
            child_soup = BeautifulSoup(str(child), 'lxml')
            cells = child_soup.find_all('td')
            print(type(cells))
            print(cells)
        page += 1
In [ ]:
def the_numbers_spider(max_pages):
    """Print the grandparent element of the first <td class="data"> cell.

    Parent test. This returns the table we want, but still has extra header
    info we don't want.

    Args:
        max_pages: Number of fetch-and-print passes to run. The URL is
            fixed, so every pass downloads the same weekly chart page.
    """
    url = "http://www.the-numbers.com/box-office-chart/weekly/1991/12/27"
    page = 1
    while page <= max_pages:
        # A timeout keeps an unresponsive server from hanging the notebook.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')
        first_cell = soup.find('td', {'class': 'data'})
        if first_cell is None:
            # find() returns None when nothing matches; report that instead
            # of failing on `.parent` with an opaque AttributeError.
            print("No <td class='data'> cell found at " + url)
        else:
            # Two levels up from the cell -- presumably the enclosing table
            # (cell -> row -> table); confirm against the live markup.
            print(first_cell.parent.parent)
        page += 1
In [22]:
def the_numbers_spider(max_pages):
    """Write the text of everything after the first data row to a file.

    BS4 finds the parent of the first <td class="data"> cell, converts each
    of that parent's following siblings to a string, then builds a new soup
    from the joined string so that its ``text`` can be read. The text is
    written to 'numbers_test.txt' (UTF-8), replacing any previous content.

    Args:
        max_pages: Number of fetch-and-parse passes to run. The URL is
            fixed, so every pass downloads the same weekly chart page and
            only the result of the last successful pass is written.
    """
    url = "http://www.the-numbers.com/box-office-chart/weekly/1991/12/27"
    soup2 = None  # stays None if no pass produces any parsed rows
    page = 1
    while page <= max_pages:
        # A timeout keeps an unresponsive server from hanging the notebook.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')
        first_cell = soup.find('td', {'class': 'data'})
        if first_cell is not None:
            parent_el = first_cell.parent
            # Serialise the siblings that follow the parent, then re-parse
            # the combined markup to get one object exposing `.text`.
            sibling_markup = ''.join(
                str(sibling) for sibling in parent_el.next_siblings
            )
            soup2 = BeautifulSoup(sibling_markup, 'lxml')
        page += 1

    if soup2 is None:
        # Either max_pages < 1 or the expected cell was never found; there
        # is nothing to write, so leave any existing file untouched.
        print("no table data found; nothing written")
        return

    with open('numbers_test.txt', mode='wt', encoding='utf-8') as out_file:
        out_file.write(soup2.text)
    print("done!")
In [21]:
# Run a single pass of the spider.
the_numbers_spider(max_pages=1)
In [ ]:
def get_single_item_data(item_url):
    """Print the first child of the first link inside each 'cnt_cats' div.

    Divs that contain no <a> element, or whose first <a> is empty, are
    skipped.

    Args:
        item_url: URL of the page to download and scan.
    """
    # A timeout keeps an unresponsive server from hanging the notebook.
    response = requests.get(item_url, timeout=30)
    # Name the parser explicitly (the same one the spider cells use) so the
    # result does not depend on which parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'lxml')
    for category_div in soup.find_all('div', {'id': 'cnt_cats'}):
        anchor = category_div.find('a')
        # find() returns None when the div has no link, and an empty link
        # has no contents[0]; skip both rather than raising.
        if anchor is None or not anchor.contents:
            continue
        print(anchor.contents[0])
In [ ]:
# Run two passes of the spider.
the_numbers_spider(2)
In [ ]: