Scraping Tutorial https://www.youtube.com/watch?v=XjNm9bazxn8&index=5&list=WL



In [1]:

    
import requests
from bs4 import BeautifulSoup



In [ ]:

    
def the_numbers_spider(max_pages):
    """Print the ``<td>`` cells found under the box-office chart container.

    Basic BS4 test to find our table data. Returns too much stuff.

    Args:
        max_pages: Number of times the chart page is fetched and parsed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page holds fewer than two elements matching
            ``<div id="page_filling_chart">``.
    """
    # NOTE(review): the URL does not depend on the page counter, so every
    # iteration downloads the same weekly chart.
    url = "http://www.the-numbers.com/box-office-chart/weekly/1991/12/27"

    for _ in range(max_pages):
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        for table_data in soup.find_all('div', {'id': 'page_filling_chart'})[1]:
            # Iterating a Tag yields its child nodes, while BeautifulSoup()
            # expects markup text, so serialise each child before re-parsing.
            soup2 = BeautifulSoup(str(table_data), 'lxml')
            row_data = soup2.find_all('td')
            print(type(row_data))
            print(row_data)



In [ ]:

    
def the_numbers_spider(max_pages):
    """Print the grandparent element of the first ``<td class="data">`` cell.

    Parent Test. This returns the table we want, but still has extra header
    info we don't want.

    Args:
        max_pages: Number of times the chart page is fetched and printed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        AttributeError: If the page contains no ``<td class="data">`` cell
            (``soup.find`` returns ``None`` in that case).
    """
    # NOTE(review): the URL does not depend on the page counter, so every
    # iteration downloads the same weekly chart.
    url = "http://www.the-numbers.com/box-office-chart/weekly/1991/12/27"

    for _ in range(max_pages):
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # Two levels up from the first data cell.
        parent_el = soup.find('td', {'class': 'data'}).parent.parent
        print(parent_el)



In [22]:

    
def the_numbers_spider(max_pages):
    """Save the text of the chart rows to ``numbers_test.txt``.

    BS4 locates the parent of the first ``<td class="data">`` cell, converts
    every node that follows that parent into a string, then builds a new soup
    from the joined strings so that its ``text`` can be written out.

    Args:
        max_pages: Number of times the chart page is fetched. Only the text
            from the last fetch is written. When this is less than 1 nothing
            is fetched and the output file is left untouched.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        AttributeError: If the page contains no ``<td class="data">`` cell
            (``soup.find`` returns ``None`` in that case).
    """
    # NOTE(review): the URL does not depend on the page counter, so every
    # iteration downloads the same weekly chart.
    url = "http://www.the-numbers.com/box-office-chart/weekly/1991/12/27"

    table_text = None
    for _ in range(max_pages):
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        first_row = soup.find('td', {'class': 'data'}).parent

        # The siblings are Tag/NavigableString nodes; serialise them so they
        # can be re-parsed as one standalone document.
        markup = ''.join(str(sibling) for sibling in first_row.next_siblings)
        table_text = BeautifulSoup(markup, 'lxml').text

    if table_text is None:
        # No page was scraped; do not truncate an existing output file.
        return

    with open('numbers_test.txt', mode='wt', encoding='utf-8') as out_file:
        out_file.write(table_text)

    print("done!")



In [21]:

    
# Run the spider once, using whichever the_numbers_spider definition was executed last.
the_numbers_spider(1)



In [ ]:

    
def get_single_item_data(item_url):
    """Print the first child of the first link in each ``cnt_cats`` div.

    Args:
        item_url: Address of the page to download and inspect.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # A timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(item_url, timeout=30)
    response.raise_for_status()
    # Name the parser explicitly, matching the other cells, so the result
    # does not depend on which parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'lxml')

    for category_block in soup.find_all('div', {'id': 'cnt_cats'}):
        anchor = category_block.find('a')
        # Skip blocks without a link, or with an empty link, instead of
        # crashing on None / an empty contents list.
        if anchor is None or not anchor.contents:
            continue
        print(anchor.contents[0])



In [ ]:

    
# NOTE(review): trade_spider is not defined in any cell visible here; this looks
# like a leftover name and may have been meant to be the_numbers_spider — confirm.
trade_spider(2)



In [ ]: