Scraping Tutorial https://www.youtube.com/watch?v=XjNm9bazxn8&index=5&list=WL
In [88]:
import requests
from bs4 import BeautifulSoup
import json
import datetime
In [ ]:
def the_numbers_spider(max_pages):
    # Basic BS4 test to find our table data. Returns too much stuff.
    page = 1
    while page <= max_pages:
        url = "http://www.the-numbers.com/box-office-chart/weekly/1991/12/27"
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'lxml')
        for tableData in soup.find_all('div', {'id': 'page_filling_chart'})[1]:
            # Re-soup the stringified child so find_all can be run on it.
            soup2 = BeautifulSoup(str(tableData), 'lxml')
            rowData = soup2.find_all('td')
            print(type(rowData))
            print(rowData)
        page += 1
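A quick aside on the re-souping step above: bs4 Tag objects already support find_all themselves, so the stringify-and-reparse round trip isn't strictly needed. A minimal sketch of the direct approach, assuming soup has been built from the same chart page as in the function above:
In [ ]:
# Sketch (assumes soup from the cell above): a Tag has find_all itself,
# so the chart cells can be read without building a second soup.
chartDiv = soup.find_all('div', {'id': 'page_filling_chart'})[1]
for cell in chartDiv.find_all('td'):
    print(cell.get_text(strip=True))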
In [ ]:
def the_numbers_spider(max_pages):
    # Parent Test. This returns the table we want, but still has extra header info we don't want.
    page = 1
    while page <= max_pages:
        url = "http://www.the-numbers.com/box-office-chart/weekly/1991/12/27"
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'lxml')
        # Walk two levels up from the first data cell to reach the enclosing table.
        parentEl = soup.find('td', {'class': 'data'}).parent.parent
        print(parentEl)
        page += 1
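The .parent.parent chain is brittle if the page's nesting ever changes. One hedged alternative is bs4's find_parent, which climbs the tree until it reaches a matching ancestor. A minimal sketch under the same page-layout assumptions:
In [ ]:
# Sketch: climb from the first data cell straight to its enclosing <table>,
# however many levels up that happens to be.
table = soup.find('td', {'class': 'data'}).find_parent('table')
print(table)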
In [186]:
def the_numbers_spider(max_pages):
    # Third pass: grab the rows of the second table directly and parse each
    # row's cells into a dict. No re-souping needed.
    page = 1
    # Dict to hold weekly data
    weeklyData = {}
    date = datetime.date(1991, 12, 27)
    dateString = str(date)  # 'YYYY-MM-DD', sliceable for the URL below
    weekEnded = dateString
    while page <= max_pages:
        # NOTE: date never advances here, so every pass refetches the same
        # week. Fixed in the next cell.
        url = "http://www.the-numbers.com/box-office-chart/weekly/" + dateString[0:4] + "/" + dateString[5:7] + "/" + dateString[8:10]
        print(url)
        print(dateString)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'lxml')
        tableRows = soup.find_all('table')[1].find_all('tr')
        for row in tableRows[1:]:
            title = row.a.get_text()
            weeklyData[title] = {}
            weeklyData[title]['weekEnded'] = weekEnded
            weeklyData[title]['currentRank'] = row.td.get_text()
            weeklyData[title]['prevRank'] = row.select('td')[1].get_text(strip=True)[1:-1]  # Ignores the parens
            weeklyData[title]['distributor'] = row.select('a')[1].get_text(strip=True)
            weeklyData[title]['gross'] = int(row.select('td')[4].get_text(strip=True)[1:].replace(",", ""))  # Strips whitespace, drops the leading dollar sign, removes the commas, and converts to int. Whew.
            weeklyData[title]['change'] = row.select('td')[5].get_text(strip=True)
            weeklyData[title]['theaters'] = int(row.select('td')[6].get_text(strip=True).replace(",", ""))
            weeklyData[title]['perThtr'] = int(row.select('td')[7].get_text(strip=True)[1:].replace(",", ""))
            weeklyData[title]['totalGross'] = int(row.select('td')[8].get_text(strip=True)[1:].replace(",", ""))
            weeklyData[title]['grossDays'] = int(row.select('td')[9].get_text(strip=True).replace(",", ""))
        print(weeklyData)
        page += 1
    with open('numbers_test.json', mode='wt', encoding='utf-8') as file:
        json.dump(weeklyData, file)
    print("done!")
In [213]:
def the_numbers_spider(max_pages):
    # Final pass: walk the charts week by week, parse each row into a dict,
    # and dump each week to its own json file.
    page = 1
    # Declare starting date. This will be changed to the first weekly report back in 1977.
    date = datetime.date(1991, 12, 27)  # A datetime object is not sliceable like a string is.
    dateString = str(date)
    while page <= max_pages:
        url = "http://www.the-numbers.com/box-office-chart/weekly/" + dateString[0:4] + "/" + dateString[5:7] + "/" + dateString[8:10]
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'lxml')
        # Store all the table rows in the second table as a BS4 object.
        tableRows = soup.find_all('table')[1].find_all('tr')
        # Reset the dict each week so each file holds only that week's chart.
        weeklyData = {}
        # Save the data into the dictionary.
        # TODO: clean up more of the data types.
        for row in tableRows[1:]:
            title = row.a.get_text()
            weeklyData[title] = {}
            weeklyData[title]['weekEnded'] = dateString
            weeklyData[title]['currentRank'] = row.td.get_text()
            weeklyData[title]['prevRank'] = row.select('td')[1].get_text(strip=True)[1:-1]  # Ignores the parens
            weeklyData[title]['distributor'] = row.select('a')[1].get_text(strip=True)
            weeklyData[title]['gross'] = int(row.select('td')[4].get_text(strip=True)[1:].replace(",", ""))  # Strips whitespace, drops the leading dollar sign, removes the commas, and converts to int. Whew.
            weeklyData[title]['change'] = row.select('td')[5].get_text(strip=True)
            weeklyData[title]['theaters'] = int(row.select('td')[6].get_text(strip=True).replace(",", ""))
            weeklyData[title]['perThtr'] = int(row.select('td')[7].get_text(strip=True)[1:].replace(",", ""))
            weeklyData[title]['totalGross'] = int(row.select('td')[8].get_text(strip=True)[1:].replace(",", ""))
            weeklyData[title]['grossDays'] = int(row.select('td')[9].get_text(strip=True).replace(",", ""))
        # Dump the dict into a json file.
        # Will think about writing it all to one giant file.
        with open('the-numbers_weekly ' + dateString + '.json', mode='wt', encoding='utf-8') as file:
            json.dump(weeklyData, file)
        # Increment operations
        date += datetime.timedelta(days=7)
        dateString = str(date)
        page += 1
    print("done!")
In [214]:
the_numbers_spider(3)
In [ ]:
def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'lxml')
    for item_name in soup.find_all('div', {'id': 'name'}):
        links = item_name.find('a').contents[0]
        print(links)
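get_single_item_data is never called in this notebook, and the 'name' div id is carried over from the video rather than verified against the-numbers.com. If per-movie pages were wanted, wiring it up might look like the sketch below; the site-root prefix for relative hrefs is an assumption:
In [ ]:
# Hypothetical usage (assumes tableRows parsed as in the spider above, and
# that the chart's movie links are relative to the site root).
for row in tableRows[1:]:
    href = "http://www.the-numbers.com" + row.a.get('href')
    get_single_item_data(href)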