The latest Mountain Goats album is called Goths. (It's good!) I made a simple HTML table with the track listing -- let's scrape it into a CSV.
In [ ]:
from bs4 import BeautifulSoup
import csv
In [ ]:
# Open the HTML file inside a `with` block so it gets closed automatically
with open('mountain-goats.html', 'r') as html_file:
    # .read() hands back the entire contents of the file as a single string
    html_code = html_file.read()

# The string is still available after the file is closed --
# print it to see what's there
print(html_code)
In [ ]:
# Only the read needs the open file; everything else works on the string
with open('mountain-goats.html', 'r') as html_file:
    html_code = html_file.read()

# What kind of object is `html_code`? Ask type()
print(type(html_code))

# Hand the string of HTML to BeautifulSoup, naming the parser explicitly
# (it will complain if you leave the parser out)
soup = BeautifulSoup(html_code, 'html.parser')

# ...and what kind of object is `soup`?
print(type(soup))
In [ ]:
with open('mountain-goats.html', 'r') as html_file:
    html_code = html_file.read()

soup = BeautifulSoup(html_code, 'html.parser')

# There are several ways to target the table we want.
#
# By position on the page: find_all() returns a list of matching
# elements, and we want the second one ([1])
# song_table = soup.find_all('table')[1]
#
# By class name: find() accepts a dictionary of element attributes
# to match on
# song_table = soup.find('table', {'class': 'song-table'})
#
# By ID
# song_table = soup.find('table', {'id': 'my-cool-table'})
#
# By style -- the lookup this cell actually runs
song_table = soup.find('table', attrs={'style': 'width: 95%;'})

print(song_table)
Let's print a list of track numbers and song titles. Look at the structure of the table -- a table has rows represented by the `tr` tag, and within each row there are cells represented by `td` tags. The `find_all()` method returns a list. And we know how to iterate over lists: with a `for` loop. Let's do that.
In [ ]:
with open('mountain-goats.html', 'r') as html_file:
    html_code = html_file.read()

soup = BeautifulSoup(html_code, 'html.parser')
song_table = soup.find('table', attrs={'style': 'width: 95%;'})

# Collect every row (<tr>) in the table, then slice off the first one
# to skip the header row
all_rows = song_table.find_all('tr')
song_rows = all_rows[1:]

# Walk through the remaining rows one at a time
for row in song_rows:
    # The row's table cells (<td>) unpack straight into five variables
    track, title, duration, artist, album = row.find_all('td')

    # The .string attribute gives the text in a cell
    print(track.string, title.string)
In [ ]:
# Scrape the song table out of the HTML file and write one CSV row per track.
#
# The CSV file is opened with newline='' because the csv module does its own
# line-ending handling; without it, platforms that translate '\n' on write
# (e.g. Windows) end up with a blank line after every row.
with open('mountain-goats.html', 'r') as html_file, \
        open('mountain-goats.csv', 'w', newline='') as outfile:

    html_code = html_file.read()
    soup = BeautifulSoup(html_code, 'html.parser')
    song_table = soup.find('table', {'style': 'width: 95%;'})

    # slice to skip the header row
    song_rows = song_table.find_all('tr')[1:]

    # set up a writer object; the fieldnames become the CSV header row
    writer = csv.DictWriter(
        outfile,
        fieldnames=['track', 'title', 'duration', 'artist', 'album'],
    )
    writer.writeheader()

    for row in song_rows:
        # get the table cells in the row
        song = row.find_all('td')

        # assign them to variables (this expects exactly five cells per row)
        track, title, duration, artist, album = song

        # write out the dictionary to file, keyed by the fieldnames above
        writer.writerow({
            'track': track.string,
            'title': title.string,
            'duration': duration.string,
            'artist': artist.string,
            'album': album.string,
        })