In [4]:
!pip3 install bs4
In [5]:
from bs4 import BeautifulSoup
In [6]:
from urllib.request import urlopen

# Download the demo page as raw bytes. The response is used as a context
# manager so the connection is closed as soon as the body has been read.
with urlopen("http://static.decontextualize.com/kittens.html") as response:
    html_str = response.read()
In [7]:
# html_str holds bytes (the result of response.read()); decode it so the markup
# prints as readable text instead of a b'...' literal with escaped newlines.
print(html_str.decode("utf-8", errors="replace"))
In [8]:
document = BeautifulSoup(html_str,"html.parser")
In [9]:
type(document)
Out[9]:
In [10]:
h1_tag = document.find('h1')
In [11]:
h1_tag.string
Out[11]:
In [12]:
img_tag = document.find('img')
In [13]:
img_tag.string
In [14]:
# NOTE(review): per the Beautiful Soup documentation, calling a tag is
# shorthand for find_all(), so this searches for child <src> tags rather than
# reading the attribute. Attribute lookup uses the subscript form,
# img_tag['src'].
img_tag('src')
Out[14]:
In [15]:
img_tag['src']
Out[15]:
In [16]:
document.find_all('img')
Out[16]:
In [17]:
img_tags=document.find_all('img')
In [18]:
type(img_tags)
Out[18]:
In [19]:
first_img = img_tags[0]
In [20]:
first_img['src']
Out[20]:
In [21]:
second_img = img_tags[1]
In [22]:
second_img['src']
Out[22]:
In [23]:
# Print the src attribute of every <img> tag in img_tags.
for item in img_tags:
    print(item['src'])
In [24]:
# Print the text of every <h2> heading in the document.
h2_tags = document.find_all('h2')
for item in h2_tags:
    print(item.string)
In [25]:
# Print the text of every <span class="lastcheckup"> element.
checkups = document.find_all('span', {'class': 'lastcheckup'})
for item in checkups:
    print(item.string)
In [26]:
# Search inside each <div class="kitten"> instead of the whole document, so
# every heading stays paired with the <span> from the same kitten.
kittens = document.find_all('div', {'class': 'kitten'})
for item in kittens:
    h2_tag = item.find('h2')
    print(h2_tag.string)
    checkup = item.find('span')
    print(checkup.string)
In [27]:
kittens = document.find_all('div', {'class': 'kitten'})
In [28]:
# Take the first element of kittens and print the text of its first <h2>.
first_kitten = kittens[0]
first_kitten_h2 = first_kitten.find('h2')
print(first_kitten_h2.string)
In [29]:
planets = ["Mercury", "Venus", "Earth", "Mars", "Jupiter", "Saturn", "Uranus", "Neptune"]
In [30]:
separator = ","
In [31]:
separator.join(planets)
Out[31]:
In [32]:
print("&\n".join(planets))
In [33]:
print("&\n".join(planets[:4]))
In [34]:
# For each kitten, print its name, then the name again followed by a
# comma-separated list of the link texts found inside its <div>.
kittens = document.find_all('div', {'class': 'kitten'})
for item in kittens:
    h2_tag = item.find('h2')
    print(h2_tag.string)
    a_tags = item.find_all('a')  # <a> = anchor (link) tags
    # Collect the text of every link into a new list
    all_shows_str = [a_tag_item.string for a_tag_item in a_tags]
    string_with_all_show_names = ",".join(all_shows_str)
    print(h2_tag.string + ":", string_with_all_show_names)
In [35]:
# Build a list with one dictionary per kitten: {"name": ..., "tvshows": [...]}.
kittens_data = []
kittens = document.find_all('div', {'class': 'kitten'})
for item in kittens:
    h2_tag = item.find('h2')
    print(h2_tag.string)
    a_tags = item.find_all('a')  # <a> = anchor (link) tags
    all_shows_str = [a_tag_item.string for a_tag_item in a_tags]
    # 1. create a dictionary holding the relevant key/value pairs
    kitten_map = {"name": h2_tag.string, "tvshows": all_shows_str}
    # 2. append that dictionary to kittens_data (this step was missing, which
    #    left the list empty)
    kittens_data.append(kitten_map)
kittens_data
Out[35]:
In [36]:
# Build one dictionary per kitten with its name, its list of show names and
# the text of its last-checkup <span>.
kittens_data = []
kittens = document.find_all('div', {'class': 'kitten'})
for item in kittens:
    h2_tag = item.find('h2')
    print(h2_tag.string)
    a_tags = item.find_all('a')  # <a> = anchor (link) tags
    all_shows_str = [a_tag_item.string for a_tag_item in a_tags]
    checkup = item.find('span')  # the text is read with checkup.string
    kittens_data.append({
        "name": h2_tag.string,
        "tvshows": all_shows_str,
        "last_checkup": checkup.string,
    })
kittens_data
Out[36]:
In [37]:
# Our next goal is to create a data structure that looks like this:
# [
#     {'name': 'Fluffy',
#      'tvshows': ['Deep Space Nine', 'Mr. Belvedere']},
#     {...},
# ]
In [38]:
x = ["a", "b", "c", "d"]
In [39]:
x[0]
Out[39]:
In [40]:
x.append("e")
In [41]:
len(x)
Out[41]:
In [42]:
x[4]
Out[42]:
In [43]:
numbers = [1,2,3,4,5,6]
# end up with: [1,4,9,16,25,36]
In [44]:
# Square every number with a list comprehension. The comprehension already
# yields the target list; appending the squares again in a follow-up loop
# doubled it to twelve items.
squared = [item * item for item in numbers]
In [45]:
squared
Out[45]:
In [46]:
## Aside the Third: Making dictionaries
#declaring a dictionary
x = {'a':1, 'b':2, 'c':3}
In [47]:
#get a value out of a dictionary
x['a']
Out[47]:
In [48]:
x.keys()
Out[48]:
In [49]:
# Iterating over a dictionary yields its keys, so calling .keys() is optional.
for key in x:
    print(key)  # print out keys
In [50]:
# target: {1: 1, 2: 4, 3: 9, 4: 16, 5: 25, ...}
# Map every integer from 1 through 10 to its square.
squares = {n: n * n for n in range(1, 11)}
squares
Out[50]:
In [51]:
squares[7]
Out[51]:
In [52]:
names = ["Aaron", "Bob", "Caroline", "Daphne"]
# target: {"Aaron": 5, ...} -- each name mapped to its number of characters.
# Take a list and create a new dictionary from it.
name_length_map = {name: len(name) for name in names}
name_length_map  # evaluate the dictionary so the notebook displays it
Out[52]:
In [53]:
from urllib.request import urlopen

# Download the faculty listing as raw bytes; the context manager closes the
# connection once the body has been read.
with urlopen("http://www.journalism.columbia.edu/page/10/10?category_ids%5B%5D=2&category_ids%5B%5D=3&category_ids%5B%5D=37") as response:
    faculty_html = response.read()
In [54]:
document = BeautifulSoup(faculty_html, "html.parser")
In [55]:
document.find('h2').string
Out[55]:
In [56]:
h2_tag = document.find('h2')
h2_tag.string
Out[56]:
Very first task: print out the names of all the faculty members.
In [57]:
# Print "<name> / <title>" for every entry of the experts list that has an <h4>.
# NOTE(review): this cell was previously annotated "this doesn't work" with no
# detail. The None guards below cover one plausible failure (an entry missing
# its <a> or its description <p>) -- confirm against the live page.
ul_tag = document.find('ul', {'class': 'experts-list'})
li_tags = ul_tag.find_all('li')
for item in li_tags:
    h4_tag = item.find('h4')
    # None counts as false in Python: only proceed if an <h4> was actually
    # found under this <li>
    if h4_tag:
        a_tag = h4_tag.find('a')  # name of the faculty member
        p_tag = item.find('p', {'class': 'description'})  # their position
        name = a_tag.string if a_tag else None
        title = p_tag.string if p_tag else None
        print(name, "/", title)
Now we want to make a list of dictionaries of faculty members along with their titles, for example: [{'name': 'Bodarky George', 'title': 'Adjunct Assistant Professor '}, {'name': '...', 'title': '...'}, ...]
In [58]:
# Build a list of {'name': ..., 'title': ...} dictionaries, one per entry of
# the experts list that has an <h4>.
profs = []
ul_tag = document.find('ul', {'class': 'experts-list'})
li_tags = ul_tag.find_all('li')
for item in li_tags:
    h4_tag = item.find('h4')
    # None counts as false in Python: only proceed if an <h4> was actually
    # found under this <li>
    if h4_tag:
        a_tag = h4_tag.find('a')
        p_tag = item.find('p', {'class': 'description'})
        # Store None instead of raising AttributeError when an entry lacks
        # its link or its description paragraph
        prof_map = {
            'name': a_tag.string if a_tag else None,
            'title': p_tag.string if p_tag else None,
        }
        profs.append(prof_map)
profs
Out[58]:
In [59]:
# Print the name stored in every professor dictionary.
for item in profs:
    print(item['name'])
In [61]:
# Print all of the professors whose name string starts with 'M', collect them
# in m_profs, and print how many there are.
# NOTE(review): this assumes names are stored last-name-first, so the first
# character is the last-name initial -- confirm against the scraped data.
m_profs = []
for item in profs:
    prof_name = item['name']
    # Guard against a missing or empty name before testing its first letter
    if prof_name and prof_name.startswith('M'):
        print(prof_name)
        m_profs.append(item)
mcount = len(m_profs)
print(mcount)
In [65]:
# Find all of the professors whose title contains "Adjunct". The condition
# plays the same role as a WHERE clause; entries with no title are skipped.
adjunct_profs = [
    item for item in profs
    if item['title'] is not None and "Adjunct" in item['title']
]
len(adjunct_profs)
Out[65]:
In [ ]:
# Same "Adjunct" filter written as an explicit loop. The list is reset first
# so that re-running this cell cannot append duplicate entries.
adjunct_profs = []
for item in profs:
    if item['title'] is not None and ("Adjunct" in item['title']):
        adjunct_profs.append(item)
In [64]:
message = "bungalow"
message[0]  # index 0 is the first character: 'b'
Out[64]:
In [65]:
message[2:6]
Out[65]:
In [66]:
message[-1]
Out[66]:
In [67]:
message[0:3]
Out[67]:
In [68]:
message[:3]
Out[68]:
In [69]:
message[4:]
Out[69]:
In [70]:
# Negative indices count back from the end of the string. With
# message = "bungalow" this slice covers indices 3 through 5, i.e. 'gal'.
message[-5:-2]
Out[70]:
In [74]:
x=5
In [75]:
x
Out[75]:
In [76]:
x = x-1
In [77]:
x
Out[77]:
In [78]:
x -= 1
In [79]:
x
Out[79]:
In [80]:
x *=2
In [81]:
x
Out[81]:
In [ ]: