In [2]:
from bs4 import BeautifulSoup
import requests
import re
import unicodedata
import string
url = "http://dl.acm.org/citation.cfm?id=2783258&preflayout=flat"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data)
In [3]:
#print(soup.prettify()); # remove colon to see the format of the html we are parsing
In [4]:
# builds a list of journal and keynote speaker abstracts; remove the colon below to see the list output
abstracts=[]
for l in soup.find_all('p'):
abstracts.append(filter(lambda x: x in string.printable,unicodedata.normalize('NFKD', l.get_text()).encode('ascii','ignore').decode('unicode_escape').encode('ascii','ignore')))
abstracts;
In [7]:
# similar to code above, builds a list of all authors and speakers
authors=[]
for l in soup.find_all('a', href=re.compile('author_page.cfm')):
authors.append(filter(lambda x: x in string.printable,unicodedata.normalize('NFKD', l.get_text()).encode('ascii','ignore').decode('unicode_escape').encode('ascii','ignore')))
authors = [x for x in authors if x != "View colleagues"] # removes leftover value
#set(authors)
authors; # remove the colon to see the output
In [8]:
# similar to code above, builds a list of titles
titles = []
for l in soup.find_all('a', href=re.compile('citation.cfm')):
titles.append(filter(lambda x: x in string.printable,unicodedata.normalize('NFKD', l.get_text()).encode('ascii','ignore').decode('unicode_escape').encode('ascii','ignore')))
titles; # remove colon to see list output