In [157]:
import requests
from bs4 import BeautifulSoup
import re
from string import punctuation
In [120]:
# Fetch the Mendeley people-search page for the query and parse its HTML.
# The timeout keeps the call from blocking forever on an unresponsive server,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were a page of search results.
page = requests.get(
    'http://www.mendeley.com/people/search/?query=paul+groth+vrije+universiteit+amsterdam',
    timeout=30,
)
page.raise_for_status()
# Name the parser explicitly so the resulting tree does not depend on which
# optional parser libraries happen to be installed on this machine.
soup = BeautifulSoup(page.text, 'html.parser')
In [121]:
# Take the first <article> element of the parsed search page (presumably one
# search hit -- verify against the page markup) and download the page that
# its first link points to.
person = soup.find("article")
if person is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise LookupError("no <article> element found in the search results page")
# TODO: do some matching (currently the first <article> is used as-is)
anchor = person.find('a', href=True)
if anchor is None:
    raise LookupError("the first <article> contains no link with an href")
link = anchor['href']
# Same safeguards as any other request: bounded wait, and no silent parsing
# of an HTTP error page.
profile = requests.get(link, timeout=30)
profile.raise_for_status()
In [164]:
# Parse the downloaded profile page and print the text of every block whose
# id starts with "experiences_info_<digit>", one text fragment per line,
# followed by a '-' separator line after each block.
profile_soup = BeautifulSoup(profile.text, 'html.parser')
# Wrapper tags that are stripped from each block while keeping their content.
invalid_tags = ['span', 'br', 'div']
# Raw string: '\d' in a plain string literal is an invalid escape sequence.
for div in profile_soup.find_all('div', id=re.compile(r'^experiences_info_[\d]')):
    bio = div.find('div', class_="prof_bio_right")
    if bio is None:
        # This block has no "prof_bio_right" child; nothing to print for it.
        continue
    # find_all() accepts a list of tag names, so all wrappers are collected
    # in one pass; unwrap() replaces each tag with its own children.
    for match in bio.find_all(invalid_tags):
        match.unwrap()
    # stripped_strings yields every non-blank text node already stripped.
    # Unlike calling .strip() on each entry of bio.contents, it does not
    # fail when a tag outside invalid_tags (e.g. <a>) is still present, and
    # it skips HTML comments.
    for item in bio.stripped_strings:
        print(item)
    print('-')
In [ ]: