In [4]:
from bs4 import BeautifulSoup as bs
import requests

In [5]:
url = 'http://apps.who.int/gho/data/view.ebola-sitrep.ebola-summary-latest?lang=en'

In [6]:
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')  # name the parser explicitly so bs4 does not emit its "no parser was specified" warning

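If the request fails it is better to notice right away; requests can raise for non-2xx responses (a minimal guard, not part of the original session):

    r.raise_for_status()  # raises requests.HTTPError for 4xx/5xx status codes
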

In [26]:
ul = soup.find_all('ul', {'class': 'list_dash'})[0]

In [30]:
lis = ul.find_all('li')

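The two lookups above can also be collapsed into a single CSS selector; a sketch of the equivalent query (not part of the original session):

    lis = soup.select('ul.list_dash li')  # all <li> elements inside the dashed list
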
In [31]:
len(lis)


Out[31]:
283

In [32]:
lis[0]


Out[32]:
<li><a class="link_internal" href="view.ebola-sitrep.ebola-summary-20160120?lang=en">Data published on 20 January 2016</a></li>

In [49]:
url_prefix = 'http://apps.who.int/gho/data/'
from datetime import datetime
d_format = '%d %B %Y'

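A quick sanity check of the format string against the link text shown in Out[32] (%B parses the month name and is locale dependent, so an English locale is assumed):

    datetime.strptime('20 January 2016', d_format).date().isoformat()
    # '2016-01-20'
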
In [50]:
with open('files.csv', 'w') as f:
    for li in lis:
        # drop the "Data published on " prefix; lstrip() would treat it as a
        # character set to strip, not a literal prefix
        d = li.a.text.replace('Data published on ', '')
        date = datetime.strptime(d, d_format).date().isoformat()
        href = li.a.attrs['href']  # keep the index page url from In [5] untouched
        f.write('{},{}{}\n'.format(date, url_prefix, href))

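For anything beyond this simple two-column file, Python's csv module takes care of quoting and delimiters; a sketch of the same loop written with it (assuming the lis, d_format and url_prefix variables defined above):

    import csv

    with open('files.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        for li in lis:
            # same prefix stripping and date normalisation as in In [50]
            d = li.a.text.replace('Data published on ', '')
            date = datetime.strptime(d, d_format).date().isoformat()
            writer.writerow([date, url_prefix + li.a.attrs['href']])
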
In [38]:
li


Out[38]:
<li><a class="link_internal" href="view.ebola-sitrep.ebola-summary-20141112?lang=en">Data published on 12 November 2014</a></li>

In [39]:
li.a.url  # dotted access looks for a child <url> tag, so this returns None (hence no Out)

In [40]:
li.a


Out[40]:
<a class="link_internal" href="view.ebola-sitrep.ebola-summary-20141112?lang=en">Data published on 12 November 2014</a>

In [44]:
li.a.attrs['href']


Out[44]:
'view.ebola-sitrep.ebola-summary-20141112?lang=en'

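Since the hrefs are relative, urllib.parse.urljoin can also resolve them against the page that was scraped, instead of concatenating a hand-written prefix (a sketch, assuming url still holds the index page address from In [5]):

    from urllib.parse import urljoin

    urljoin(url, li.a.attrs['href'])
    # 'http://apps.who.int/gho/data/view.ebola-sitrep.ebola-summary-20141112?lang=en'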