Introduction

In this recipe, we use the Selenium and BeautifulSoup packages to scrape AJAX-driven pages and collect their data automatically.

Data are then merged and converted to a single CSV file.


In [1]:
import glob
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

Load HTML from the main page

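First, we get the list of molecules from the pre-downloaded HTML main page using pandas. `pd.read_html` returns a list with one DataFrame per table found in the page, so the molecule table is accessed as `mainpage[0]`.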


In [4]:
# Parse the saved main page; read_html returns a list of DataFrames, one per table found.
mainpage = pd.read_html(io='data/mainpage.html')

In [33]:
# Preview a slice of the main-page table: index labels 2300 through 2320 (inclusive).
# `.ix` was removed from pandas; `.loc` gives the same label-based selection here.
mainpage[0].loc[2300:2320, :]


Out[33]:
Result(5688/5688) ID Name N terminus Sequence C terminus Activity ( µg/ml ) View
2300 2491 Varv He, Varv peptide H NaN GLPVCGETCFGGTCNTPGCSCXTWPVCSRN NaN View NaN NaN
2301 2492 Varv Hm, Varv peptide H NaN GLPVCGETCFGGTCNTPGCSCXTWPVCSRN NaN View NaN NaN
2302 2493 Vitri B NaN GYPICGESCVGGICNIPGCSCSWPVCTTN NaN View NaN NaN
2303 2494 Vitri C NaN GLPICGETCVGGTCNTPGCFCTWPVCTRN NaN View NaN NaN
2304 2495 Vitri D NaN GLPVCGETCFTGSCYTPGCSCNWPVCNRN NaN View NaN NaN
2305 2496 Vitri E, Cyclotide C NaN GLPVCGETCVGGTCNTPGCSCSWPVCFRN NaN View NaN NaN
2306 2497 Vitri F, Cycloviolacin-O19 NaN GTLPCGESCVWIPCISSVVGCACKSKVCYKD NaN View NaN NaN
2307 2498 X1 NaN PRLKVYLPRYKVYSTAAGRYQLLSRYWDAYR NaN View NaN NaN
2308 2499 X2 NaN PRLKVYLPRYKVYSTAAGRYQLLSRYW NaN View NaN NaN
2309 2500 X3 NaN PRLKVYLPRYKVYSTAAGRY NaN View NaN NaN
2310 2501 X4 NaN PRLKVYLPRYKVYSTA NaN View NaN NaN
2311 2502 X2a NaN PRLKVYLPRYKVYSTAAGRYQLLSRYW AMD View NaN NaN
2312 2503 X3a NaN PRLKVYLPRYKVYSTAAGRY NaN View NaN NaN
2313 2504 Kassinatuerin-1 NaN GFMKYIGPLIPHAVKAISDLI AMD View NaN NaN
2314 2505 Kassinatuerin-1 (10?21) NaN IPHAVKAISDLI AMD View NaN NaN
2315 2506 Lys13 Kassinatuerin-1 NaN GFMKYIGPLIPHKVKAISDLI AMD View NaN NaN
2316 2507 Lys19 Kassinatuerin-1 NaN GFMKYIGPLIPHAVKAISKLI AMD View NaN NaN
2317 2508 Lys18,Lys19 Kassinatuerin-1 NaN GFMKYIGPLIPHAVKAIKKLI AMD View NaN NaN
2318 2509 Lys7,Lys19 Kassinatuerin-1 NaN GFMKYIKPLIPHAVKAISKLI AMD View NaN NaN
2319 2510 Lys7,Lys18,Lys19 Kassinatuerin-1 NaN GFMKYIKPLIPHAVKAIKKLI AMD View NaN NaN
2320 2511 Kassinatuerin-1 -COOH NaN GFMKYIGPLIPHAVKAISDLI NaN View NaN NaN

In [ ]:
# Fetch the detail page of each selected entry with a real browser (the pages are
# AJAX-driven) and cache the rendered HTML as data/source_<id> for later parsing.
# NOTE(review): written for Python 3 (open(..., encoding=...)).
browser = webdriver.Firefox()  # launch the Firefox web browser

# Rows are selected by index label (2301..2500, inclusive); the first column
# (position 0) supplies the number used as `id` in the detail-page URL.
# `.ix` was removed from pandas, hence the explicit .loc / .iloc split.
peptide_ids = mainpage[0].loc[2301:2500].iloc[:, 0]

try:
    for peptide_id in peptide_ids:  # avoid shadowing the builtin `id`
        url = 'http://23.88.121.143:8080/fullInfo.xhtml?type=39&id=%d' % peptide_id
        browser.get(url)
        time.sleep(15)  # wait for the page to finish loading
        html_source = BeautifulSoup(browser.page_source, 'lxml')
        # Open the file only after the page was fetched, so a failed request
        # does not leave an empty file behind; `with` guarantees it is closed.
        with open('data/source_%d' % peptide_id, 'w', encoding='utf-8') as outfile:
            outfile.write(str(html_source))
finally:
    # Always shut the browser down, even when a download or write fails.
    browser.quit()

In [ ]:
# Read every cached detail page and write the second and first columns of its
# fourth table (tables[3]) to a single output file, one record per page.
# NOTE(review): the original wrote to a handle `op` that is not defined in any
# visible cell; the output path below is a placeholder - adjust as needed.
# NOTE(review): written for Python 3 (print(..., file=...)).
with open('data/merged_tables.txt', 'w', encoding='utf-8') as op:
    # The download step saves pages as data/source_<id>, so search inside data/.
    # sorted() makes the processing order reproducible across runs.
    for f in sorted(glob.glob('data/source_*')):
        tables = pd.read_html(f)
        # Skip pages that have fewer than four tables or an empty fourth table.
        if len(tables) > 3 and not tables[3].empty:
            # `.ix` was removed from pandas; `.iloc` selects the columns by position.
            print(f, tables[3].iloc[:, 1], tables[3].iloc[:, 0], file=op)