In [1]:
import glob
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
In [4]:
# Parse every <table> element in the saved landing page into a list of
# DataFrames; mainpage[0] is the index table used by the cells below.
mainpage = pd.read_html('data/mainpage.html')
In [33]:
# see the first few rows of data
# .ix was removed in pandas 1.0; .loc performs the same label-based,
# end-inclusive row slice on the default integer index.
mainpage[0].loc[2300:2320, :]
Out[33]:
In [ ]:
# Scrape the detail page for each record id in rows 2301-2500 of the
# index table and save each page's HTML under data/source_<id>.
# Fixes vs. original: removed-API .ix -> .loc, `id` no longer shadows the
# builtin, `with` guarantees the output file is closed, and try/finally
# guarantees the browser is quit even if a fetch fails mid-loop.
browser = webdriver.Firefox()  # launch Firefox web browser
try:
    for page_id in mainpage[0].loc[2301:2500, 0]:
        url = 'http://23.88.121.143:8080/fullInfo.xhtml?type=39&id=%d' % page_id
        browser.get(url)
        time.sleep(15)  # wait for a page to finish loading
        html_source = BeautifulSoup(browser.page_source, 'lxml')
        with open('data/source_%d' % page_id, 'w') as outfile:
            outfile.write(str(html_source))
finally:
    browser.quit()  # always release the browser, even on error
In [ ]:
# Extract columns 1 and 0 of the 4th table from every saved page source.
# Fixes vs. original: the glob pattern needs the 'data/' prefix (sources
# were saved there), `op` was never opened (opened here via `with`),
# `print >> op` is Python 2 syntax (now print(..., file=op)), and the
# removed .ix indexer is replaced with positional .iloc.
with open('data/extracted.txt', 'w') as op:
    for f in glob.glob('data/source_*'):
        tables = pd.read_html(f)
        if not tables[3].empty:
            print(f, tables[3].iloc[:, 1], tables[3].iloc[:, 0], file=op)