# In [ ]:
# this is the boilerplate import language needed to get Selenium
# up and running. Just make sure you include this for any Selenium script
# you want to write.
import csv

from lxml import html
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
# -*- coding: utf-8 -*-
# here, we create a function to go to the next page of our target site
# Selenium acts like a browser, so we can tell it to find the "next page" link and "click" on it
def go_to_next_page():
    """Click the "next page" link of the results table in the shared browser.

    Operates on the module-level ``driver``. The XPath picks the first ``<a>``
    that is a following sibling of a ``<span>`` inside the
    ``datagrid_results`` table (presumably the span marks the current page
    number in the pager -- confirm against the live page).

    Raises:
        Whatever ``driver.find_element`` raises when no such link exists.
        The main loop of this script relies on that to detect the last page.
    """
    # The find_element_by_* helpers were removed in Selenium 4.3. The generic
    # find_element(by, value) form works on old and new releases alike;
    # "xpath" is the value of By.XPATH.
    next_page_link = driver.find_element(
        "xpath",
        '//table[@id="datagrid_results"]//tr//td//span/following-sibling::a',
    )
    print("Going to page", next_page_link.text)
    next_page_link.click()
# here we loop through the rows to capture the data from the table at our target page
# to pull off each row of data, we loop through its rows based on the xpath
def get_data(source):
    """Extract the cell text of the results table from a page's HTML.

    Args:
        source: HTML of the page, in a form ``lxml.html.fromstring`` accepts.

    Returns:
        A list of rows, each a list with the text content of every ``<td>``
        in that row of the ``datagrid_results`` table. The first and last
        rows found are dropped (per the original author's note these are the
        header row and the pagination row).
    """
    # Parse the raw page source into an element tree lxml can query.
    tree = html.fromstring(source)
    # One inner list per <tr>, holding the text of each <td> in that row.
    table_rows = [
        [cell.text_content() for cell in tr.xpath(".//td")]
        for tr in tree.xpath(".//table[@id='datagrid_results']//tr")
    ]
    # Trim the first and last rows before handing the data back.
    return table_rows[1:-1]
## Everything above was our functions - our tools - that we'll now use below in the actual
## sequence of the script
# Load the first page.
# When we load the page, a Firefox web browser window will actually open up
# and Selenium will control it based on our instructions.
print("Loading the first page")
driver = webdriver.Firefox()

# Everything after the browser is started runs inside try/finally so the
# browser session is always shut down, even if a step below raises.
try:
    driver.get("http://www.licensepa.state.pa.us/")

    # Select medicine and surgery from the dropdown menu.
    # NOTE: the find_element_by_* helpers were removed in Selenium 4.3, so the
    # generic find_element(by, value) form is used ("name" == By.NAME,
    # "id" == By.ID); it works on older releases too.
    print("Selecting the dropdown")
    dropdown = Select(
        driver.find_element("name", "t_web_lookup__license_type_name")
    )
    dropdown.select_by_value("Medical Physician and Surgeon")

    # Here we'll enter a last name letter and wildcard.
    # Since this site is intolerably slow, we'll carve out just last names
    # starting with a certain letter to keep things from crashing.
    currentletter = "x"
    print("Selecting last names that start with '" + currentletter + "'")
    textinput = driver.find_element("name", "t_web_lookup__last_name")
    textinput.send_keys(currentletter + "*")

    # Click the search button.
    print("Clicking the search button")
    search_button = driver.find_element("id", "sch_button")
    search_button.click()

    # Copy the data we capture into a .csv file whose name includes the
    # letter of the last names we've pulled down.
    failed = False
    currentcsv = "pa-" + currentletter + ".csv"
    # newline="" is what the csv module requires of the file object (it
    # otherwise produces blank lines between rows on Windows); the explicit
    # encoding keeps non-ASCII names from failing on platforms whose default
    # encoding is not UTF-8.
    with open(currentcsv, "w", newline="", encoding="utf-8") as output:
        writer = csv.writer(output, delimiter=',')
        while not failed:
            print("Trying a new page")
            # Get the data on the page
            data_from_page = get_data(driver.page_source)
            # Write them to the CSV
            writer.writerows(data_from_page)
            # Go to the next page; on success the loop simply repeats.
            try:
                go_to_next_page()
            except NoSuchElementException:
                # No "next page" link means we are on the last page. Any
                # other error is allowed to propagate so a real failure is
                # not reported as success below.
                failed = True

    # If we get here without an exception, every page was processed.
    print("This job processed successfully.")
    print("Names with the letter '" + currentletter + "' are done!")
finally:
    # Always end the Selenium session. quit() closes every window and stops
    # the driver process, whereas close() only closes the current window.
    driver.quit()