In [1]:
from selenium import webdriver
# import urllib  # urllib can send a web request and return the raw HTML text, but it won't run the site's JavaScript
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from lxml import html
import numpy
# import dependencies
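As an aside on the urllib comment above: a plain request returns only the initial, static HTML, so the rental-car results would never appear in it; that is why this walkthrough drives a real browser instead. A minimal sketch of the urllib approach (untested against this site):
In [ ]:
# static fetch only; no JavaScript runs, so the results table will be missing
import urllib
raw_html = urllib.urlopen('http://costcotravel.com/Rental-Cars').read()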
In [2]:
browser = webdriver.Firefox() # I only tested in Firefox
browser.get('http://costcotravel.com/Rental-Cars')
browser.implicitly_wait(5) # wait up to 5 seconds for elements to appear before any lookup fails
In [3]:
browser.find_element_by_id('pickupLocationTextWidget').send_keys("PHX");
In [4]:
browser.find_element_by_css_selector('.sayt-result').click() # click the first airport suggestion
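If the suggestion box hasn't rendered yet, the click above can fail. An explicit wait is more reliable than the implicit one; here is a sketch using Selenium's WebDriverWait with the same selector:
In [ ]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# wait up to 10 seconds for the suggestion to become clickable, then click it
suggestion = WebDriverWait(browser, 10).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, '.sayt-result')))
suggestion.click()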
In [5]:
browser.find_element_by_id("pickupDateWidget").send_keys('08/27/2016')#you can't send it directly, need to clear first
In [6]:
browser.find_element_by_id("pickupDateWidget").clear()
In [7]:
browser.find_element_by_id("pickupDateWidget").send_keys('08/27/2016')
In [8]:
browser.find_element_by_id("dropoffDateWidget").clear()
In [9]:
browser.find_element_by_id("dropoffDateWidget").send_keys('08/31/2016',Keys.RETURN)
In [10]:
browser.find_element_by_css_selector('#pickupTimeWidget option[value="03:00 PM"]').click() #select time
In [11]:
browser.find_element_by_css_selector('#dropoffTimeWidget option[value="03:00 PM"]').click()
In [12]:
browser.find_element_by_link_text('SEARCH').click() #click the red button !!
In [15]:
n = browser.page_source #grab the page source
The following code does the same thing as the cells above, but sends all the commands in one go. The implicit waits give the driver time to make its AJAX requests and render the page elements. Note that you can also locate elements with the find_element_by_xpath method.
In [14]:
# import time
# browser = webdriver.Firefox() #I only tested in Firefox
# browser.get('http://costcotravel.com/Rental-Cars')
# browser.implicitly_wait(5)#wait for webpage download
# browser.find_element_by_id('pickupLocationTextWidget').send_keys("PHX");
# browser.implicitly_wait(5) # implicit waits apply to element lookups, giving the airport suggestion box time to show
# browser.find_element_by_xpath('//li[@class="sayt-result"]').click()
# #click the airport suggestion box
# browser.find_element_by_xpath('//input[@id="pickupDateWidget"]').send_keys('08/27/2016')
# browser.find_element_by_xpath('//input[@id="dropoffDateWidget"]').send_keys('08/30/2016',Keys.RETURN)
# browser.find_element_by_xpath('//select[@id="pickupTimeWidget"]/option[@value="09:00 AM"]').click()
# browser.find_element_by_xpath('//select[@id="dropoffTimeWidget"]/option[@value="05:00 PM"]').click()
# browser.implicitly_wait(5) # note: implicit waits cover element lookups, not the clicks themselves
# browser.find_element_by_link_text('SEARCH').click()
# #click the search box
# time.sleep(8) #wait for firefox to download and render the page
# n = browser.page_source #grab the html source code
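Instead of a fixed time.sleep(8), an explicit wait for the results table is more robust. A sketch, assuming the results page renders the same rentalCarTableDetails div we parse below:
In [ ]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# block (up to 30 seconds) until the results table is present, then grab the source
WebDriverWait(browser, 30).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.rentalCarTableDetails')))
n = browser.page_source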
In [16]:
type(n) # the page source comes back as unicode
Out[16]:
In [17]:
soup = BeautifulSoup(n,'lxml') #use BeautifulSoup to parse the source
In [18]:
print "--------------first 1000 characters:--------------\n"
print soup.prettify()[:1000]
print "\n--------------last 1000 characters:--------------"
print soup.prettify()[-1000:]
In [19]:
table = soup.find('div',{'class':'rentalCarTableDetails'}) #find the table
In [20]:
print "--------------first 1000 characters:--------------\n"
print table.prettify()[:1000]
print "\n--------------last 1000 characters:--------------"
print table.prettify()[-1000:]
In [21]:
tr = table.select('tr') # grab all the table rows; let's look at a few of them
In [22]:
type(tr)
Out[22]:
In [23]:
# let's look at the first three rows
for i in tr[0:3]:
    print i.prettify()
    print "-----------------------------------"
Let's play with one of the rows.
In [24]:
row = tr[3]
In [25]:
row.find('th',{'class':'tar'}).text.encode('utf-8')
Out[25]:
In [26]:
row
Out[26]:
In [27]:
row.contents[4].text #1. this is unicode, 2. the dollar sign is in the way
Out[27]:
In [28]:
'Car' in 'Econ Car' #use this string logic to filter out unwanted data
Out[28]:
In [29]:
rows = [i for i in tr
        if 'Price' not in i.contents[0].text
        and 'Fees' not in i.contents[0].text
        and 'Location' not in i.contents[0].text
        and i.contents[0].text != ''
        and len(i.contents[0].text) < 30]
# use this list comprehension to keep only the rows we want:
# 1. don't want the text 'Price' in the first column
# 2. don't want the text 'Fees' in the first column
# 3. don't want the text 'Location' in the first column
# 4. the first column must not be empty
# 5. the text of the first column must be less than 30 characters long
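The same filter reads more clearly as a named function (a sketch; wanted_row is a hypothetical helper, not part of the original notebook):
In [ ]:
def wanted_row(row):
    # keep only data rows: skip the price/fees/location rows, empty labels, and long labels
    label = row.contents[0].text
    if label == '' or len(label) >= 30:
        return False
    return not any(word in label for word in ('Price', 'Fees', 'Location'))

rows = [i for i in tr if wanted_row(i)]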
In [30]:
rows[0].contents[0].text #just exploring here...
Out[30]:
In [31]:
rows[0].contents[4].text #need to get rid of the $....
Out[31]:
In [32]:
rows[3].contents[0].text #need to make it utf-8
Out[32]:
In [33]:
#process the data
prices = {}
for i in rows:
    #print the 1st column text
    print i.contents[0].text.encode('utf-8')
    prices[i.contents[0].text.encode('utf-8')] = [i.contents[1].text.encode('utf-8'),
                                                  i.contents[2].text.encode('utf-8'),
                                                  i.contents[3].text.encode('utf-8'),
                                                  i.contents[4].text.encode('utf-8')]
In [34]:
prices
Out[34]:
In [35]:
iteritems = prices.iteritems()
# calling .iteritems() on a dictionary gives you an iterator over its (key, value) pairs
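Note that dict.iteritems() only exists in Python 2; under Python 3 the equivalent is an iterator over d.items():
In [ ]:
# Python 3 equivalent of the cell above (for reference; left commented
# so it doesn't consume entries from the iterator used below)
# iteritems = iter(prices.items())
# next(iteritems)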
In [36]:
iteritems.next() #run me five times
Out[36]:
In [37]:
for name, priceList in prices.iteritems():
    newPriceList = []
    for i in priceList:
        newPriceList.append(i.replace('$',''))
    prices[name] = newPriceList
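The same cleanup can be written as a dict comprehension (an equivalent sketch; stripping '$' is idempotent, so running it after the loop is harmless):
In [ ]:
# equivalent to the loop above: strip the dollar sign from every price
prices = {name: [p.replace('$', '') for p in priceList]
          for name, priceList in prices.iteritems()}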
In [38]:
prices
Out[38]:
In [39]:
data = pd.DataFrame.from_dict(prices, orient='index') #get a pandas DataFrame from the prices dictionary
In [40]:
data
Out[40]:
In [41]:
data = data.replace('Not Available', numpy.nan) # replace the 'Not Available' entries with numpy.nan
In [42]:
data = data.apply(pd.to_numeric, errors='coerce') # cast to numeric data; pd.to_numeric works on a Series, so apply it column by column
In [43]:
data
Out[43]:
In [44]:
data.columns= ['Alamo','Avis','Budget','Enterprise'] #set column names
In [45]:
data
Out[45]:
In [46]:
data.notnull() #check for missing data
Out[46]:
In [47]:
data.min(axis=1, skipna=True) #look at the cheapest car in each class
Out[47]:
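To see which company offers that cheapest price in each class, idxmin returns the column name of the row minimum (a small sketch):
In [ ]:
# name of the company with the lowest price in each car class (NaNs are skipped)
data.idxmin(axis=1)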
From this point on, you can set this up to run every night and email yourself the results, etc.
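For the emailing step, a minimal sketch with Python's standard smtplib; the server, credentials, and addresses are placeholders to fill in, and the nightly schedule itself can come from cron:
In [ ]:
import smtplib
from email.mime.text import MIMEText

# build a plain-text email from the cheapest-car-per-class Series
msg = MIMEText(data.min(axis=1, skipna=True).to_string())
msg['Subject'] = 'Nightly Costco rental car prices'
msg['From'] = 'me@example.com'      # placeholder address
msg['To'] = 'me@example.com'        # placeholder address

server = smtplib.SMTP('smtp.example.com', 587)  # placeholder SMTP server
server.starttls()
server.login('me@example.com', 'password')      # placeholder credentials
server.sendmail(msg['From'], [msg['To']], msg.as_string())
server.quit()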
In [ ]: