In [ ]:
# HACK: the Google API key below is split, padded and partly reversed purely to
# dodge naive secret scanners on GitHub - this is obfuscation, not protection.
# Prefer supplying the key through the GOOGLE_MAPS_API_KEY environment variable;
# the embedded value is only used as a fallback when that variable is unset or empty.
import os

gapikey1 = 'asdasd_AIzaSyA8ZTz_'
gapikey2 = 'dsfsdf_o3VhApo4CoehKa6m7_cFLKKBnXt'
# Join part one with the reversed part two, then drop the 7 junk characters
# of padding at each end.
gapikey = os.environ.get('GOOGLE_MAPS_API_KEY') or (gapikey1+gapikey2[::-1])[7:-7]
# Browser-like request headers; getagencylist copies this dict and sends it
# with its initial GET of the Costco Travel page.
headers_base = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/60.0.3112.113 Safari/537.36'
    ),
    'Referer': 'https://www.costcotravel.com',
    'Host': 'www.costcotravel.com',
}
# Base form fields for the rental car search POST; getagencylist copies this
# dict and adds 'pickupDate' / 'dropoffDate' per request.
# ZIP, RADIUS and TIME are not defined in this cell and must already exist
# when it runs. Key order is kept as-is because it is the order in which the
# form fields are encoded.
data_base = {
    'rcs': '1',
    'driverAge': '25',

    # search area (same zip / radius for pickup and dropoff)
    'pickupZip': ZIP,
    'pickupCityLocationTypeSearch': '2',
    'dropoffZip': ZIP,
    'dropoffCityLocationTypeSearch': '2',
    'pickupCountry': 'US',
    'dropoffCountry': 'US',
    'pickupCityRadius': RADIUS,
    'dropoffCityRadius': RADIUS,
    'pickupAsAirport': 'false',
    'dropoffAsAirport': 'false',

    # time of day (same for pickup and dropoff)
    'pickupTime': TIME,
    'dropoffTime': TIME,

    # pickup location description (city label is fixed to Chicago)
    'pickupLocationCode': ZIP,
    'pickupLocationName': ZIP + ' (CHICAGO, IL, US)',
    'pickupLocationType': 'zipCode',
    'pickupLocationCityCode': 'CHICAGO',
    'pickupLocationStateCode': '',
    'pickupLatitude': '',
    'pickupLongitude': '',

    # dropoff location description (mirrors pickup)
    'dropoffLocationCode': ZIP,
    'dropoffLocationName': ZIP + ' (CHICAGO, IL, US)',
    'dropoffLocationType': 'zipCode',
    'dropoffLocationCityCode': 'CHICAGO',
    'dropoffLocationStateCode': '',
    'dropoffLatitude': '',
    'dropoffLongitude': '',

    # flow flags
    'fromHomePage': 'true',
    'fromCarVendorMainMenu': 'true',
    'carSearchInModifyFlow': 'false',
    'suppressOutput': 'false',
}
In [ ]:
# Execute the DB_driver notebook inside this notebook's namespace.
# NOTE(review): presumably this is what provides `session`, `Price` and `Agency`
# used by the functions below - confirm against DB_driver.ipynb.
%run DB_driver.ipynb
In [3]:
# Make this notebook wiiiiiiiiide
# `display` and `HTML` come from the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject a style override so the notebook container spans the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))
In [ ]:
def getagencylist(span,agencyid=None,agencybcode=None):
    """
    Obtains lists of agencies, unless specific id/brand are specified as filters

    Opens a fresh requests session, loads the Costco Travel rental page to pick
    up the CSRF token and session timestamp, posts the rental car search form
    for the given dates and scrapes the agency records from the response text.
    Only records carrying "isOpen":true and an empty "unAvailableMessage" match
    the scraping pattern.

    Parameters:
        span: pair (pickup, dropoff) of objects supporting strftime.
        agencyid: agency code to keep (compared with the scraped "agencyCode").
        agencybcode: vendor code to keep (compared with the scraped "vendorCode").
            The filter is applied only when BOTH agencyid and agencybcode are
            truthy; otherwise every scraped agency is returned.

    Returns:
        Tuple (results, s). results maps the integer part of the scraped id
        ("S<n>") to a dict with keys num, type, brand, code, name, bcode, addr,
        city, state, zip, dist, lat, lon, start, end. s is the requests session
        used, so the caller can continue the same browsing flow.

    Raises:
        RuntimeError: the CSRF token or session timestamp could not be found
            in the landing page.
    """
    # copy headers and set specific dates
    headers = dict(headers_base)
    data = dict(data_base)
    data.update({'pickupDate':span[0].strftime("%m/%d/%Y"),
                 'dropoffDate':span[1].strftime("%m/%d/%Y")})

    # get initial rent page
    s = requests.Session()
    r3 = s.get('https://www.costcotravel.com/h=4005',headers=headers)
    csrf_match = re.search(r'\("Csrf-token", "(\S{128})"\)',r3.text)
    ts_match = re.search(r'namespace.sessionTimestamp = (\d{10,20});',r3.text)
    if csrf_match is None or ts_match is None:
        # Fail with a clear message instead of AttributeError on None.group()
        raise RuntimeError('Could not find Csrf-token / sessionTimestamp in the landing page')
    csrf = csrf_match.group(1)
    ts = ts_match.group(1)

    # various settings: the search POST expects the token both as a header and a cookie
    s.headers.update({'X-Csrf-Token':csrf})
    s.headers.update({'Referer':'https://www.costcotravel.com/h=4005'})
    s.cookies.set('Csrf-token',csrf)
    if 'SESSION_TIME_OUT_DETECTED' in s.cookies: s.cookies.pop('SESSION_TIME_OUT_DETECTED')
    s.cookies.set('SESSION_TIMESTAMP',str(ts))
    time.sleep(1.0)

    r = s.post('https://www.costcotravel.com/rentalCarSearch.act',
               data=data)

    # Raw strings keep the regex escapes (\d, \S) intact; 13 capture groups per agency.
    res = re.findall(
        (r'\{"id":"S(\d{1,3})","type":"(city|airport)","agency":"(\S{1,40})","agencyCode":"(\S{1,8})",'
         r'"title":"(.{4,50})","isOpen":true,"unAvailableMessage":"","vendorCode":"(\S{2,3})","address"'
         r':"(.{5,110})","city":"(.{3,30})","state":"(\S{2})","zip":"'
         r'(\d{5})","country":"US","distance":"(\S{2,7})","latitude":(\S{2,10}),"longitude":(\S{2,10}),'),
        r.text)

    # Names for capture groups 2..13, in pattern order (group 1 becomes 'num').
    fields = ('type','brand','code','name','bcode','addr','city','state','zip','dist','lat','lon')
    filtering = bool(agencyid and agencybcode)
    results = dict()
    for tp in res:
        if filtering and not (tp[3] == agencyid and tp[5] == agencybcode):
            continue
        record = {'num':int(tp[0])}
        record.update(zip(fields,tp[1:]))
        record['start'] = data['pickupDate']
        record['end'] = data['dropoffDate']
        results[record['num']] = record
    return (results,s)
In [ ]:
def getprice(session,agencybcode,agencyid,span,idx=None):
    """
    Obtains a single price set from particular location

    Looks the agency up through getagencylist (filtered by vendor code and
    agency code), then posts the agency selection on the same requests session
    and scrapes the category/price pairs from the returned page.

    Parameters:
        session: not used by this function; kept so existing callers
            (runall passes the module-level `session`) keep working.
        agencybcode: vendor code of the agency.
        agencyid: agency code of the agency.
        span: pair (pickup, dropoff) of objects supporting strftime.
        idx: caller-supplied index, echoed back as the first tuple element
            and used as a zero-padded prefix in progress messages.

    Returns:
        (idx, record) where record is the agency dict from getagencylist
        extended with one entry per scraped category (value is a float when
        the price parses as one, otherwise the raw string); (idx, None) when
        the agency was not found for this span.
    """
    (match,s) = getagencylist(span,agencybcode=agencybcode,agencyid=agencyid)

    # "{:03d}" cannot format the default idx=None, so fall back to a placeholder
    tag = "{:03d}".format(idx) if idx is not None else "---"

    if not match:
        print("{} ND | ".format(tag),agencyid)
        return (idx,None)

    # first matching record; should match those weird same code ones correctly
    resd = next(iter(match.values()))
    time.sleep(1.0)
    s.headers.update({'Referer':'https://www.costcotravel.com/h=3001'})
    data2 = {
        'rcas':'Load_Forword_Navigation_From_Agency_Results',
        'carAgenciesForVendors':'[{{"vendorId":"{}","agencyCodes":["{}"]}}]'.format(
            resd['bcode'],resd['code']),
    }
    r2 = s.post('https://www.costcotravel.com/rentalCarAgencySelection.act',
                data=data2)

    # Each hit is (category label from the <h3>, value of data-price).
    prices = re.findall(r'<h3>(.{5,40})</h3></div></div></div><div style="height: 94px;" class="col col-lg col-1 '
                        r'col-lg-1 text-center height-item test"><a data-responsive-referrer="carMatrix" '
                        r'data-category-id="\S{20,50}" data-selected="(?:false|true)" data-product-id="'
                        r'\S{20,50}" data-price="(\S{4,30})"',
                        r2.text)

    if not prices:
        print("{} ND | ".format(tag),resd['start'],resd['end'],resd['num'],resd['brand'],
              resd['code'],resd['name'],resd['bcode'],resd['dist'])
    else:
        for label,amount in prices:
            try:
                resd[label] = float(amount)
            except ValueError:
                # keep non-numeric price text as scraped
                resd[label] = amount
    return (idx,resd)
In [ ]:
# def getprice(session,agencybcode,agencyid,span,idx=None):
# """
# Obtains a single price set from particular location
# """
# # # copy headers and set specific dates
# # headers = dict(headers_base)
# # data = dict(data_base)
# # data.update({'pickupDate':span[0].strftime("%m/%d/%Y"),
# # 'dropoffDate':span[1].strftime("%m/%d/%Y")})
# # # get initial rent page
# # s = requests.Session()
# # r3 = s.get('https://www.costcotravel.com/h=4005',headers=headers)
# # cks = r3.cookies
# # csrf = re.search('\(\"Csrf-token\", \"(\S{128})\"\)',r3.text).group(1)
# # ts = re.search('namespace.sessionTimestamp = (\d{10,20});',r3.text).group(1)
# # # various settings
# # headers.update({'X-Csrf-Token':csrf})
# # headers.update({'Referer':'https://www.costcotravel.com/h=4005'})
# # cks.set('Csrf-token',csrf)
# # if 'SESSION_TIME_OUT_DETECTED' in cks: cks.pop('SESSION_TIME_OUT_DETECTED')
# # cks.set('SESSION_TIMESTAMP',str(ts))
# # time.sleep(0.8)
# # # get agency list
# # r = s.post('https://www.costcotravel.com/rentalCarSearch.act',
# # data=data, headers=headers, cookies=cks)
# # res = re.findall(
# # ('{\"id\":\"S(\d{1,3})\",\"type\":\"(city|airport)\",\"agency\":\"(\S{1,40})\",\"agencyCode\":\"(\S{1,8})\",'
# # '\"title\":\"(.{4,50})\",\"isOpen\":true,\"unAvailableMessage\":\"\",\"vendorCode\":\"(\S{2,3})\",\"address\"'
# # ':\"(.{5,110})\",\"city\":\"(.{3,30})\",\"state\":\"(\S{2})\",\"zip\":\"'
# # '(\d{5})\",\"country\":\"US\",\"distance\":\"(\S{2,7})\",\"latitude\":(\S{2,10}),\"longitude\":(\S{2,10}),'),
# # r.text)
# # #print(r.text)
# # #print(res)
# # match = [v for v in res if (v[3] == agencyid and v[2] == agencybrand)]
# (match,s) = getagencylist(span,agencybcode=agencybcode,agencyid=agencyid)
# print(match)
# if len(match)>0:
# # tp = match[0] # should match those weird same code ones correctlys
# # resd = {'num':int(tp[0]),'type':tp[1],'brand':tp[2],'code':tp[3],'name':tp[4],
# # 'bcode':tp[5],'addr':tp[6],'city':tp[7],'state':tp[8],'zip':tp[9],
# # 'dist':tp[10],'lat':tp[11],'lon':tp[12],'start':data['pickupDate'],
# # 'end':data['dropoffDate']}
# resd = list(match.values())[0]
# time.sleep(1.0)
# s.headers.update({'Referer':'https://www.costcotravel.com/h=3001'})
# k = idx
# v = resd
# data2 = {
# 'rcas':'Load_Forword_Navigation_From_Agency_Results',
# 'carAgenciesForVendors':'[{{\"vendorId\":\"{}\",\"agencyCodes\":[\"{}\"]}}]'.\
# format(v['bcode'],v['code']),
# }
# #print(data2)
# # r2 = s.post('https://www.costcotravel.com/rentalCarAgencySelection.act',
# # data=data2,cookies=cks,headers=headers)
# r2 = s.post('https://www.costcotravel.com/rentalCarAgencySelection.act',
# data=data2)
# prices = re.findall('<h3>(.{5,40})</h3></div></div></div><div style=\"height: 94px;\" class=\"col col-lg col-1'
# 'col-lg-1 text-center height-item test\"><a data-responsive-referrer=\"carMatrix\"'
# 'data-category-id=\"\S{20,50}\" data-selected=\"(?:false|true)\" data-product-id=\"'
# '\S{20,50}\"data-price=\"(\S{4,30})\"',
# r2.text)
# #print(r2.text)
# if len(prices) == 0:
# print("{:03d} ND | ".format(idx),resd['start'],resd['end'],resd['num'],resd['brand'],
# resd['code'],resd['name'],resd['bcode'],resd['dist'])
# else:
# #print("{:03d} OK |".format(k),tp[0],tp[2:5],tp[10:])
# for pr in prices:
# try:
# v[pr[0]] = float(pr[1])
# except:
# v[pr[0]] = pr[1]
# #print(v)
# return (k,v)
# else:
# print("{:03d} ND | ".format(idx),agencyid)
# return (idx,None)
In [ ]:
def runparser(agencylist,threads=1):
    """
    Collects prices for every agency in agencylist over a set of date spans
    and stores them in the database.

    Parameters:
        agencylist: iterable of indexable entries; element 0 of each entry is
            passed to runall as the vendor code and element 1 as the agency code.
        threads: number of worker threads used for the blocking HTTP calls.

    Returns:
        Dict mapping a running index to the record returned by getprice
        (or None where no data was found). The same dict is handed to
        placeintodb before returning.
    """
    assert agencylist is not None
    startidx = 0
    resultsfinal = {}
    for agencyid in agencylist:
        if FIXEDLENGTH:
            # just want a sliding window
            starts = generateDates(PICKUP,VARYSTART)
            ends = generateDates(DROPOFF,VARYSTART)
            assert len(starts) == len(ends)
            spans = list(zip(starts,ends))
        else:
            # creates a cartesian product essentially
            starts = generateDates(PICKUP,VARYSTART)
            ends = generateDates(DROPOFF,VARYEND)
            spans = [(x,y) for x in starts for y in ends]
        resultstemp = dict()
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        # Requests uses blocking io, so doing things in single thread event loop doesn't help much
        # -> have to do executor thingies and run separate threads
        # The `with` block shuts the pool down after each agency so worker
        # threads do not pile up across iterations.
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
            try:
                loop.run_until_complete(runall(executor,resultstemp,agencyid[0],agencyid[1],spans,startidx))
            finally:
                loop.close()
        # keep result indices unique across agencies
        startidx += len(resultstemp)
        resultsfinal.update(resultstemp)
    # You can't really do threads+sqlalchemy without careful connection management
    # So here results are first assembled into dictionary, then batch committed
    placeintodb(resultsfinal)
    return resultsfinal
def placeintodb(resultstemp):
    """
    Adds one Price row per non-empty record in resultstemp to the module-level
    `session`, then commits once at the end.

    Parameters:
        resultstemp: dict whose values are record dicts (with at least
            'code', 'brand', 'start', 'end') or falsy placeholders, which
            are skipped.
    """
    # Price column name -> record key holding that price
    # (presumably the category labels scraped by getprice - verify).
    car_columns = (
        ('car_econ', 'Economy Car'),
        ('car_comp', 'Compact Car'),
        ('car_int', 'Intermediate Car'),
        ('car_std', 'Standard Car'),
        ('car_full', 'Fullsize Car'),
        ('car_prem', 'Premium Car'),
        ('car_lux', 'Luxury Car'),
        ('car_intsuv', 'Intermediate SUV'),
        ('car_stdsuv', 'Standard SUV'),
        ('car_fullsuv', 'Fullsize SUV'),
        ('car_premsuv', 'Premium SUV'),
        ('car_mv', 'Mini Van'),
        ('car_fv', 'Fullsize Van'),
    )
    for record in resultstemp.values():
        if not record:
            continue
        # missing categories become None via dict.get
        price_columns = {column: record.get(label) for column, label in car_columns}
        session.add(Price(
            agency_id=record['code']+record['brand'],
            pickup=record['start'],
            dropoff=record['end'],
            time=TIME,
            querytime=datetime.datetime.utcnow(),
            **price_columns
        ))
    session.commit()
async def runall(executor,resultstemp,agencybcode,agencyid,spans,startidx):
    """
    Runs getprice for every span on the given executor and stores the results.

    Parameters:
        executor: executor handed to loop.run_in_executor for the blocking calls.
        resultstemp: dict filled in place; key is the index returned by
            getprice, value is the record (or None).
        agencybcode: vendor code forwarded to getprice.
        agencyid: agency code forwarded to getprice.
        spans: sequence of (pickup, dropoff) pairs, one getprice call each.
        startidx: offset added to each span's position to form its index.
    """
    if not spans:
        # asyncio.wait raises ValueError when given no futures
        return
    loop = asyncio.get_event_loop()
    blocking_tasks = [loop.run_in_executor(executor, getprice, session, agencybcode, agencyid, span,
                                           offset+startidx) for offset, span in enumerate(spans)]
    completed, pending = await asyncio.wait(blocking_tasks)
    # Collect everything first so a failing task leaves resultstemp untouched.
    results = [t.result() for t in completed]
    for r in results:
        resultstemp[r[0]] = r[1]
In [ ]:
def getpriceasync_googletest(num):
    """
    Fetches the Costco Travel rental page in a fresh requests session, prints
    "<num>: static test" and returns the response text.
    """
    page = requests.Session().get('https://www.costcotravel.com/h=4005')
    message = '{}: static test'.format(num)
    print(message)
    return page.text
In [ ]:
def populateagenciesdb(span):
    """
    Scrapes the agency list for span and adds every agency that is not yet in
    the Agency table, committing once at the end.

    Parameters:
        span: pair (pickup, dropoff) forwarded to getagencylist.
    """
    (results,_) = getagencylist(span)
    # Set of (ag_id, ag_brand) pairs gives constant-time membership tests;
    # rows are converted to plain tuples so they hash/compare predictably.
    known = {tuple(row) for row in session.query(Agency.ag_id, Agency.ag_brand).all()}
    for r in results.values():
        key = (r['code'],r['brand'])
        if key in known:
            continue
        # Remember the pair so a repeat within this same scrape is not added
        # twice under the same uid.
        known.add(key)
        entry = Agency(uid = r['code']+r['brand'],
                       ag_id = r['code'], ag_type = r['type'],
                       ag_brand = r['brand'], ag_name = r['name'],
                       ag_bcode = r['bcode'], ag_addr = r['addr'],
                       ag_city = r['city'], ag_state = r['state'],
                       ag_zip = r['zip'], ag_dist = r['dist'],
                       ag_lat = r['lat'], ag_lon = r['lon'])
        session.add(entry)
    session.commit()
In [ ]:
def computeTravelTimes(results,origin='1369 East Hyde Park Blvd, Chicago, IL, 60615, USA',batchsize=100):
    """
    Adds travel time and road distance from origin to every agency in results.

    Agency addresses are sent to getGDistance in '|'-joined batches. For each
    element answered with status 'OK', the matching record gets 'time'
    (duration value) and 'roaddist' (distance value); failures are printed.

    Parameters:
        results: dict of agency records with 'addr', 'city', 'state' and
            'zip' string fields; modified in place. Keys are processed in
            sorted order and do not need to be contiguous.
        origin: address used as the single origin of every request.
        batchsize: maximum number of destinations per request.
            NOTE(review): check this against the Distance Matrix usage limits.

    Raises:
        ValueError: batchsize is smaller than 1.
    """
    if batchsize < 1:
        raise ValueError('batchsize must be at least 1')

    # Fixed key order lets a response element be mapped back to its record
    # even when the keys have gaps.
    keys = sorted(results)
    addresses = []
    for key in keys:
        entry = results[key]
        addresses.append(entry['addr']+', '+entry['city']+', '+entry['state']+', '+entry['zip']+', USA')
    print(len(addresses))

    destinations = ['|'.join(addresses[start:start+batchsize])
                    for start in range(0,len(addresses),batchsize)]
    print([dest.count('|')+1 for dest in destinations])

    for i, dest in enumerate(destinations):
        gresponse = getGDistance(origin, dest)
        time.sleep(0.5)
        if gresponse['status'] == 'OK':
            elems = gresponse['rows'][0]['elements']
            print('Got gmatrix response for batch {}|{}'.format(i,len(elems)))
            for j,entry in enumerate(elems):
                # element j of batch i belongs to the (i*batchsize+j)-th address
                idx = keys[i*batchsize+j]
                if (entry['status'] == 'OK'):
                    results[idx]['time'] = entry['duration']['value']
                    results[idx]['roaddist'] = entry['distance']['value']
                else:
                    print('{} ENTRY FAILED'.format(idx))
        else:
            print('Gmatrix response for batch {} FAILED'.format(i))
def getGDistance(origin,destination,mode='transit'):
    """
    Queries the Google Distance Matrix JSON endpoint and returns the decoded
    response, printing it along the way.

    Parameters:
        origin: value sent as the 'origins' query parameter.
        destination: value sent as the 'destinations' query parameter
            (computeTravelTimes passes several addresses joined with '|').
        mode: value sent as the 'mode' query parameter.
    """
    query = dict(
        units='imperial',
        origins=origin,
        destinations=destination,
        key=gapikey,
        mode=mode,
        # hard-coded Unix timestamp, so every query uses the same departure time
        departure_time=1511366822,
        transit_routing_preference='fewer_transfers',
    )
    payload = requests.get('https://maps.googleapis.com/maps/api/distancematrix/json',params=query).json()
    print(payload)
    return payload
In [ ]:
# def runparser(agencyids=None,threads=1):
# assert agencyids is not None
# startidx = 0
# resultsfinal = {}
# for agencyid in agencyids:
# if FIXEDLENGTH:
# # just want a sliding window
# starts = generateDates(PICKUP,VARYSTART)
# ends = generateDates(DROPOFF,VARYSTART)
# assert len(starts) == len(ends)
# spans = list(zip(starts,ends))
# else:
# # creates a cartesian product essentially
# starts = generateDates(PICKUP,VARYSTART)
# ends = generateDates(DROPOFF,VARYEND)
# spans = [(x,y) for x in starts for y in ends]
# print(len(spans))
# resultstemp = dict()
# loop = asyncio.new_event_loop()
# asyncio.set_event_loop(loop)
# # Requests uses blocking io, so doing things in single thread event loop doesn't help much
# # -> have to do executor thingies and run separate threads
# executor = concurrent.futures.ThreadPoolExecutor(max_workers=threads)
# try:
# loop.run_until_complete(runall(executor,resultstemp,agencyid,spans,startidx))
# finally:
# loop.close()
# numresults = len(resultstemp)
# startidx += numresults
# resultsfinal.update(resultstemp)
# return resultsfinal
# async def runall(executor,resultstemp,agencyid,spans,startidx):
# #print('start t2')
# loop = asyncio.get_event_loop()
# #print(pool)
# blocking_tasks = [loop.run_in_executor(executor, getpriceasync, agencyid, spans[i], i+startidx) for i in range(0,len(spans))]
# #print(blocking_tasks)
# completed, pending = await asyncio.wait(blocking_tasks)
# #print(completed,pending)
# results = [t.result() for t in completed]
# for i,r in enumerate(results):
# resultstemp[r[0]] = r[1]
# #print(i,r)
In [ ]:
def generateDates(center,variance):
    """
    Returns the list of dates from center-variance days to center+variance
    days, inclusive, in ascending order (empty when variance is negative).
    """
    return [center+datetime.timedelta(days=offset)
            for offset in range(-variance,variance+1)]