In [ ]:
# Google Distance Matrix API key.
# Preferred source is the GOOGLE_API_KEY environment variable, so the real key does not
# have to live in the notebook. The split/reversed literal below is the legacy obfuscated
# key (a weak attempt to dodge GitHub key scanners); it is kept only as a fallback so that
# existing setups keep working.
# TODO: rotate that key and delete the fallback once the environment variable is in use.
import os

gapikey1 = 'asdasd_AIzaSyA8ZTz_'
gapikey2 = 'dsfsdf_o3VhApo4CoehKa6m7_cFLKKBnXt'
gapikey = os.environ.get('GOOGLE_API_KEY') or (gapikey1+gapikey2[::-1])[7:-7]

# Headers sent with the first GET to costcotravel.com (copied per request by getagencylist)
headers_base = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Referer':'https://www.costcotravel.com',
    'Host':'www.costcotravel.com',
}

# Form fields posted to rentalCarSearch.act; getagencylist copies this dict and adds
# 'pickupDate' / 'dropoffDate' for each query.
# NOTE(review): ZIP, RADIUS and TIME must already exist when this cell runs. They are not
# defined in this cell -- presumably they come from DB_driver.ipynb, which is %run in a
# later cell; confirm the cell order works on a fresh kernel.
data_base = {
    'rcs':'1',
    'driverAge':'25',
    'pickupZip':ZIP,
    'pickupCityLocationTypeSearch':'2',
    'dropoffZip':ZIP,
    'dropoffCityLocationTypeSearch':'2',
    'pickupCountry':'US',
    'dropoffCountry':'US',
    'pickupCityRadius':RADIUS,
    'dropoffCityRadius':RADIUS,
    'pickupAsAirport':'false',
    'dropoffAsAirport':'false',
    'pickupTime':TIME,
    'dropoffTime':TIME,
    'pickupLocationCode':ZIP,
    'pickupLocationName':ZIP+' (CHICAGO, IL, US)',
    'pickupLocationType':'zipCode',
    'pickupLocationCityCode':'CHICAGO',
    'pickupLocationStateCode':'',
    'pickupLatitude':'',
    'pickupLongitude':'',
    'dropoffLocationCode':ZIP,
    'dropoffLocationName':ZIP+' (CHICAGO, IL, US)',
    'dropoffLocationType':'zipCode',
    'dropoffLocationCityCode':'CHICAGO',
    'dropoffLocationStateCode':'',
    'dropoffLatitude':'',
    'dropoffLongitude':'',
    'fromHomePage':'true',
    'fromCarVendorMainMenu':'true',
    'carSearchInModifyFlow':'false',
    'suppressOutput':'false',
}

In [ ]:
%run DB_driver.ipynb

In [3]:
# Make this notebook wiiiiiiiiide
# display/HTML come from the public IPython.display module; importing them from
# IPython.core.display is a deprecated path.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))



In [ ]:
def getagencylist(span,agencyid=None,agencybcode=None):
    """
    Fetch the rental agencies Costco Travel lists for one pickup/dropoff span.

    Opens a fresh requests session, loads the landing page to read the CSRF token
    and session timestamp, posts the search form (data_base plus the two dates) and
    scrapes agency records out of the response text with a regular expression.
    Only records that the pattern matches are returned, i.e. records with
    "isOpen":true, an empty "unAvailableMessage", country "US" and a 5-digit zip.

    Parameters
    ----------
    span : sequence of two date-like objects
        span[0] is the pickup date, span[1] the dropoff date; both must provide
        strftime().
    agencyid : str, optional
        Agency code to keep (compared with the scraped "agencyCode").
    agencybcode : str, optional
        Vendor code to keep (compared with the scraped "vendorCode").
        Filtering happens only when BOTH agencyid and agencybcode are truthy;
        otherwise every scraped agency is returned.

    Returns
    -------
    (dict, requests.Session)
        The dict maps the numeric part of the scraped id ("S12" -> 12) to a record
        with keys num, type, brand, code, name, bcode, addr, city, state, zip, dist,
        lat, lon, start, end (everything except num is a string). The session is
        returned so the caller can continue the same browsing session; the caller
        owns it and should close it.

    Raises
    ------
    RuntimeError
        If the CSRF token or the session timestamp cannot be found on the landing
        page (previously this surfaced as an AttributeError on None).
    """
    # copy headers and set specific dates
    headers = dict(headers_base)
    data = dict(data_base)
    data.update({'pickupDate':span[0].strftime("%m/%d/%Y"),
                 'dropoffDate':span[1].strftime("%m/%d/%Y")})

    # get initial rent page
    s = requests.Session()
    landing = s.get('https://www.costcotravel.com/h=4005',headers=headers)
    # Raw strings: the backslash escapes are meant for the regex engine, not for Python
    csrf_match = re.search(r'\("Csrf-token", "(\S{128})"\)',landing.text)
    ts_match = re.search(r'namespace.sessionTimestamp = (\d{10,20});',landing.text)
    if csrf_match is None or ts_match is None:
        s.close()
        raise RuntimeError('Could not find the CSRF token / session timestamp on the landing page')
    csrf = csrf_match.group(1)
    ts = ts_match.group(1)

    # The search endpoint expects the token both as a header and as a cookie
    s.headers.update({'X-Csrf-Token':csrf})
    s.headers.update({'Referer':'https://www.costcotravel.com/h=4005'})

    s.cookies.set('Csrf-token',csrf)
    if 'SESSION_TIME_OUT_DETECTED' in s.cookies: s.cookies.pop('SESSION_TIME_OUT_DETECTED')
    s.cookies.set('SESSION_TIMESTAMP',str(ts))
    time.sleep(1.0)

    r = s.post('https://www.costcotravel.com/rentalCarSearch.act',
               data=data)
    # Capture groups, in order: id number, type, agency (brand), agencyCode, title,
    # vendorCode, address, city, state, zip, distance, latitude, longitude
    res = re.findall(
        (r'{"id":"S(\d{1,3})","type":"(city|airport)","agency":"(\S{1,40})","agencyCode":"(\S{1,8})",'
         r'"title":"(.{4,50})","isOpen":true,"unAvailableMessage":"","vendorCode":"(\S{2,3})","address"'
         r':"(.{5,110})","city":"(.{3,30})","state":"(\S{2})","zip":"'
         r'(\d{5})","country":"US","distance":"(\S{2,7})","latitude":(\S{2,10}),"longitude":(\S{2,10}),'),
        r.text)

    # The filter is active only when both identifiers were supplied
    filtering = bool(agencyid and agencybcode)
    results = dict()
    for tp in res:
        if filtering and not (tp[3] == agencyid and tp[5] == agencybcode):
            continue
        results[int(tp[0])] = {'num':int(tp[0]),'type':tp[1],'brand':tp[2],'code':tp[3],'name':tp[4],
                               'bcode':tp[5],'addr':tp[6],'city':tp[7],'state':tp[8],'zip':tp[9],'dist':tp[10],
                               'lat':tp[11],'lon':tp[12],'start':data['pickupDate'],'end':data['dropoffDate']}
    return (results,s)

In [ ]:
def getprice(session,agencybcode,agencyid,span,idx=None):
    """
    Obtains a single price set from one particular agency for one date span.

    Looks the agency up with getagencylist() (which also opens the HTTP session),
    posts the agency selection and scrapes (car class, price) pairs from the
    returned HTML.

    Parameters
    ----------
    session : object
        Not used by this function; kept so existing callers (runall passes the
        module-level session) keep working.
    agencybcode : str
        Vendor code of the agency (matched against the scraped "vendorCode").
    agencyid : str
        Agency code (matched against the scraped "agencyCode").
    span : sequence of two date-like objects
        (pickup, dropoff), forwarded to getagencylist().
    idx : int, optional
        Caller-chosen index, echoed back as the first element of the result and
        used as a zero-padded tag in log output. May be None.

    Returns
    -------
    (idx, dict) when the agency was found: the agency record from getagencylist()
    extended with one key per scraped car class (value is a float when the price
    parses as a number, otherwise the raw string). If no prices were scraped the
    record is returned without price keys and a "ND" line is printed.
    (idx, None) when the agency was not in the search results.
    """
    # idx defaults to None, which "{:03d}" cannot format -- build the log tag defensively
    nodata_tag = "{:03d} ND | ".format(idx) if idx is not None else "--- ND | "

    (match,s) = getagencylist(span,agencybcode=agencybcode,agencyid=agencyid)
    try:
        if len(match) == 0:
            print(nodata_tag,agencyid)
            return (idx,None)

        # Several entries can match; the first one is used
        resd = list(match.values())[0]
        time.sleep(1.0)

        s.headers.update({'Referer':'https://www.costcotravel.com/h=3001'})
        data2 = {
            'rcas':'Load_Forword_Navigation_From_Agency_Results',
            'carAgenciesForVendors':'[{{\"vendorId\":\"{}\",\"agencyCodes\":[\"{}\"]}}]'.\
                format(resd['bcode'],resd['code']),
        }
        r2 = s.post('https://www.costcotravel.com/rentalCarAgencySelection.act',
                     data=data2)
        # Groups: (car class heading, data-price attribute). Raw strings keep the
        # regex escapes away from Python's own string-escape handling.
        prices = re.findall(r'<h3>(.{5,40})</h3></div></div></div><div style="height: 94px;" class="col col-lg col-1 '
                            r'col-lg-1 text-center height-item test"><a data-responsive-referrer="carMatrix" '
                            r'data-category-id="\S{20,50}" data-selected="(?:false|true)" data-product-id="'
                            r'\S{20,50}" data-price="(\S{4,30})"',
                            r2.text)

        if len(prices) == 0:
            print(nodata_tag,resd['start'],resd['end'],resd['num'],resd['brand'],
                  resd['code'],resd['name'],resd['bcode'],resd['dist'])
        else:
            for pr in prices:
                try:
                    resd[pr[0]] = float(pr[1])
                except ValueError:
                    # Non-numeric price text is stored as-is
                    resd[pr[0]] = pr[1]
        return (idx,resd)
    finally:
        # The HTTP session was opened by getagencylist() for this call only
        s.close()

In [ ]:
# def getprice(session,agencybcode,agencyid,span,idx=None):
#     """
#     Obtains a single price set from particular location
#     """
# #     # copy headers and set specific dates
# #     headers = dict(headers_base)
# #     data = dict(data_base)
# #     data.update({'pickupDate':span[0].strftime("%m/%d/%Y"),
# #                  'dropoffDate':span[1].strftime("%m/%d/%Y")})

# #     # get initial rent page
# #     s = requests.Session()
# #     r3 = s.get('https://www.costcotravel.com/h=4005',headers=headers)
# #     cks = r3.cookies    
# #     csrf = re.search('\(\"Csrf-token\", \"(\S{128})\"\)',r3.text).group(1)
# #     ts = re.search('namespace.sessionTimestamp = (\d{10,20});',r3.text).group(1)
    
# #     # various settings
# #     headers.update({'X-Csrf-Token':csrf})
# #     headers.update({'Referer':'https://www.costcotravel.com/h=4005'})
    
# #     cks.set('Csrf-token',csrf)
# #     if 'SESSION_TIME_OUT_DETECTED' in cks: cks.pop('SESSION_TIME_OUT_DETECTED')
# #     cks.set('SESSION_TIMESTAMP',str(ts))
# #     time.sleep(0.8)
    
# #     # get agency list
# #     r = s.post('https://www.costcotravel.com/rentalCarSearch.act',
# #                data=data, headers=headers, cookies=cks)    
# #     res = re.findall(
# #         ('{\"id\":\"S(\d{1,3})\",\"type\":\"(city|airport)\",\"agency\":\"(\S{1,40})\",\"agencyCode\":\"(\S{1,8})\",'
# #         '\"title\":\"(.{4,50})\",\"isOpen\":true,\"unAvailableMessage\":\"\",\"vendorCode\":\"(\S{2,3})\",\"address\"'
# #         ':\"(.{5,110})\",\"city\":\"(.{3,30})\",\"state\":\"(\S{2})\",\"zip\":\"'
# #         '(\d{5})\",\"country\":\"US\",\"distance\":\"(\S{2,7})\",\"latitude\":(\S{2,10}),\"longitude\":(\S{2,10}),'),
# #         r.text)
# #     #print(r.text)
# #     #print(res)
# #     match = [v for v in res if (v[3] == agencyid and v[2] == agencybrand)]
    
#     (match,s) = getagencylist(span,agencybcode=agencybcode,agencyid=agencyid)
#     print(match)
#     if len(match)>0: 
# #         tp = match[0] # should match those weird same code ones correctlys
# #         resd = {'num':int(tp[0]),'type':tp[1],'brand':tp[2],'code':tp[3],'name':tp[4],
# #                  'bcode':tp[5],'addr':tp[6],'city':tp[7],'state':tp[8],'zip':tp[9],
# #                  'dist':tp[10],'lat':tp[11],'lon':tp[12],'start':data['pickupDate'],
# #                  'end':data['dropoffDate']}
#         resd = list(match.values())[0]
#         time.sleep(1.0)

#         s.headers.update({'Referer':'https://www.costcotravel.com/h=3001'})
#         k = idx
#         v = resd
#         data2 = {
#             'rcas':'Load_Forword_Navigation_From_Agency_Results',
#             'carAgenciesForVendors':'[{{\"vendorId\":\"{}\",\"agencyCodes\":[\"{}\"]}}]'.\
#                 format(v['bcode'],v['code']),
#         }
#         #print(data2)
# #         r2 = s.post('https://www.costcotravel.com/rentalCarAgencySelection.act',
# #                     data=data2,cookies=cks,headers=headers)
#         r2 = s.post('https://www.costcotravel.com/rentalCarAgencySelection.act',
#                      data=data2)
#         prices = re.findall('<h3>(.{5,40})</h3></div></div></div><div style=\"height: 94px;\" class=\"col col-lg col-1'
#                             'col-lg-1 text-center height-item test\"><a data-responsive-referrer=\"carMatrix\"'
#                             'data-category-id=\"\S{20,50}\" data-selected=\"(?:false|true)\" data-product-id=\"'
#                             '\S{20,50}\"data-price=\"(\S{4,30})\"',
#                             r2.text)        
#         #print(r2.text)
        
#         if len(prices) == 0:
#             print("{:03d} ND | ".format(idx),resd['start'],resd['end'],resd['num'],resd['brand'],
#                   resd['code'],resd['name'],resd['bcode'],resd['dist'])
#         else:
#             #print("{:03d} OK |".format(k),tp[0],tp[2:5],tp[10:])
#             for pr in prices:
#                 try:
#                     v[pr[0]] = float(pr[1])
#                 except:
#                     v[pr[0]] = pr[1]
#         #print(v)
#         return (k,v)
#     else:
#         print("{:03d} ND | ".format(idx),agencyid)
#         return (idx,None)

In [ ]:
def runparser(agencylist,threads=1):
    """
    Query prices for every agency in agencylist over all configured date spans
    and store the collected results in the database.

    The date spans are derived from the module-level settings:
    - FIXEDLENGTH true: pickup and dropoff both shift by VARYSTART days around
      PICKUP / DROPOFF and are paired one-to-one (a sliding window);
    - otherwise: every pickup date (PICKUP +/- VARYSTART) is combined with every
      dropoff date (DROPOFF +/- VARYEND).

    Parameters
    ----------
    agencylist : iterable of indexable pairs
        For each item, item[0] is passed to runall() as the vendor code
        (agencybcode) and item[1] as the agency code (agencyid). Must not be None.
    threads : int
        Number of worker threads used for the blocking HTTP requests of one agency.

    Returns
    -------
    dict
        Results of all agencies merged into one dict, keyed by the running index
        handed to getprice(); the same dict is passed to placeintodb().
    """
    assert agencylist is not None

    # The spans depend only on module-level settings, so build them once
    if FIXEDLENGTH:
        # just want a sliding window
        starts = generateDates(PICKUP,VARYSTART)
        ends = generateDates(DROPOFF,VARYSTART)
        assert len(starts) == len(ends)
        spans = list(zip(starts,ends))
    else:
        # creates a cartesian product essentially
        starts = generateDates(PICKUP,VARYSTART)
        ends = generateDates(DROPOFF,VARYEND)
        spans = [(x,y) for x in starts for y in ends]

    startidx = 0
    resultsfinal = {}
    for agencyid in agencylist:
        resultstemp = dict()
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        # Requests uses blocking io, so doing things in single thread event loop doesn't help much
        # -> run the requests in executor threads. The "with" block shuts the pool down
        # (and joins its threads) once this agency is done, instead of leaking one pool per agency.
        try:
            with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
                loop.run_until_complete(runall(executor,resultstemp,agencyid[0],agencyid[1],spans,startidx))
        finally:
            loop.close()

        # Keep result indices unique across agencies
        startidx += len(resultstemp)
        resultsfinal.update(resultstemp)

    # You can't really do threads+sqlalchemy without careful connection management
    # So here results are first assembled into dictionary, then batch committed
    placeintodb(resultsfinal)
    return resultsfinal

def placeintodb(resultstemp):
    """
    Store scraped price records as Price rows and commit them in one batch.

    Each truthy value of resultstemp becomes one Price added to the module-level
    session; falsy values (e.g. None for agencies without a result) are skipped.
    The agency id is the record's 'code' concatenated with its 'brand'. Car
    classes missing from a record are stored as None. A single commit is issued
    at the end, even when nothing was added.
    """
    # Price column name -> car class label used as key in the scraped record
    car_columns = (
        ('car_econ', 'Economy Car'),
        ('car_comp', 'Compact Car'),
        ('car_int', 'Intermediate Car'),
        ('car_std', 'Standard Car'),
        ('car_full', 'Fullsize Car'),
        ('car_prem', 'Premium Car'),
        ('car_lux', 'Luxury Car'),
        ('car_intsuv', 'Intermediate SUV'),
        ('car_stdsuv', 'Standard SUV'),
        ('car_fullsuv', 'Fullsize SUV'),
        ('car_premsuv', 'Premium SUV'),
        ('car_mv', 'Mini Van'),
        ('car_fv', 'Fullsize Van'),
    )
    for record in resultstemp.values():
        if not record:
            continue
        car_prices = {column: record.get(label) for column, label in car_columns}
        session.add(Price(
            agency_id=record['code']+record['brand'],
            pickup=record['start'],
            dropoff=record['end'],
            time=TIME,
            querytime=datetime.datetime.utcnow(),
            **car_prices
        ))
    session.commit()

async def runall(executor,resultstemp,agencybcode,agencyid,spans,startidx):
    """
    Run getprice() for every span on the given executor and collect the results.

    Parameters
    ----------
    executor : concurrent.futures.Executor
        Pool the blocking getprice() calls are dispatched to.
    resultstemp : dict
        Filled in place: each (idx, value) pair returned by getprice() is stored
        as resultstemp[idx] = value.
    agencybcode, agencyid : str
        Forwarded unchanged to getprice().
    spans : sequence
        One getprice() call is made per element.
    startidx : int
        Offset added to the position of each span to form its idx.

    Nothing is returned. If any getprice() call raised, the exception is re-raised
    here and resultstemp is left untouched.
    """
    if len(spans) == 0:
        # asyncio.wait() raises ValueError when given no futures
        return

    loop = asyncio.get_event_loop()
    # The module-level session is passed through only because getprice() takes it
    blocking_tasks = [loop.run_in_executor(executor, getprice, session, agencybcode, agencyid, span,
                                           offset+startidx)
                      for offset, span in enumerate(spans)]
    completed, _ = await asyncio.wait(blocking_tasks)
    # Collect everything first so that a failed task leaves resultstemp unchanged
    results = [task.result() for task in completed]
    for key, value in results:
        resultstemp[key] = value

In [ ]:
def getpriceasync_googletest(num):
    """
    Connectivity smoke test: fetch the Costco Travel landing page once.

    Parameters
    ----------
    num : object
        Label printed in the progress line (formatted with str.format).

    Returns
    -------
    str
        Body text of the response.
    """
    # The context manager closes the session (and its pooled connections) on exit
    with requests.Session() as s:
        r = s.get('https://www.costcotravel.com/h=4005')
        print('{}: static test'.format(num))
        return (r.text)

In [ ]:
def populateagenciesdb(span):
    """
    Add agencies found for the given span to the Agency table.

    Runs an unfiltered getagencylist(span) and inserts every agency whose
    (agency code, brand) pair is not yet stored, then commits once.

    Parameters
    ----------
    span : sequence of two date-like objects
        (pickup, dropoff), forwarded to getagencylist().
    """
    (results,http) = getagencylist(span)
    # Only the scraped records are needed here; release the HTTP session right away
    http.close()

    # Set of (ag_id, ag_brand) pairs for constant-time membership tests
    known = {tuple(row) for row in session.query(Agency.ag_id, Agency.ag_brand).all()}
    for r in results.values():
        pair = (r['code'],r['brand'])
        if pair in known:
            continue
        # Remember the pair so a repeat within this batch is not inserted twice
        # (uid is built from the same two fields)
        known.add(pair)
        entry = Agency(uid = r['code']+r['brand'],
            ag_id = r['code'], ag_type = r['type'],
            ag_brand = r['brand'], ag_name = r['name'],
            ag_bcode = r['bcode'], ag_addr = r['addr'],
            ag_city = r['city'], ag_state = r['state'],
            ag_zip = r['zip'], ag_dist = r['dist'],
            ag_lat = r['lat'], ag_lon = r['lon'])
        session.add(entry)
    session.commit()

In [ ]:
def computeTravelTimes(results,origin='1369 East Hyde Park Blvd, Chicago, IL, 60615, USA',batchsize=100):
    """
    Annotate agency records with travel time and road distance from an origin.

    The agencies' addresses are sent to getGDistance() in batches joined by '|'.
    For every element reported as 'OK', the matching record gets
    'time' (duration value) and 'roaddist' (distance value). The results dict is
    modified in place; nothing is returned.

    Parameters
    ----------
    results : dict
        Maps a sortable key (the agency number from getagencylist) to a record
        with 'addr', 'city', 'state' and 'zip' strings. Keys no longer have to be
        the contiguous range 1..N: getagencylist() keys records by the scraped
        id and drops agencies its pattern does not match, so gaps can occur.
    origin : str
        Start address of every trip (defaults to the previously hard-coded one).
    batchsize : int
        Maximum number of destinations per request; must be positive.
        NOTE(review): the default of 100 is the original value -- check it against
        the per-request limits of the Distance Matrix API.
    """
    # Fixed key order so that element j of a batch can be mapped back to its record
    keys = sorted(results)
    addresses = []
    for key in keys:
        entry = results[key]
        addresses.append(entry['addr']+', '+entry['city']+', '+entry['state']+', '+entry['zip']+', USA')
    print(len(addresses))

    starts = range(0,len(keys),batchsize)
    key_batches = [keys[i:i+batchsize] for i in starts]
    destinations = ['|'.join(addresses[i:i+batchsize]) for i in starts]
    print([dest.count('|')+1 for dest in destinations])

    for i, (batch_keys, dest) in enumerate(zip(key_batches, destinations)):
        gresponse = getGDistance(origin, dest)
        time.sleep(0.5)
        if gresponse['status'] == 'OK':
            # One origin was sent, so the single row holds one element per destination
            elems = gresponse['rows'][0]['elements']
            print('Got gmatrix response for batch {}|{}'.format(i,len(elems)))
            for key, entry in zip(batch_keys, elems):
                if (entry['status'] == 'OK'):
                    results[key]['time'] = entry['duration']['value']
                    results[key]['roaddist'] = entry['distance']['value']
                else:
                    print('{} ENTRY FAILED'.format(key))
        else:
            print('Gmatrix response for batch {} FAILED'.format(i))
        
def getGDistance(origin,destination,mode='transit'):
    """
    Query the Google Distance Matrix JSON endpoint and return the decoded reply.

    origin and destination are passed through as the 'origins' / 'destinations'
    query parameters (callers join several destinations with '|'); mode is the
    travel mode. The module-level gapikey is sent as the API key. The decoded
    response is printed and then returned.
    """
    query = dict(
        units='imperial',
        origins=origin,
        destinations=destination,
        key=gapikey,
        mode=mode,
        # fixed Unix timestamp, so every query uses the same departure time
        departure_time=1511366822,
        transit_routing_preference='fewer_transfers',
    )
    payload = requests.get('https://maps.googleapis.com/maps/api/distancematrix/json',params=query).json()
    print(payload)
    return payload

In [ ]:
# def runparser(agencyids=None,threads=1):
#     assert agencyids is not None
#     startidx = 0
#     resultsfinal = {}
#     for agencyid in agencyids: 
#         if FIXEDLENGTH:
#             # just want a sliding window
#             starts = generateDates(PICKUP,VARYSTART)
#             ends = generateDates(DROPOFF,VARYSTART)
#             assert len(starts) == len(ends)
#             spans = list(zip(starts,ends))
#         else:
#             # creates a cartesian product essentially
#             starts = generateDates(PICKUP,VARYSTART)
#             ends = generateDates(DROPOFF,VARYEND)
#             spans = [(x,y) for x in starts for y in ends]    
#         print(len(spans))

#         resultstemp = dict()
#         loop = asyncio.new_event_loop()
#         asyncio.set_event_loop(loop) 

#         # Requests uses blocking io, so doing things in single thread event loop doesn't help much
#         # -> have to do executor thingies and run separate threads
#         executor = concurrent.futures.ThreadPoolExecutor(max_workers=threads)
#         try:
#             loop.run_until_complete(runall(executor,resultstemp,agencyid,spans,startidx))
#         finally:
#             loop.close() 
#         numresults = len(resultstemp)
#         startidx += numresults
#         resultsfinal.update(resultstemp)
#     return resultsfinal

# async def runall(executor,resultstemp,agencyid,spans,startidx):
#     #print('start t2')
#     loop = asyncio.get_event_loop()
#     #print(pool)
#     blocking_tasks = [loop.run_in_executor(executor, getpriceasync, agencyid, spans[i], i+startidx) for i in range(0,len(spans))]
#     #print(blocking_tasks)
#     completed, pending = await asyncio.wait(blocking_tasks)
#     #print(completed,pending)
#     results = [t.result() for t in completed]
#     for i,r in enumerate(results):
#         resultstemp[r[0]] = r[1]
#         #print(i,r)

In [ ]:
def generateDates(center,variance):
    """
    Return the dates from center - variance days to center + variance days,
    inclusive, in ascending order (2*variance + 1 entries).

    center must support adding a datetime.timedelta; a negative variance yields
    an empty list.
    """
    return [center+datetime.timedelta(days=offset) for offset in range(-variance,variance+1)]