In [30]:
import pandas as pd
import time
import numpy as np
import requests
import psycopg2
import json
import simplejson
import urllib
import config
import ast
import bs4
import pprint
import progressbar
from pymongo import MongoClient
from geopy.geocoders import Nominatim
from bs4 import BeautifulSoup as BS
from operator import itemgetter
from sklearn.cluster import KMeans
from sqlalchemy import create_engine
In [ ]:
!pip install --upgrade progressbar2
In [17]:
# conn_str = "dbname='travel_with_friends' user='Gon' host='localhost'"
conn_str = "dbname='travel_with_friends' user='zoesh' host='localhost'"
In [ ]:
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
# cur.execute("select index, name, coord0, coord1 from poi_detail_table where city !='%s' and state = '%s';" %(current_city, current_state))
cur.execute("select distinct city, state from poi_detail_table;" )
all_cities = cur.fetchall()
In [ ]:
all_cities[0][0]
In [ ]:
cities_coords = pd.read_csv('cities_coords.csv', header=None)
cities_coords.columns = ['area_code', 'city','state','nation', 'coord0','coord1']
cities_coords = cities_coords[['city','state','nation', 'coord0','coord1']].drop_duplicates()
cities_coords.reset_index(drop = True, inplace = True)
In [ ]:
geolocator = Nominatim()
for items in all_cities:
    if cities_coords[(cities_coords['state'] == items[1]) & (cities_coords['city'] == items[0])].shape[0] == 0:
        location_name = ', '.join([items[0], items[1]])
        try:
            location = geolocator.geocode(location_name)
            cities_coords.loc[len(cities_coords)] = [items[0], items[1], 'US', location.latitude, location.longitude]
        except:
            print "geocode error, resting"
            time.sleep(20)
            print "start again"
# print cities_coords.loc[len(cities_coords)]
In [ ]:
cities_coords.to_csv('all_cities_coords.csv')
In [ ]:
engine = create_engine('postgresql://Gon@localhost:5432/travel_with_friends')
cities_coords.to_sql('all_cities_coords',engine, if_exists = "replace")
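A quick way to confirm the write is to read the table back through the same engine (a sanity-check sketch; the table name is the one used above):
In [ ]:
# read the freshly written table back from Postgres
pd.read_sql_table('all_cities_coords', engine).head()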
In [ ]:
import us_state_abbrevation as abb
state_abb_dict = abb.abb2state
state_abb_dict['CA']
# print state_abb_dict.keys()[state_abb_dict.values().index('CA')]
In [ ]:
from googleplaces import GooglePlaces, types, lang
YOUR_API_KEY = 'AIzaSyDMbpmHBLl7dTOXUOMZP7Vi3zbMJlByEKM'
google_places = GooglePlaces(YOUR_API_KEY)
In [ ]:
import geocoder
api_key1 = 'AIzaSyCrgwS_L75NfO9qzIKG8L0ox7zGw81BpRU'
api_key2 = 'AIzaSyBwh4WqOIVJGJuKkmzpQxlkjahgx6qzimk'
api_key3 = 'AIzaSyA25LW2CRcD9mSmiAWBYSPOSoiKP_m2plQ'
api_key4 = 'AIzaSyB3l2Trzm4LnrC0nyUwwoM9803Fuwf0my4'
api_key5 = 'AIzaSyDj0yH_35G1zMq5uYPF6X0ogkHYcLsNN1w'
add = ' 497 lakeside drive'
g = geocoder.google(add, key = api_key5)
In [ ]:
g.ok
In [5]:
import json
with open('api_key_list.config') as api_key_list_file:
api_key_list = json.load(api_key_list_file)
api_key_list['api_key_list']
In [27]:
api_key_list
In [ ]:
df_tmp = pd.read_csv('test_poi_detail_df_100.csv', index_col = 0)
# df_tmp.to_csv('test_poi_detail_df_100.csv', index_col=None)
df_tmp.head()
# s.find(text="Recommended length of visit:")  # exploratory; 's' is parsed from a page in a later cell
# visit_length = s.find(text ="Recommended length of visit:").parent.next_sibling
In [ ]:
client = MongoClient()
db = client.zoeshrm
db.TripAdvisor_state_park.count()
In [ ]:
from web_scraping_tripadvisor import state_park_web as web
state_park_pages = db.TripAdvisor_state_park.find()
poi_detail_state_park_df, error_message_df = web(state_park_pages)
In [ ]:
import re
page = db.TripAdvisor.find_one({'city': 'San Francisco, California'})
search_visit_length = re.compile('Recommended length of visit:')
s = BS(page['html'], "html.parser")
#index
#name
input_list, error_message = [],[]
state_abb_error, state_error, address_error, geo_error, review_error, score_error, ranking_error, tag_error = 0,0,0,0,0,0,0,0
latitude, longitude, geo_content = None, None, None
# print name
url = page['url']
name = s.find('h1', attrs = {'class':'heading_name'}).text.strip()
#street_address
street_address = s.find('span', attrs = {'class':'street-address'}).text.strip()
#city
city = s.find('span', attrs = {'property':'addressLocality'}).text.strip()
#state
state_abb = s.find('span', attrs = {'property':'addressRegion'}).text.strip()
if state_abb:
    try:
        # state = state_abb_dict.keys()[state_abb_dict.values().index(state_abb)]
        state = state_abb_dict[state_abb]
    except:
        state_abb_error = 1
        state = state_abb
else:
    state_error = 1
    state_abb = None
    state = None
#postal_code
postal_code = s.find('span', attrs = {'property':'postalCode'}).text.strip()
#country
if s.find('span', attrs = {'property':'addressCountry'}).get('content'):
country = s.find('span',{'property':'addressCountry'}).get('content')
elif s.find('span',{'property':'addressCountry'}).get('content') == None:
country = s.find('span',{'property':'addressCountry'}).text.strip()
else:
country = 'United States'
#address
if state:
full_address = street_address+', '+city+', '+state+', '+postal_code[:5]+', '+country
else:
address_error =1
full_address = street_address+', '+city+', '+postal_code[:5]+', '+country
# if (name in name_lst) and (full_address in full_address_lst):
# continue
# else:
# name_lst.append(name)
# full_address_lst.append(full_address)
#coord
try:
latitude, longitude, geo_content = find_latlng(full_address, name)
except:
geo_error =1
latitude, longitude, geo_content = None, None, None
#num_reviews
try:
num_reviews = s.find('div', attrs = {'class': 'rs rating'}).find('a').get('content')
if num_reviews == None:
num_reviews = s.find('a', {'property': "reviewCount"}).get('content')
except:
num_reviews = 0
review_error=1
#review_score
try:
review_score = s.find('div', attrs = {'class': 'heading_rating separator'}).find('img').get('content')
if review_score == None:
review_score = s.find('a', {'property': "ratingValue"}).get('content')
except:
review_score = 0
score_error =1
#ranking
try:
ranking = s.find('b', attrs = {'class':'rank_text wrap'}).text.strip().replace('#',"")
except:
ranking = 999
ranking_error=1
#tag
try:
tags = ", ".join(label.text.strip() for label in s.select('div.detail > a') + s.select('span.collapse.hidden > a'))
except:
tags = None
tag_error =1
#visit_length
if s.find('b', text =search_visit_length):
raw_visit_length = s.find('b', text =search_visit_length).next_sibling.strip()
else:
raw_visit_length = None
#fee
if s.find(text= "Fee:"):
fee = s.find(text= "Fee:").parent.next_sibling.upper()
else:
fee = 'NO'
#description
if s.find('div', attrs = {'class': "listing_details"}):
description = s.find('div', attrs = {'class': "listing_details"}).text.strip()
else:
description = None
# error_message = [len(poi_detail_state_park_df), name, url,state_abb_error, state_error, address_error, geo_error, review_error, score_error, ranking_error, tag_error]
# error_message_df.loc[len(poi_detail_state_park_df)] =error_message
# input_list = [len(poi_detail_state_park_df), name, street_address, city, state_abb, state, postal_code, country, full_address, latitude, longitude, num_reviews, review_score, ranking, tags, visit_length, fee, description, url, geo_content]
# poi_detail_state_park_df.loc[len(poi_detail_state_park_df)] = input_list
In [ ]:
import re
search_visit_length = re.compile('Recommended length of visit:')
test = s.find('b', text=search_visit_length).next_sibling.strip()
test
In [28]:
import json
with open('api_key_list.config') as f:
data = json.load(f)
In [29]:
data['api_key_list']
In [35]:
df_poi = pd.read_csv('poi_detail_df.csv', index_col = 0)
In [263]:
search_fee = re.compile('Fee:')
df_poi = pd.read_csv('test_poi_detail_df.csv', index_col= 0)
poi_pages = db.TripAdvisor.find()
fee_lst = []
cnt = 0
for page in poi_pages:
    s = BS(page['html'], "html.parser")
    if s.find('b', text=search_fee):
        fee = s.find('b', text=search_fee).next_sibling.strip()
    else:
        fee = 'Unknown'
    fee_lst.append(fee)
    cnt += 1
    if cnt % 100 == 0:
        print '#items in fee_lst: ', len(fee_lst)
In [ ]:
fee_lst
In [ ]:
error_message_df.to_csv('error_message.csv', encoding=('utf-8'))
poi_detail_state_park_df.to_csv("poi_detail_state_park.csv", encoding=('utf-8'))
In [ ]:
try:
    poi_additional_detail = poi_detail_state_park[['index','name','url','address','geo_content']]
    geo_content_detail = poi_detail_state_park.pop('geo_content')
except:
    pass
In [ ]:
db.geo_content.drop()
db.geo_content.insert_many(poi_additional_detail.to_dict('records'))
poi_detail_state_park.to_sql('poi_detail_state_park_table',engine, if_exists = "replace")
In [ ]:
print poi_detail_state_park_df.shape, error_message_df.shape
In [ ]:
error_message_df.columns
In [ ]:
# !pip install geocoder
In [64]:
def find_latlng(full_address, name):
    # try the full address first, then fall back to the place name
    g_address = geocoder.google(full_address)
    if g_address.ok:
        latitude = g_address.lat
        longitude = g_address.lng
        return latitude, longitude, g_address.content
    g_name = geocoder.google(name)
    if g_name.ok:
        latitude = g_name.lat
        longitude = g_name.lng
        return latitude, longitude, g_name.content
    else:
        return None, None, None

# keyed version supersedes the one above; returns False when the key hits its quota
def find_latlng(full_address, name, key):
    g_address = geocoder.google(full_address, key=key)
    if g_address.content['status'] == 'OVER_QUERY_LIMIT':
        return False
    if g_address.ok:
        return [g_address.lat, g_address.lng, g_address.content]
    g_name = geocoder.google(name, key=key)
    if g_name.content['status'] == 'OVER_QUERY_LIMIT':
        return False
    if g_name.ok:
        return [g_name.lat, g_name.lng, g_name.content]
    else:
        return [None, None, None]

with open('api_key_list.config') as api_key_list_file:
    api_key_list = json.load(api_key_list_file)
api_key = api_key_list['api_key_list']
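A minimal sketch of how the keyed find_latlng above can be combined with the api_key list to rotate past quota errors; the rotation helper and the sample call are illustrative, not part of the original pipeline:
In [ ]:
# illustrative: try each key in turn when find_latlng returns False (OVER_QUERY_LIMIT)
def find_latlng_with_rotation(full_address, name, keys):
    for key in keys:
        result = find_latlng(full_address, name, key)
        if result is not False:
            return result
    return [None, None, None]
# hypothetical call:
# find_latlng_with_rotation('1 Dr Carlton B Goodlett Pl, San Francisco, CA', 'City Hall', api_key)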
In [ ]:
def find_geo_location(full_address, name):
    query_result = google_places.nearby_search(location=full_address, keyword=name)
    if len(query_result.places) > 0:
        best_result = query_result.places[0]
        latitude = best_result.geo_location["lat"]
        longitude = best_result.geo_location["lng"]
        google_result_name = best_result.name
        return latitude, longitude, google_result_name
    else:
        print name, "not found via the Google Places API."
        return None, None, None
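A quick sanity check of find_geo_location; the address is the sample reused later in this notebook, and the keyword here is a stand-in:
In [ ]:
# example call (keyword is a placeholder; results depend on the Places API)
lat, lng, matched_name = find_geo_location("393 County Road 174, Grove Hill, AL, 35975, United States", "state park")
print lat, lng, matched_name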
In [ ]:
poi_detail_state_park=pd.DataFrame(columns=['index','name','street_address','city','state_abb','state','postal_code','country','address','coord_lat','coord_long','num_reviews','review_score','ranking','tag','visit_length','fee','description','url',"geo_content"])
In [ ]:
error_message_df = pd.DataFrame(columns=['index','name','url','state_abb_error','address_error','geo_error','review_error','score_error','ranking_error','tag_error'])
In [ ]:
# poi_detail_state_park2=pd.DataFrame(columns=['index','name','street_address','city','state_abb','state','postal_code','country','address','coord_lat','coord_long','num_reviews','review_score','ranking','tag','visit_length','fee','description'])
In [ ]:
state_park_pages = db.TripAdvisor_state_park.find()
index = 0
for page in state_park_pages[len(poi_detail_state_park):]:
    s = BS(page['html'], "html.parser")
    error_message = []
    state_abb_error, address_error, geo_error, review_error, score_error, ranking_error, tag_error = 0,0,0,0,0,0,0
    input_list = []
    url = page['url']
    #name
    name = s.find('h1', attrs={'class':'heading_name'}).text.strip()
    #street_address
    street_address = s.find('span', attrs={'class':'street-address'}).text.strip()
    #city
    city = s.find('span', attrs={'property':'addressLocality'}).text.strip()
    #state
    state_abb = s.find('span', attrs={'property':'addressRegion'}).text.strip()
    if state_abb:
        try:
            state = state_abb_dict[state_abb]
        except:
            state_abb_error = 1
            state = state_abb
    else:
        state_abb = None
        state = None
    #postal_code
    postal_code = s.find('span', attrs={'property':'postalCode'}).text.strip()
    #country
    if s.find('span', attrs={'property':'addressCountry'}).get('content'):
        country = s.find('span', {'property':'addressCountry'}).get('content')
    elif s.find('span', {'property':'addressCountry'}).get('content') == None:
        country = s.find('span', {'property':'addressCountry'}).text.strip()
    else:
        country = 'United States'
    #address
    if state_abb:
        full_address = street_address+', '+city+', '+state_abb+', '+postal_code[:5]+', '+country
    else:
        address_error = 1
        full_address = street_address+', '+city+', '+postal_code[:5]+', '+country
    #coord
    try:
        latitude, longitude, geo_content = find_latlng(full_address, name)
    except:
        geo_error = 1
        latitude, longitude, geo_content = None, None, None
        # break
    #num_reviews
    try:
        num_reviews = s.find('div', attrs={'class': 'rs rating'}).find('a').get('content')
        if num_reviews == None:
            num_reviews = s.find('a', {'property': "reviewCount"}).get('content')
    except:
        num_reviews = 0
        review_error = 1
    #review_score
    try:
        review_score = s.find('div', attrs={'class': 'heading_rating separator'}).find('img').get('content')
        if review_score == None:
            review_score = s.find('a', {'property': "ratingValue"}).get('content')
    except:
        review_score = 0
        score_error = 1
    #ranking
    try:
        ranking = s.find('b', attrs={'class':'rank_text wrap'}).text.strip().replace('#', "")
    except:
        ranking = 999
        ranking_error = 1
    #tag
    try:
        tags = ", ".join(label.text.strip() for label in s.select('div.detail > a') + s.select('span.collapse.hidden > a'))
    except:
        tags = None
        tag_error = 1
    #visit_length
    if s.find(text="Recommended length of visit:"):
        visit_length = s.find(text="Recommended length of visit:").parent.next_sibling
    else:
        visit_length = None
    #fee
    if s.find(text="Fee:"):
        fee = s.find(text="Fee:").parent.next_sibling.upper()
    else:
        fee = 'NO'
    #description
    if s.find('div', attrs={'class': "listing_details"}):
        description = s.find('div', attrs={'class': "listing_details"}).text.strip()
    else:
        description = None
    input_list = [index, name, street_address, city, state_abb, state, postal_code, country, full_address, latitude, longitude, num_reviews, review_score, ranking, tags, visit_length, fee, description, url, geo_content]
    poi_detail_state_park.loc[len(poi_detail_state_park)] = input_list
    error_message = [index, name, url, state_abb_error, address_error, geo_error, review_error, score_error, ranking_error, tag_error]
    error_message_df.loc[len(error_message_df)] = error_message
    index += 1
    # time.sleep(1)
In [ ]:
poi_detail_state_park.shape
In [ ]:
url_df.shape
In [ ]:
import web_scraping_tripadvisor as web
In [ ]:
error_message_df.to_csv('error_message.csv', encoding=('utf-8'))
poi_detail_state_park.to_csv("poi_detail_state_park.csv", encoding=('utf-8'))
In [ ]:
try:
    poi_additional_detail = poi_detail_state_park[['index','name','url','address','geo_content']]
    geo_content_detail = poi_detail_state_park.pop('geo_content')
except:
    pass
In [ ]:
db.geo_content.insert_many(poi_additional_detail.to_dict('records'))
poi_detail_state_park.to_sql('poi_detail_state_park_table',engine, if_exists = "replace")
In [ ]:
# poi_detail_state_park[poi_detail_state_park['name']== 'Jessie M. Honeyman Memorial State Park']
In [ ]:
# poi_detail_state_park.loc[2065]
In [ ]:
# poi_detail_state_park.drop(poi_detail_state_park.index[2065:], inplace = True)
In [ ]:
poi_detail_state_park.to_csv("poi_detail_state_park.csv", encoding=('utf-8'))
In [ ]:
poi_detail_state_park = pd.read_csv('poi_detail_state_park.csv')
In [78]:
# poi_detail_df = pd.read_csv('poi_detail_df_coords_apr_24.csv', index_col = 0)
# np.isnan(poi_detail_df.coord_lat[0])
i = 0
for index in poi_detail_df[np.isnan(poi_detail_df.coord_long)].index:
    full_address = poi_detail_df.loc[index].address
    name = poi_detail_df.loc[index]['name']
    result_longlat = find_latlng(full_address, name, api_key[i])
    while result_longlat is False and i < len(api_key) - 1:
        # current key is over quota: rotate to the next one and retry
        i += 1
        result_longlat = find_latlng(full_address, name, api_key[i])
    if result_longlat is False:
        print 'all API keys over quota; stopping at index', index
        break
    [latitude, longitude, geo_content] = result_longlat
    poi_detail_df.set_value(index, 'coord_long', longitude)
    poi_detail_df.set_value(index, 'coord_lat', latitude)
    poi_detail_df.set_value(index, 'geo_content', geo_content)
    # print result_longlat
In [1]:
poi_detail_df.coord_long[5182]
In [86]:
new_poi_df = poi_detail_df[~np.isnan(poi_detail_df.coord_long)]
In [88]:
new_poi_df.drop('index',axis =1 , inplace=True)
In [94]:
new_poi_df.reset_index(inplace=True)
In [96]:
new_poi_df.drop('index', axis = 1, inplace=True)
In [98]:
new_poi_df.to_csv('poi_detail_v2.csv', index_label=None)
In [ ]:
pd.read_csv('poi_detail_v2.csv')
In [ ]:
update_idx = poi_detail_state_park[poi_detail_state_park.coord_long == incorrect_long].index.values
for index in update_idx:
    full_address = poi_detail_state_park.loc[index].address
    name = poi_detail_state_park.loc[index]['name']
    try:
        print 'start index: ', index
        latitude, longitude, geo_content = find_latlng(full_address, name)
        poi_detail_state_park.set_value(index, 'coord_long', longitude)
        poi_detail_state_park.set_value(index, 'coord_lat', latitude)
        poi_detail_state_park.set_value(index, 'geo_content', geo_content)
        print poi_detail_state_park.loc[index][['coord_long','coord_lat','geo_content']]
    except:
        print 'geocoding failed at index', index
        break
In [ ]:
poi_detail_state_park.to_csv('poi_detail_state_park_v2.csv', index=False)
In [ ]:
poi_additional_detail = poi_detail_state_park[['index','name','url','address','geo_content']]
In [ ]:
geo_content_detail=poi_detail_state_park.pop('geo_content')
In [ ]:
poi_detail_state_park['geo_content'] = geo_content_detail
In [ ]:
db.geo_content.insert_many(poi_additional_detail.to_dict('records'))
In [ ]:
poi_detail_state_park.to_sql('poi_detail_state_park_table',engine, if_exists = "replace")
In [ ]:
htmlurl = 'https://www.tripadvisor.com/Attraction_Review-g35805-d1134861-Reviews-Cloud_Gate-Chicago_Illinois.html'
htmlurl = 'https://www.tripadvisor.com/Attraction_Review-g60713-d127854-Reviews-San_Francisco_Zoo-San_Francisco_California.html'
htmlurl = 'https://www.tripadvisor.com/Attraction_Review-g60750-d104122-Reviews-San_Diego_Zoo-San_Diego_California.html'
htmlurl = 'https://www.tripadvisor.com/Attraction_Review-g60713-d102523-Reviews-Alcatraz_Island-San_Francisco_California.html'
# htmlurl = 'https://www.tripadvisor.com/Attraction_Review-g32474-d4236729-Reviews-Harmony_Headlands_State_Park-Harmony_San_Luis_Obispo_County_California.html'
# htmlurl = 'https://www.tripadvisor.com/Attraction_Review-g42926-d142814-Reviews-Cannon_Valley_Trail-Cannon_Falls_Minnesota.html'
# htmlurl = 'https://www.tripadvisor.com/Attraction_Review-g42891-d126627-Reviews-Paul_Bunyan_State_Trail-Brainerd_Minnesota.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
r=requests.get(htmlurl,headers=headers)
s = BS(r.text, 'html.parser')
# for div in s.find('div', attrs = {'class' : "separator" }):
# for tag in div.:
# if tag.name == 'div' and tag.get('class', '') == ['detail']:
# print tag.text
# for item in div.contents:
# # print item
# if type(item)== 'bs4.element.Tag' and item.name == "detail":
# print 1234567890
st = time.time()
for div in s.findAll("div", {"class": "separator"}):
    for tag in div.contents:
        if isinstance(tag, bs4.element.Tag) and tag.get('class', "") == ['detail']:
            tags = tag.text.encode('utf8').strip()
print time.time() - st
tags
In [ ]:
# s.find('span',{'property':'addressCountry'}).get('content')
# s.select('span[property="addressCountry"]').get('content')
In [ ]:
#index
#name
name = s.find('h1', attrs = {'class':'heading_name'}).text.strip()
#city
city = s.find('span', attrs = {'property':'addressLocality'}).text.strip()
street_address = s.find('span', attrs = {'class':'street-address'}).text.strip()
#state
state_abb = s.find('span', attrs = {'property':'addressRegion'}).text.strip()
# state = state_abb_dict.keys()[state_abb_dict.values().index(state_abb)]
postal_code = s.find('span', attrs = {'property':'postalCode'}).text.strip()
#country
country = s.find('span',{'property':'addressCountry'}).get('content')
#address
full_address = street_address+', '+city+', '+state_abb+', '+postal_code+', '+country
# from geopy.geocoders import Nominatim
# geolocator = Nominatim()
# location =geolocator.geocode(street_address+', '+city+', '+state_abb+', '+country)
# #coord_lat
# coord_lat = location.latitude
# #coord_long
# coord_long =location.longitude
#num_reviews
# num_reviews = s.find('div', attrs = {'class': 'rs rating'}).find('a').get('content')
#review_score
# review_score = s.find('div', attrs = {'class': 'heading_rating separator'}).find('img').get('content')
#ranking
ranking = s.find('b', attrs = {'class':'rank_text wrap'}).text.strip().replace('#',"")
#tag
tags = ", ".join(label.text for label in s.select('div.detail > a') + s.select('span[class="collapse hidden"] > a'))
#visit_length
# visit_length = s.find(text ="Recommended length of visit:").parent.next_sibling
# #fee
# fee = s.find(text= "Fee:").parent.next_sibling
#description
description = s.find('div', attrs = {'class': "listing_details"}).text.strip()
In [ ]:
st =time.time()
d =", ".join(label.text.strip() for label in s.select('div.listing_details'))
# print d
ed = time.time() -st
print ed
In [ ]:
st =time.time()
s.find('div', attrs = {'class': "listing_details"}).text.strip()
ed = time.time() -st
print ed
In [ ]:
# s.select('span.hidden.collapse > a')
postal_code = s.find('span', attrs = {'property':'postalCode'}).text.strip()
print postal_code[:5]
In [ ]:
# num_reviews = s.find('div', attrs = {'class': 'rs rating'}).find('a').get('content')
t1 = time.time()
s.select('a[property="reviewCount"]')[0].get("content")
t2 = time.time()
s.find('a', {'property': "reviewCount"}).get('content')
et = time.time()
print et -t1, et-t2
In [ ]:
!pip install python-google-places
In [ ]:
from googleplaces import GooglePlaces, types, lang
YOUR_API_KEY = 'AIzaSyDJh9EWCA_v0_B3SvjzjUA3OSVYufPJeGE'
google_places = GooglePlaces(YOUR_API_KEY)
print name, full_address
address1 = "393 County Road 174, Grove Hill, AL, 35975, United States"
query_result = google_places.nearby_search(location = address1, keyword=name)
query_result
In [ ]:
name, full_address
In [ ]:
# s.select('div[class="detail"] > a')
In [ ]:
# <span class="collapse hidden">, <a href="/Attractions-g60713-Activities-c57-t68-San_Francisco_California.html">Nature & Wildlife Areas</a></span>
In [ ]:
# detail = {}
# addition_info = s.find('div', attrs = {'class':'details_wrapper'}).text.strip('\n').replace("\n\n","\n").split('\n')
# # if addition_info[0] == 'Description':
# # print addition_info[1]
# addition_info
# for info in addition_info:
# info_list = info.split(':')
# if info_list[0]=="Fee":
# details["Fee"] = info_list[1]
# else:
# details["length of visit"] = info_list[1]
# details
In [ ]:
# fee = s.find('div', {'class':'details_wrapper'})
# fee
# length_visit = s.find(text ="Recommended length of visit:").parent.next_sibling
# length_visit
# fee = s.find(text= "Fee:").parent.next_sibling
# fee
In [ ]:
# description = s.find('div', attrs = {'class': "listing_details"}).text.strip()
# print description
In [ ]:
len(query_result.places)
In [ ]:
## different api try
# try:
# YOUR_API_KEY = 'AIzaSyDMbpmHBLl7dTOXUOMZP7Vi3zbMJlByEKM'
# google_places = GooglePlaces(YOUR_API_KEY)
# latitude, longitude, google_result_name = find_geo_location(full_address, name)
# except:
# print "API error, try different key"
# time.sleep(20)
# try:
# YOUR_API_KEY = 'AIzaSyAwx3xg6oJ0yiPV3MIunBa1kx6N7v5Tcw8'
# google_places = GooglePlaces(YOUR_API_KEY)
# latitude, longitude, google_result_name = find_geo_location(full_address, name)
# except:
# print "both Key dont work"
# print" location not found: ", name, "address : ", full_address
# break
# if location:
# #coord_lat
# poi_detail_state_park['coord_lat'] = location.latitude
# #coord_long
# poi_detail_state_park['coord_long'] =location.longitude
# else:
# print" location not found: ", name, "address : ", full_address
In [ ]:
state_abb_error_ix = error_message_df[error_message_df['state_abb_error']==1]['index']
address_error_ix = error_message_df[error_message_df['address_error']==1]['index']
In [ ]:
# poi_detail_state_park_df.ix[state_abb_error_ix][['state_abb','state','country']]
poi_detail_state_park_df.ix[address_error_ix][['address','country']]
In [ ]:
error_message_df.columns
In [ ]:
poi_detail_state_park_df.columns
In [ ]:
# poi_detail_state_park.fee[poi_detail_state_park.fee == 'NO']
In [ ]:
poi_detail_state_park.shape
In [ ]:
err = error_message_df[error_message_df.review_error == 1].index
In [ ]:
for i, link in enumerate(poi_detail_state_park_df.ix[err][['name','url']].url):
    print i, link
In [ ]:
error_message_df.tail()
In [ ]:
poi_detail_state_park_df.drop_duplicates('coord_lat').shape
In [299]:
# htmlurl = 'https://en.wikipedia.org/wiki/List_of_areas_in_the_United_States_National_Park_System'
htmlurl= 'https://en.wikipedia.org/wiki/List_of_national_parks_of_the_United_States'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
r=requests.get(htmlurl,headers=headers)
s = BS(r.text, 'html.parser')
In [333]:
national_park_US_df = pd.DataFrame(columns=["name","state"])
name, state = None, None
table = s.find('table', {"class": "wikitable"})
for row in table.findAll("tr")[1:]:
    if row.find('th', {'scope':"row"}) != None:
        name = row.find('th', {'scope':"row"}).next_element.get('title')
    cells = row.findAll("td")
    # for each "tr", assign each "td" to a variable
    if len(cells) == 6:
        state = cells[1].find(text=True)
        national_park_US_df.loc[len(national_park_US_df)] = [name, state]
In [357]:
national_park_US_df
In [358]:
for index in national_park_US_df.index:
    keyword = national_park_US_df.name[index].replace(' ','+') + "+" + national_park_US_df.state[index].replace(' ','+')
    # keyword = national_park_US_df.name[index].replace(' ','+')
    trip_url = "https://www.tripadvisor.com/Search?q=" + keyword + "&queryParsed=true&searchSessionId"
    # headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # r=requests.get(trip_url,headers=headers)
    # test_s = BS(r.text, 'html.parser')
    print index, trip_url
In [361]:
import wikipedia
wiki = wikipedia.page('List_of_national_parks_of_the_United_States')
In [ ]:
# sample TripAdvisor search URLs, kept for reference:
# https://www.tripadvisor.com/Search?q=Acadia+National+Park&geo=28940&queryParsed=true&searchSessionId=F658A1719FACDE7E30D13912D3D1B3381492826820567ssid
# https://www.tripadvisor.com/Search?q=Pinnacles+national+park&queryParsed=true&searchSessionId
# https://www.tripadvisor.com/Search?q=Acadia+National+Park&queryParsed=true&searchSessionId
In [347]:
test =national_park_US_df.name[0].replace(" ", "+")
In [349]:
trip_url = "https://www.tripadvisor.com/Search?q=" +test+"&queryParsed=true&searchSessionId"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
r=requests.get(trip_url,headers=headers)
test_s = BS(r.text, 'html.parser')
In [418]:
# trip_url_30 = "https://www.tripadvisor.com/Search?geo=191&redirect&q=national+parks&uiOrigin=MASTHEAD&ssrc=A&returnTo=__2F__Tourism__2D__g143030__2D__Great__5F__Basin__5F__National__5F__Park__5F__Nevada__2D__Vacations__2E__html&pid=3825&startTime=1492837392267&searchSessionId=F658A1719FACDE7E30D13912D3D1B3381492834657203ssid#&ssrc=g&o=0"
# trip_url_60 = "https://www.tripadvisor.com/Search?geo=191&redirect&q=national+parks&uiOrigin=MASTHEAD&ssrc=A&returnTo=__2F__Tourism__2D__g143030__2D__Great__5F__Basin__5F__National__5F__Park__5F__Nevada__2D__Vacations__2E__html&pid=3825&startTime=1492837392267&searchSessionId=F658A1719FACDE7E30D13912D3D1B3381492834657203ssid#&ssrc=g&o=30"
# the search result pages were saved locally, so the live requests are commented out:
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# r=requests.get(trip_url_30,headers=headers)
# trip_30 = BS(r.text, 'html.parser')
# r=requests.get(trip_url_60,headers=headers)
# trip_60 = BS(r.text, 'html.parser')
import urllib
trip_30_html = urllib.urlopen("trip_30.html").read()
trip_60_html = urllib.urlopen("trip_60.html").read()
trip_30 = BS(trip_30_html, 'html.parser')
trip_60 = BS(trip_60_html, 'html.parser')
In [491]:
import re
to_do = re.compile("Things to do")
# trip_search_result = pd.DataFrame(columns=["name","url"])
for poi in trip_60.findAll('div', {"class": "title"}):
    name = poi.text
    for child in poi.next_siblings:
        if child.find(text=to_do) != None:
            url = child.find(text=to_do).parent.get('href')
            trip_search_result.loc[len(trip_search_result)] = [name, url]
# for link in trip_30.findAll(text = to_do):
#     print link.parent.get('href')
#     name = poi.text
#     url = poi.get('onclick').replace("ta.setEvtCookie('Search_Results_Page', 'POI_Name', '', 0, '", "").replace("')","")
In [671]:
# trip_search_result
In [493]:
# # for url in trip_search_result.url:
# url = trip_search_result.url[0]
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# r=requests.get(url,headers=headers)
# s = BS(r.text, 'html.parser')
In [663]:
def request_s(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    r = requests.get(url, headers=headers)
    s = BS(r.text, 'html.parser')
    return s

def thing_to_do_in_national_park(s):
    thing_to_do = pd.DataFrame(columns=["national_park_name","activate_name","url","num_reviews","score","ranking","tags"])
    national_park_name = s.find('h1', {"id": "HEADING"}).text.strip('\n').replace("Things to Do in ", "")
    print "park name: ", national_park_name
    for activate in s.findAll('div', {"class":"listing_title"}):
        activate_name = activate.text.strip()
        url = "https://www.tripadvisor.com" + activate.find('a').get("href")
        if activate.find_next('div', {"class":"rs rating"}) == None:
            score, num_reviews = 0, 0
        else:
            score = activate.find_next('div', {"class":"rs rating"}).find('span').get('alt').replace(" of 5 bubbles", "")
            num_reviews = activate.find_next('div', {"class":"rs rating"}).find('span', {'class': "more"}).text.strip().replace("reviews", "")
        ranking = activate.find_next('div', {'class':"popRanking wrap"}).text.strip().replace("#", "")[0]
        if activate.find_next('div', {'class':"tag_line"}).find('span') == None:
            tags = None
        else:
            tags = activate.find_next('div', {'class':"tag_line"}).find('span').text
        list_thing = [national_park_name, activate_name, url, num_reviews, score, ranking, tags]
        thing_to_do.loc[len(thing_to_do)] = list_thing
    return thing_to_do
In [672]:
poi_detail_national_park_df = pd.DataFrame(columns=["national_park_name","activate_name","url","num_reviews","score","ranking","tags"])
for url in trip_search_result.url:
    thing_to_do_page = request_s(url)
    result = thing_to_do_in_national_park(thing_to_do_page)
    poi_detail_national_park_df = poi_detail_national_park_df.append(result, ignore_index=True)
    time.sleep(5)
In [673]:
poi_detail_national_park_df.to_csv('poi_detail_national_park_df.csv',encoding=('utf-8'))
poi_detail_national_park_df
In [6]:
df_city_coords_path = '/Users/zoesh/Desktop/travel_with_friends/travel_with_friends/all_cities_coords.csv'
In [71]:
cities_coords = pd.read_csv(df_city_coords_path)
cities_coords = cities_coords[['city', 'state','nation','coord0','coord1']].drop_duplicates()
In [7]:
import pandas as pd
cities_coords = pd.read_csv(df_city_coords_path)
In [15]:
import psycopg2
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("drop table all_cities_coords.csv;")
conn.commit()
conn.close()
In [17]:
poi_detail_df
In [243]:
poi_detail_path = '/Users/zoesh/Desktop/travel_with_friends/travel_with_friends/new_poi_df_v2.csv'
new_poi_df = pd.read_csv(poi_detail_path, index_col=0)
In [244]:
new_poi_df[new_poi_df['name'] == 'Black Sheep Bike Rental']
In [75]:
new_poi_df[(new_poi_df.raw_visit_length.astype(str) == 'nan') & (new_poi_df.city == 'New York City')]
In [203]:
#set type!
new_poi_df['poi_type'] = new_poi_df.tag
for index in new_poi_df.index:
# if str(new_poi_df.raw_visit_length[index]) == 'nan':
try:
if 'Museums' in new_poi_df.poi_type[index]:
#hrs spent: 120min
# # print new_poi_df.poi_type[index]
new_poi_df.set_value(index, 'poi_type', 'Museum')
elif 'Theme Parks' in new_poi_df.poi_type[index]:
#hrs spent: 480min
# print new_poi_df['name'][index]
new_poi_df.set_value(index, 'poi_type', 'ThemePark')
elif 'Zoos' in new_poi_df.poi_type[index]:
# print new_poi_df['name'][index]
new_poi_df.set_value(index, 'poi_type', 'Zoo')
elif 'Parks' in new_poi_df.poi_type[index]:
if 'National Historical Park' in new_poi_df['name'][index]:
#hrs spent: 90min
new_poi_df.set_value(index, 'poi_type', 'NationalHistoricalPark')
elif 'National Aquarium' in new_poi_df['name'][index]:
#hrs spent: 180min
new_poi_df.set_value(index, 'poi_type', 'NationalAquarium')
elif 'National Memorial' in new_poi_df['name'][index]:
#hrs spent: 120min
new_poi_df.set_value(index, 'poi_type', 'NationalMemorial')
elif 'National Wildlife Refuge' in new_poi_df['name'][index]:
#hrs spent: 90min
new_poi_df.set_value(index, 'poi_type', 'NationalWildlifeRefuge')
elif 'National Monument' in new_poi_df['name'][index]:
#hrs spent: 120min
new_poi_df.set_value(index, 'poi_type', 'NationalMonument')
elif 'National Forest' in new_poi_df['name'][index]:
#hrs spent: 180min
new_poi_df.set_value(index, 'poi_type', 'NationalForest')
elif 'National Beach' in new_poi_df['name'][index]:
#hrs spent: 180min
new_poi_df.set_value(index, 'poi_type', 'NationalBeach')
elif 'State Park' in new_poi_df['name'][index]:
#hrs spent: 120 min?
new_poi_df.set_value(index, 'poi_type', 'StatePark')
else:
#regional parks and gardens: hrs spent:60min
new_poi_df.set_value(index, 'poi_type', 'Park')
elif 'State Park' in new_poi_df['name'][index]:
new_poi_df.set_value(index, 'poi_type', 'StatePark')
elif 'Nightlife' in new_poi_df.poi_type[index]:
# print new_poi_df['name'][index]
new_poi_df.set_value(index, 'poi_type', 'Nightlife')
elif 'Landmark' in new_poi_df.poi_type[index]:
#hrs spent: 30 min
new_poi_df.set_value(index, 'poi_type', 'Landmark')
elif 'Shopping' in new_poi_df.poi_type[index]:
#hrs spent: 60 min?
if 'Shopping' in new_poi_df['name'][index]:
new_poi_df.set_value(index, 'poi_type', 'ShoppingMall')
else:
#hrs spent: 30 min?
new_poi_df.set_value(index, 'poi_type', 'Shopping')
elif 'Theater' in new_poi_df.poi_type[index]:
#hrs spent: 120 min
new_poi_df.set_value(index, 'poi_type', 'Theater')
elif 'Casino' in new_poi_df.poi_type[index]:
#hrs spent: 90 min
new_poi_df.set_value(index, 'poi_type', 'Casino')
elif 'Stadium' in new_poi_df['name'][index]:
#hrs spent: 15 min
new_poi_df.set_value(index, 'poi_type', 'Stadium')
# elif 'Stadium' in new_poi_df['name'][index]:
# #hrs spent: 15 min
# new_poi_df.set_value(index, 'poi_type', 'Stadium')
elif 'Outdoor Activities' in new_poi_df.poi_type[index]:
#hrs spent: 150 min
new_poi_df.set_value(index, 'poi_type', 'OutdoorActivities')
elif 'Food' in new_poi_df.poi_type[index]:
#hrs spent: 60 min
new_poi_df.set_value(index, 'poi_type', 'Food')
elif 'Visitor Center' in new_poi_df.poi_type[index]:
#hrs spent: 15 min
new_poi_df.set_value(index, 'poi_type', 'VisitorCenter')
elif 'Spa' in new_poi_df.poi_type[index]:
#hrs spent: 120 min
new_poi_df.set_value(index, 'poi_type', 'Spa')
elif 'Games' in new_poi_df.poi_type[index]:
#hrs spent: 90 min
new_poi_df.set_value(index, 'poi_type', 'Game')
elif 'Libraries' in new_poi_df.poi_type[index]:
#hrs spent: 15 min
new_poi_df.set_value(index, 'poi_type', 'Library')
elif 'Tours' in new_poi_df.poi_type[index]:
#hrs spent: 120 min
new_poi_df.set_value(index, 'poi_type', 'Tour')
elif 'Transportation' in new_poi_df.poi_type[index]:
#hrs spent: 60 min
new_poi_df.set_value(index, 'poi_type', 'Transportation')
elif 'Show' in new_poi_df.poi_type[index]:
#hrs spent: 180 min
new_poi_df.set_value(index, 'poi_type', 'Show')
else:
#hrs spent: 15 min
new_poi_df.set_value(index, 'poi_type', 'Other')
# print new_poi_df['name'][index], new_poi_df['url'][index]
# print new_poi_df['name'][index]
# print new_poi_df.poi_type[index]
except:
#hrs spent 15min
new_poi_df.set_value(index, 'poi_type', 'Other')
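The long elif chain above is hard to keep in sync; here is a sketch of an equivalent keyword lookup for the simple tag cases (ordering matters, so it is a list of pairs; the Parks, Shopping, and Stadium branches would still need their name-based checks):
In [ ]:
# illustrative alternative to the simple branches of the elif chain: first matching keyword wins
tag_to_type = [('Museums', 'Museum'), ('Theme Parks', 'ThemePark'), ('Zoos', 'Zoo'),
               ('Nightlife', 'Nightlife'), ('Landmark', 'Landmark'), ('Theater', 'Theater'),
               ('Casino', 'Casino'), ('Outdoor Activities', 'OutdoorActivities'), ('Food', 'Food'),
               ('Visitor Center', 'VisitorCenter'), ('Spa', 'Spa'), ('Games', 'Game'),
               ('Libraries', 'Library'), ('Tours', 'Tour'), ('Transportation', 'Transportation'),
               ('Show', 'Show')]

def classify_tag(tag):
    for keyword, poi_type in tag_to_type:
        if keyword in tag:
            return poi_type
    return 'Other'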
In [332]:
new_poi_df['adjusted_visit_length'] = None
for index in new_poi_df.index:
if new_poi_df.raw_visit_length[index] == '1-2 hours':
new_poi_df.set_value(index, 'adjusted_visit_length', 120)
elif new_poi_df.raw_visit_length[index] == '<1 hour':
new_poi_df.set_value(index, 'adjusted_visit_length', 60)
elif new_poi_df.raw_visit_length[index] == '2-3 hours':
new_poi_df.set_value(index, 'adjusted_visit_length', 180)
elif new_poi_df.raw_visit_length[index] == 'More than 3 hours':
new_poi_df.set_value(index, 'adjusted_visit_length', 360)
elif str(new_poi_df.raw_visit_length[index]) == 'nan':
if 'Museum' == new_poi_df.poi_type[index]:
#hrs spent: 120min
new_poi_df.set_value(index, 'adjusted_visit_length', 120)
elif 'ThemePark' == new_poi_df.poi_type[index]:
#hrs spent: 480min
new_poi_df.set_value(index, 'adjusted_visit_length', 480)
elif 'Zoo' == new_poi_df.poi_type[index]:
# hrs spent: 120min
new_poi_df.set_value(index, 'adjusted_visit_length', 120)
elif 'NationalHistoricalPark' == new_poi_df.poi_type[index]:
#hrs spent: 90min
new_poi_df.set_value(index, 'adjusted_visit_length', 90)
elif 'NationalAquarium' == new_poi_df.poi_type[index]:
#hrs spent: 180min
new_poi_df.set_value(index, 'adjusted_visit_length', 180)
elif 'NationalMemorial' == new_poi_df.poi_type[index]:
#hrs spent: 120min
new_poi_df.set_value(index, 'adjusted_visit_length', 120)
elif 'NationalWildlifeRefuge' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 90)
elif 'NationalMonument' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 120)
elif 'NationalForest' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 180)
elif 'NationalBeach' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 180)
elif 'StatePark' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 120)
elif 'Park' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 60)
elif 'Nightlife' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 60)
elif 'Landmark' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 30)
elif 'Shopping' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 30)
elif 'ShoppingMall' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 60)
elif 'Theater' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 60)
elif 'Casino' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 90)
elif 'Stadium' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 15)
elif 'OutdoorActivities' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 150)
elif 'Food' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 60)
elif 'VisitorCenter' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 15)
elif 'Spa' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 120)
elif 'Game' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 90)
elif 'Library' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 15)
elif 'Tour' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 120)
elif 'Transportation' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 60)
elif 'Show' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 180)
elif 'Other' == new_poi_df.poi_type[index]:
new_poi_df.set_value(index, 'adjusted_visit_length', 15)
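The visit-length defaults above can also be expressed as a lookup table; a sketch with the same values as the chain (the adjusted_length helper is illustrative, not part of the pipeline):
In [ ]:
# illustrative lookup-table version of the cell above; minutes copied from the elif chain
raw_length_minutes = {'<1 hour': 60, '1-2 hours': 120, '2-3 hours': 180, 'More than 3 hours': 360}
default_minutes = {'Museum': 120, 'ThemePark': 480, 'Zoo': 120, 'NationalHistoricalPark': 90,
                   'NationalAquarium': 180, 'NationalMemorial': 120, 'NationalWildlifeRefuge': 90,
                   'NationalMonument': 120, 'NationalForest': 180, 'NationalBeach': 180,
                   'StatePark': 120, 'Park': 60, 'Nightlife': 60, 'Landmark': 30, 'Shopping': 30,
                   'ShoppingMall': 60, 'Theater': 60, 'Casino': 90, 'Stadium': 15,
                   'OutdoorActivities': 150, 'Food': 60, 'VisitorCenter': 15, 'Spa': 120,
                   'Game': 90, 'Library': 15, 'Tour': 120, 'Transportation': 60, 'Show': 180,
                   'Other': 15}

def adjusted_length(raw_visit_length, poi_type):
    if str(raw_visit_length) in raw_length_minutes:
        return raw_length_minutes[str(raw_visit_length)]
    return default_minutes.get(poi_type, 15)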
In [338]:
new_poi_df_v2 = new_poi_df[new_poi_df.review_score>0].reset_index().drop('index',axis =1)
In [357]:
new_poi_df_v2.to_csv('new_poi_df_v2.csv')
In [356]:
new_poi_df_v2 = new_poi_df_v2[new_poi_df_v2['name'] != "Cliff's Variety"].reset_index().drop('index',axis =1)
In [321]:
import psycopg2
import simplejson
import numpy as np
from distance import *
conn_str = "dbname='travel_with_friends' user='zoesh' host='localhost'"
my_key = 'AIzaSyDJh9EWCA_v0_B3SvjzjUA3OSVYufPJeGE'
def find_county(state, city):
    '''
    Only valid within the U.S.
    '''
    conn = psycopg2.connect(conn_str)
    cur = conn.cursor()
    city = city.replace('_', ' ')
    cur.execute("select county from county_table where city = '%s' and state = '%s';" %(city.title(), state.title()))
    county = cur.fetchone()
    conn.close()
    if county:
        return county[0]
    else:
        return None

county_list = []
county = None
for index in new_poi_df.index:
    try:
        if (new_poi_df.city[index] == 'New York City') and (new_poi_df.state[index] == 'New York'):
            county = 'NEW YORK'
        elif "administrative_area_level_2" in new_poi_df.geo_content[index]:
            geo_content = ast.literal_eval(new_poi_df.geo_content[index])['results']
            for info in geo_content[0]['address_components']:
                if info['types'][0] == "administrative_area_level_2":
                    county = info['short_name'].replace(' County', '').upper().encode('utf-8').strip()
        else:
            county = find_county(new_poi_df.state[index], new_poi_df.city[index])
    except:
        county = None
    county_list.append(county)
new_poi_df['county'] = county_list
In [322]:
new_poi_df.to_csv('new_poi_df_v2.csv')
In [292]:
index = 4
"administrative_area_level_2" in new_poi_df.geo_content[4]
print new_poi_df.state[index], new_poi_df.city[index]
find_county(new_poi_df.state[index], new_poi_df.city[index])
In [309]:
new_poi_df['county'].value_counts()
In [285]:
for info in geo_content[0]['address_components']:
    if info['types'][0] == "administrative_area_level_2":
        county = info['short_name'].replace(' County', '').upper()
print county
In [323]:
new_poi_df
In [329]:
new_poi_df.review_score.value_counts()
In [331]:
new_poi_df.loc[363]
In [284]:
for info in geo_content[0]['address_components']:
    if info['types'][0] == "administrative_area_level_2":
        print info['types']
In [261]:
import ast
ast.literal_eval(new_poi_df.geo_content[0])['results']
In [195]:
for index in new_poi_df.index:
    if str(new_poi_df.raw_visit_length[index]) == 'nan':
        pass
In [324]:
np.minimum?
In [325]:
test = []
test.extend([1,2,3])
In [326]:
test
In [361]:
labels = '1 2 1 1 1 0 3 1 2 1 0 0 1 2 3 1 1 1 1 2 1 0 1 2 2 1'
labels = map(int,labels.split(' '))
In [376]:
from collections import Counter
c = Counter(labels)
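Counter.most_common orders the cluster labels by frequency, which is what kmeans_leabels_day_order in the helpers below relies on for day ordering:
In [ ]:
# most frequent cluster label first
print c.most_common()                    # [(1, 14), (2, 6), (0, 4), (3, 2)]
print [k for k, v in c.most_common()]    # [1, 2, 0, 3]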
In [181]:
########helpers.py
import psycopg2
import simplejson
import numpy as np
from distance import *
from collections import Counter
# conn_str = "dbname='travel_with_friends' user='Gon' host='localhost'"
# keys rotated by hand; only the last assignment takes effect
my_key = 'AIzaSyDJh9EWCA_v0_B3SvjzjUA3OSVYufPJeGE'
my_key = "AIzaSyAA9Te-Dpi6ruT3SDpaZzVXQtlRshf_jsk"
my_key = "AIzaSyB6KDP3b55OlvCHb8U4PyVMfkLvNJEBidM"
my_key = "AIzaSyDQexgUGlMYF5pkQ3w42xQoEqsTONfMI-I"
def check_valid_state(state):
'''
Only valid within the U.S.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
state = state.replace('_',' ')
cur.execute("select distinct state from poi_detail_table_v2 where state = '%s';" %(state.title()))
c = cur.fetchone()
return bool(c)
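# NB: the '%'-formatted SQL used throughout these helpers is fragile (quoting, injection);
# a safer equivalent would use psycopg2 parameter binding, e.g. (sketch only):
# cur.execute("select distinct state from poi_detail_table_v2 where state = %s;", (state.title(),))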
def check_valid_city(city,state):
'''
Only valid within the U.S.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
state = state.replace('_',' ')
city = city.replace('_',' ')
cur.execute("select distinct city, state from poi_detail_table_v2 where city = '%s' and state = '%s';" %(city.title(), state.title()))
c = cur.fetchone()
return bool(c)
def find_county(state, city):
'''
Only valid within the U.S.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
city = city.replace('_',' ')
cur.execute("select distinct county from county_table where city = '%s' and state = '%s';" %(city.title(), state.title()))
county = cur.fetchone()
conn.close()
if county:
return county[0]
else:
return None
def db_start_location(county, state, city):
'''
Get numpy array of county related POIs.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
if county:
cur.execute("select index, coord_lat, coord_long, adjusted_visit_length, ranking, review_score, num_reviews from poi_detail_table_v2 where county = '%s' and state = '%s'; "%(county.upper(), state.title()))
else:
cur.execute("select index, coord_lat, coord_long, adjusted_visit_length, ranking, review_score, num_reviews from poi_detail_table_v2 where city = '%s' and state = '%s'; "%(city.title(), state.title()))
a = cur.fetchall()
conn.close()
return np.array(a)
def get_event_ids_list(trip_locations_id):
'''
Input: trip_locations_id
Output: event_ids, event_type = ['big', 'small', 'med', 'add',]
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select event_ids,event_type from day_trip_table where trip_locations_id = '%s' " %(trip_locations_id))
event_ids,event_type = cur.fetchone()
event_ids = ast.literal_eval(event_ids)
conn.close()
return event_ids,event_type
def db_event_cloest_distance(trip_locations_id=None,event_ids=None, event_type = 'add',new_event_id = None, city_name =None):
'''
Order the event points into a closest-distance tour.
'''
if new_event_id or not event_ids:
event_ids, event_type = get_event_ids_list(trip_locations_id)
if new_event_id:
event_ids.append(new_event_id)
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
points=[]
# points = np.zeros((len(event_ids), 3))
for i,v in enumerate(event_ids):
cur.execute("select index, coord_lat, coord_long, city , ranking from poi_detail_table_v2 where index = %i;"%(float(v)))
points.append(cur.fetchone())
conn.close()
points = check_NO_1(points, city_name)
# print 'db_distance',points
n,D = mk_matrix(points[:,1:3], geopy_dist)
if len(points) >= 3:
if event_type == 'add':
tour = nearest_neighbor(n, 0, D)
# create a greedy tour, visiting city 'i' first
z = length(tour, D)
z = localsearch(tour, z, D)
return np.array(event_ids)[tour], event_type
#need to figure out other cases
else:
tour = nearest_neighbor(n, 0, D)
# create a greedy tour, visiting city 'i' first
z = length(tour, D)
z = localsearch(tour, z, D)
return np.array(event_ids)[tour], event_type
else:
return np.array(event_ids), event_type
def check_NO_1(poi_list, city_name):
city_name = city_name.replace('_',' ')
if len(poi_list)==1:
return np.array(poi_list)
for i, poi in enumerate(poi_list):
if (poi[3] == city_name) and (poi[4]==1):
number_one =poi_list.pop(i)
return np.vstack((np.array(number_one),np.array(poi_list)))
return np.array(poi_list)
def check_full_trip_id(full_trip_id, debug):
'''
Check whether the full trip id exists.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select details from full_trip_table where full_trip_id = '%s'" %(full_trip_id))
a = cur.fetchone()
conn.close()
if bool(a):
if not debug:
return a[0]
else:
return True
else:
return False
def check_day_trip_id(day_trip_id, debug):
'''
Check whether the day trip id exists.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select details from day_trip_table where trip_locations_id = '%s'" %(day_trip_id))
a = cur.fetchone()
conn.close()
if bool(a):
if not debug:
return a[0]
else:
return True
else:
return False
def check_travel_time_id(new_id):
'''
Check whether a google driving time exists for the 2-point poi id.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select google_driving_time from google_travel_time_table where id_field = '%s'" %(new_id))
a = cur.fetchone()
conn.close()
if bool(a):
return True
else:
return False
#May need to improve by adding #reviews in this. :)
def sorted_events(info,ix):
'''
find the event_id, ranking and review_score, num_reviews columns
sorted base on ranking then review_score, num_reviews
return sorted list
'''
event_ = info[ix][:,[0,4,5,6]]
return np.array(sorted(event_, key=lambda x: (x[1], -x[3], -x[2])))
#Need to make this more efficient
def create_event_id_list(big_,medium_,small_):
# print big_,medium_,small_
event_type = ''
if big_.shape[0] >= 1:
if (medium_.shape[0] < 2) or (big_[0,1] <= medium_[0,1]):
if small_.shape[0] >= 6:
event_ids = list(np.concatenate((big_[:1,0], small_[0:6,0]),axis=0))
elif small_.shape[0]>0:
event_ids = list(np.concatenate((big_[:1,0], small_[:,0]),axis=0))
else:
event_ids = list(np.array(sorted(big_[0:,:], key=lambda x: (x[1],-x[2])))[:,0])
event_type = 'big'
else:
if small_.shape[0] >= 8:
event_ids = list(np.concatenate((medium_[0:2,0], small_[0:8,0]),axis=0))
elif small_.shape[0]>0:
event_ids = list(np.concatenate((medium_[0:2,0], small_[:,0]),axis=0))
else:
event_ids = list(np.array(sorted(medium_[0:,:], key=lambda x: (x[1],-x[2])))[:,0])
event_type = 'med'
elif medium_.shape[0] >= 2:
if small_.shape[0] >= 8:
event_ids = list(np.concatenate((medium_[0:2,0], small_[0:8,0]),axis=0))
elif small_.shape[0]>0:
event_ids = list(np.concatenate((medium_[0:2,0], small_[:,0]),axis=0))
else:
event_ids = list(np.array(sorted(medium_[0:,:], key=lambda x: (x[1],-x[2])))[:,0])
event_type = 'med'
else:
if small_.shape[0] >= 10:
if medium_.shape[0]==0:
event_ids = list(np.array(sorted(small_[0:10,:], key=lambda x: (x[1],-x[2])))[:,0])
else:
event_ids = list(np.array(sorted(np.vstack((medium_[:1,:], small_[0:10,:])), key=lambda x: (x[1],-x[2])))[:,0])
elif small_.shape[0] > 0:
if medium_.shape[0]==0:
event_ids = list(np.array(sorted(small_[0:,:], key=lambda x: (x[1],-x[2])))[:,0])
else:
event_ids = list(np.array(sorted(np.vstack((medium_, small_)), key=lambda x: (x[1],-x[2])))[:,0])
else:
event_ids = list(np.array(sorted(medium_[0:,:], key=lambda x: (x[1],-x[2])))[:,0])
event_type = 'small'
# else:
return event_ids, event_type
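# worked example (hypothetical counts): with 1 big event, 3 mediums and 7 smalls, where the big
# outranks the best medium (big_[0,1] <= medium_[0,1]), the first branch returns the big event
# plus the 6 top smalls with event_type 'big'; with no bigs and 2+ mediums, the 'med' branch
# returns the top 2 mediums plus up to 8 smalls.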
def db_google_driving_walking_time(event_ids, event_type):
'''
Get estimated travel time from google api.
Limit 1000 calls per day.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
google_ids = []
driving_time_list = []
walking_time_list = []
name_list = []
for i,v in enumerate(event_ids[:-1]):
id_ = str(v) + '0000'+str(event_ids[i+1])
result_check_travel_time_id = check_travel_time_id(id_)
if not result_check_travel_time_id:
cur.execute("select name, coord_lat, coord_long from poi_detail_table_v2 where index = %s"%(v))
orig_name, orig_coord_lat, orig_coord_long = cur.fetchone()
orig_idx = v
cur.execute("select name, coord_lat, coord_long from poi_detail_table_v2 where index = %s "%(event_ids[i+1]))
dest_name, dest_coord_lat, dest_coord_long = cur.fetchone()
dest_idx = event_ids[i+1]
orig_coords = str(orig_coord_lat)+','+str(orig_coord_long)
dest_coords = str(dest_coord_lat)+','+str(dest_coord_long)
google_driving_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=driving&language=en-EN&sensor=false&key={2}".\
format(orig_coords.replace(' ',''),dest_coords.replace(' ',''),my_key)
google_walking_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=walking&language=en-EN&sensor=false&key={2}".\
format(orig_coords.replace(' ',''),dest_coords.replace(' ',''),my_key)
driving_result= simplejson.load(urllib.urlopen(google_driving_url))
walking_result= simplejson.load(urllib.urlopen(google_walking_url))
if driving_result['rows'][0]['elements'][0]['status'] == 'ZERO_RESULTS':
google_driving_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=driving&language=en-EN&sensor=false&key={2}".\
format(orig_name.replace(' ','+').replace('-','+'),dest_name.replace(' ','+').replace('-','+'),my_key)
print 'error driving?', google_driving_url
driving_result= simplejson.load(urllib.urlopen(google_driving_url))
if walking_result['rows'][0]['elements'][0]['status'] == 'ZERO_RESULTS':
google_walking_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=walking&language=en-EN&sensor=false&key={2}".\
format(orig_name.replace(' ','+').replace('-','+'),dest_name.replace(' ','+').replace('-','+'),my_key)
print 'error walking?',google_walking_url
walking_result= simplejson.load(urllib.urlopen(google_walking_url))
if (driving_result['rows'][0]['elements'][0]['status'] == 'NOT_FOUND') and (walking_result['rows'][0]['elements'][0]['status'] == 'NOT_FOUND'):
new_event_ids = list(event_ids)
new_event_ids.pop(i+1)
new_event_ids = db_event_cloest_distance(event_ids=new_event_ids, event_type = event_type)
return db_google_driving_walking_time(new_event_ids, event_type)
try:
google_driving_time = driving_result['rows'][0]['elements'][0]['duration']['value']/60
except:
print v, id_, driving_result #need to debug for this
try:
google_walking_time = walking_result['rows'][0]['elements'][0]['duration']['value']/60
except:
google_walking_time = 9999
# print 'google_driving time: ', google_driving_time
google_driving_url = google_driving_url.replace("'s","%27")
google_walking_url = google_walking_url.replace("'s","%27")
cur.execute("select max(index) from google_travel_time_table")
index = cur.fetchone()[0]+1
driving_result = str(driving_result).replace("'",'"')
walking_result = str(walking_result).replace("'",'"')
orig_name = orig_name.replace("'","''")
dest_name = dest_name.replace("'","''")
cur.execute("INSERT INTO google_travel_time_table VALUES (%i, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s', '%s', %s, %s);"%(index, id_, orig_name, orig_idx, dest_name, dest_idx, orig_coord_lat, orig_coord_long, dest_coord_lat,\
dest_coord_long, orig_coords, dest_coords, google_driving_url, google_walking_url,\
str(driving_result), str(walking_result), google_driving_time, google_walking_time))
conn.commit()
name_list.append(orig_name+" to "+ dest_name)
google_ids.append(id_)
driving_time_list.append(google_driving_time)
walking_time_list.append(google_walking_time)
else:
cur.execute("select orig_name, dest_name, google_driving_time, google_walking_time from google_travel_time_table \
where id_field = '%s'" %(id_))
orig_name, dest_name, google_driving_time, google_walking_time = cur.fetchone()
name_list.append(orig_name+" to "+ dest_name)
google_ids.append(id_)
driving_time_list.append(google_driving_time)
walking_time_list.append(google_walking_time)
conn.close()
# return event_ids, google_ids, name_list, driving_time_list, walking_time_list
return event_ids, driving_time_list, walking_time_list
def db_remove_extra_events(event_ids, driving_time_list,walking_time_list, max_time_spent=600):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
if len(event_ids) > 1:
cur.execute("SELECT DISTINCT SUM(adjusted_visit_length) FROM poi_detail_table_v2 WHERE index IN %s;" %(tuple(event_ids),))
time_spent = cur.fetchone()[0]
conn.close()
else:
cur.execute("SELECT DISTINCT SUM(adjusted_visit_length) FROM poi_detail_table_v2 WHERE index = %s;" %(event_ids))
time_spent = cur.fetchone()[0]
conn.close()
travel_time = int(sum(np.minimum(np.array(driving_time_list),np.array(walking_time_list))))
time_spent = int(time_spent) + travel_time
if time_spent > max_time_spent:
update_event_ids = event_ids[:-1]
update_driving_time_list = driving_time_list[:-1]
update_walking_time_list = walking_time_list[:-1]
return db_remove_extra_events(update_event_ids, update_driving_time_list, update_walking_time_list)
else:
return event_ids, driving_time_list, walking_time_list, time_spent
def db_adjust_events(event_ids, driving_time_list,walking_time_list, not_visited_poi_lst, event_type, city, max_time_spent=600):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
if len(event_ids) > 1:
cur.execute("SELECT DISTINCT SUM(adjusted_visit_length) FROM poi_detail_table_v2 WHERE index IN %s;" %(tuple(event_ids),))
time_spent = cur.fetchone()[0]
conn.close()
else:
cur.execute("SELECT DISTINCT SUM(adjusted_visit_length) FROM poi_detail_table_v2 WHERE index = %s;" %(event_ids[0]))
time_spent = cur.fetchone()[0]
conn.close()
travel_time = int(sum(np.minimum(np.array(driving_time_list),np.array(walking_time_list))))
time_spent = int(time_spent) + travel_time
if time_spent > max_time_spent:
update_event_ids = event_ids[:-1]
update_driving_time_list = driving_time_list[:-1]
update_walking_time_list = walking_time_list[:-1]
not_visited_poi_lst.append(event_ids[-1])
return db_adjust_events(update_event_ids, update_driving_time_list, update_walking_time_list,not_visited_poi_lst, event_type, city)
elif (time_spent < max_time_spent - 240) and bool(not_visited_poi_lst):
event_ids = list(event_ids)
event_ids.extend(not_visited_poi_lst)
event_ids, event_type = db_event_cloest_distance(event_ids = event_ids, event_type = event_type, city_name = city)
event_ids, driving_time_list, walking_time_list = \
db_google_driving_walking_time(event_ids, event_type)
return db_adjust_events(event_ids, driving_time_list, walking_time_list, [], event_type, city)
else:
return event_ids, driving_time_list, walking_time_list, time_spent, not_visited_poi_lst
def db_day_trip_details(event_ids, i):
conn=psycopg2.connect(conn_str)
cur = conn.cursor()
details = []
#details dict includes: id, name,address, day
for event_id in event_ids:
cur.execute("select index, name, address, coord_lat, coord_long from poi_detail_table_v2 where index = %s;" %(event_id))
a = cur.fetchone()
details.append(str({'id': a[0],'name': a[1],'address': a[2], 'day': i, 'coord_lat': a[3], 'coord_long': a[4]}))
conn.close()
return details
def check_address(index):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select address from poi_detail_table_v2 where index = %s;"%(index))
a = cur.fetchone()[0]
conn.close()
if a:
return True
else:
return False
def db_address(event_ids):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
for i in event_ids[:-1]:
if not check_address(i):
cur.execute("select driving_result from google_travel_time_table where orig_idx = %s;" %(i))
a= cur.fetchone()[0]
add = ast.literal_eval(a)['origin_addresses'][0]
cur.execute("update poi_detail_table_v2 set address = '%s' where index = %s;" %(add, i))
conn.commit()
last = event_ids[-1]
if not check_address(last):
cur.execute("select driving_result from google_travel_time_table where dest_idx = %s;" %(last))
a= cur.fetchone()[0]
add = ast.literal_eval(a)['destination_addresses'][0]
cur.execute("update poi_detail_table_v2 set address = '%s' where index = %s;" %(add, last))
conn.commit()
conn.close()
def kmeans_leabels_day_order(day_labels):
return [k for k, v in Counter(day_labels).most_common()]
In [231]:
# -*- coding: utf-8 -*-
import psycopg2
import ast
import numpy as np
import simplejson
import urllib
import math
# from helpers import *
conn_str = "dbname='travel_with_friends' user='zoesh' host='localhost'"
def ajax_available_events(county, state):
county=county.upper()
state = state.title()
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select index, name from poi_detail_table_v2 where county='%s' and state='%s'" %(county,state))
poi_lst = [item for item in cur.fetchall()]
conn.close()
return poi_lst
def add_event(trip_locations_id, event_day, new_event_id=None, event_name=None, full_day = True, unseen_event = False):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select * from day_trip_table where trip_locations_id='%s'" %(trip_locations_id))
(index, trip_locations_id, full_day, regular, county, state, detail, event_type, event_ids) = cur.fetchone()
if unseen_event:
index += 1
trip_locations_id = '-'.join([str(eval(i)['id']) for i in eval(detail)])+'-'+event_name.replace(' ','-')+'-'+str(event_day)
cur.execute("select details from day_trip_locations where trip_locations_id='%s'" %(trip_locations_id))
a = cur.fetchone()
if bool(a):
conn.close()
return trip_locations_id, a[0]
else:
cur.execute("select max(index) from day_trip_locations")
index = cur.fetchone()[0]+1
detail = list(eval(detail))
#need to make sure the type is correct for detail!
new_event = "{'address': 'None', 'id': 'None', 'day': %s, 'name': u'%s'}"%(event_day, event_name)
detail.append(new_event)
#get the right format of detail: change from list to string and remove brackets and convert quote type
new_detail = str(detail).replace('"','').replace('[','').replace(']','').replace("'",'"')
cur.execute("INSERT INTO day_trip_locations VALUES (%i, '%s',%s,%s,'%s','%s','%s');" %(index, trip_locations_id, full_day, False, county, state, new_detail))
conn.commit()
conn.close()
return trip_locations_id, detail
else:
event_ids, event_type = db_event_cloest_distance(trip_locations_id, new_event_id=new_event_id)
event_ids, driving_time_list, walking_time_list = db_google_driving_walking_time(event_ids, event_type='add')
trip_locations_id = '-'.join(str(event_id) for event_id in event_ids)+'-'+str(event_day)
cur.execute("select details from day_trip_locations where trip_locations_id='%s'" %(trip_locations_id))
a = cur.fetchone()
if not a:
details = []
db_address(event_ids)
for item in event_ids:
cur.execute("select index, name, address from poi_detail_table_v2 where index = '%s';" %(item))
a = cur.fetchone()
detail = {'id': a[0],'name': a[1],'address': a[2], 'day': event_day}
details.append(detail)
#need to make sure event detail can append to table!
cur.execute("insert into day_trip_table (trip_locations_id,full_day, regular, county, state, details, event_type, event_ids) VALUES ( '%s', %s, %s, '%s', '%s', '%s', '%s', '%s')" %( trip_location_id, full_day, False, county, state, details, event_type, event_ids))
conn.commit()
conn.close()
return trip_locations_id, details
else:
conn.close()
#need to make sure type is correct.
return trip_locations_id, a[0]
def remove_event(trip_locations_id, remove_event_id, remove_event_name=None, event_day=None, full_day = True):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select * from day_trip_table where trip_locations_id='%s'" %(trip_locations_id))
(index, trip_locations_id, full_day, regular, county, state, detail, event_type, event_ids) = cur.fetchone()
new_event_ids = ast.literal_eval(event_ids)
new_event_ids.remove(remove_event_id)
new_trip_locations_id = '-'.join(str(event_id) for event_id in new_event_ids)
cur.execute("select * from day_trip_table where trip_locations_id='%s'" %(new_trip_locations_id))
check_id = cur.fetchone()
if check_id:
return new_trip_locations_id, check_id[-3]
detail = ast.literal_eval(detail[1:-1])
for index, trip_detail in enumerate(detail):
if ast.literal_eval(trip_detail)['id'] == remove_event_id:
remove_index = index
break
new_detail = list(detail)
new_detail.pop(remove_index)
new_detail = str(new_detail).replace("'","''")
regular = False
cur.execute("select max(index) from day_trip_table where trip_locations_id='%s'" %(trip_locations_id))
new_index = cur.fetchone()[0]
new_index+=1
cur.execute("INSERT INTO day_trip_table VALUES (%i, '%s', %s, %s, '%s', '%s', '%s', '%s','%s');" \
%(new_index, new_trip_locations_id, full_day, regular, county, state, new_detail, event_type, new_event_ids))
conn.commit()
conn.close()
return new_trip_locations_id, new_detail
def event_type_time_spent(adjusted_normal_time_spent):
if adjusted_normal_time_spent > 180:
return 'big'
elif adjusted_normal_time_spent >= 120:
return 'med'
else:
return 'small'
def switch_event_list(full_trip_id, trip_locations_id, switch_event_id, switch_event_name=None, event_day=None, full_day = True):
# new_trip_locations_id, new_detail = remove_event(trip_locations_id, switch_event_id)
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select name, city, county, state, coord_lat, coord_long,ranking, adjusted_visit_length from poi_detail_table_v2 where index=%s" %(switch_event_id))
name, city, county, state,coord_lat, coord_long,poi_rank, adjusted_normal_time_spent = cur.fetchone()
event_type = event_type_time_spent(adjusted_normal_time_spent)
available_lst = ajax_available_events(county, state)
cur.execute("select trip_location_ids,details from full_trip_table where full_trip_id=%s" %(full_trip_id))
full_trip_detail = cur.fetchone()
full_trip_detail = ast.literal_eval(full_trip_detail)
full_trip_ids = [ast.literal_eval(item)['id'] for item in full_trip_detail]
switch_lst = []
for item in available_lst:
index = item[0]
if index not in full_trip_ids:
event_ids = [switch_event_id, index]
event_ids, driving_time_list, walking_time_list = db_google_driving_walking_time(event_ids, event_type='switch')
if min(driving_time_list[0], walking_time_list[0]) <= 60:
cur.execute("select ranking, review_score, adjusted_visit_length from poi_detail_table_v2 where index=%s" %(index))
target_poi_rank, target_rating, target_adjusted_normal_time_spent = cur.fetchone()
target_event_type = event_type_time_spent(target_adjusted_normal_time_spent)
switch_lst.append([index, target_poi_rank, target_rating, target_event_type==event_type])
#need to sort target_event_type, target_poi_rank and target_rating
return {switch_event_id: switch_lst}
def switch_event(trip_locations_id, switch_event_id, final_event_id, event_day):
new_trip_locations_id, new_detail = remove_event(trip_locations_id, switch_event_id)
new_trip_locations_id, new_detail = add_event(new_trip_locations_id, event_day, final_event_id, full_day = True, unseen_event = False)
return new_trip_locations_id, new_detail
def angle_between(p1, p2):
ang1 = np.arctan2(*p1[::-1])
ang2 = np.arctan2(*p2[::-1])
return np.rad2deg((ang1 - ang2) % (2 * np.pi))
def calculate_initial_compass_bearing(pointA, pointB):
"""
Calculates the bearing between two points.
The formulae used is the following:
theta = atan2(sin(delta(long)).cos(lat2),
cos(lat1).sin(lat2) − sin(lat1).cos(lat2).cos(delta(long)))
:Parameters:
- `pointA: The tuple representing the latitude/longitude for the
first point. Latitude and longitude must be in decimal degrees
- `pointB: The tuple representing the latitude/longitude for the
second point. Latitude and longitude must be in decimal degrees
:Returns:
The bearing in degrees
:Returns Type:
float
"""
if (type(pointA) != tuple) or (type(pointB) != tuple):
raise TypeError("Only tuples are supported as arguments")
lat1 = math.radians(pointA[0])
lat2 = math.radians(pointB[0])
diffLong = math.radians(pointB[1] - pointA[1])
x = math.sin(diffLong) * math.cos(lat2)
y = math.cos(lat1) * math.sin(lat2) - (math.sin(lat1)
* math.cos(lat2) * math.cos(diffLong))
initial_bearing = math.atan2(x, y)
# Now we have the initial bearing but math.atan2 return values
# from -180° to + 180° which is not what we want for a compass bearing
# The solution is to normalize the initial bearing as shown below
initial_bearing = math.degrees(initial_bearing)
compass_bearing = (initial_bearing + 360) % 360
return compass_bearing
def direction_from_orgin(start_coord_long, start_coord_lat, target_coord_long, target_coord_lat):
angle = calculate_initial_compass_bearing((start_coord_lat, start_coord_long), (target_coord_lat, target_coord_long))
if (angle > 45) and (angle < 135):
return 'E'
elif (angle >= 135) and (angle < 225):
return 'S'
elif (angle >= 225) and (angle < 315):
return 'W'
else:
return 'N'
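# A quick sanity check for the two helpers above (coordinates illustrative):
# on the equator, heading from (0, 0) to (0, 10) is due east, so the bearing
# is exactly 90 degrees and the quadrant mapping returns 'E'.
# calculate_initial_compass_bearing((0.0, 0.0), (0.0, 10.0)) # -> 90.0
# direction_from_orgin(0.0, 0.0, 10.0, 0.0) # -> 'E'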
def travel_outside_coords(current_city, current_state, direction=None, n_days=1):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
#coord_long, coord_lat
cur.execute("select index, coord_lat, coord_long from all_cities_coords_table where city ='%s' and state = '%s';" %(current_city, current_state))
id_, coord_lat, coord_long = cur.fetchone()
#city, coord_lat, coord_long
cur.execute("select distinct city, coord_lat, coord_long from all_cities_coords_table where city !='%s' and state = '%s';" %(current_city, current_state))
coords = cur.fetchall()
conn.close()
return id_, coords, coord_lat, coord_long
def check_outside_trip_id(outside_trip_id, debug):
'''
Check whether the outside trip id already exists.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select outside_trip_id from outside_trip_table where outside_trip_id = '%s'" %(outside_trip_id))
a = cur.fetchone()
print 'outside stuff id', a, bool(a)
conn.close()
if bool(a):
if not debug:
return a[0]
else:
return True
else:
return False
def db_outside_route_trip_details(event_ids, route_i):
conn=psycopg2.connect(conn_str)
cur = conn.cursor()
details = []
#details dict includes: id, name,address, day
for event_id in event_ids:
cur.execute("select index, name, address, coord_lat, coord_long from poi_detail_table_v2 where index = %s;" %(event_id))
a = cur.fetchone()
details.append({'id': a[0],'name': a[1],'address': a[2], 'coord_lat': a[3], 'coord_long':a[4], 'route': route_i})
conn.close()
return details
def db_outside_google_driving_walking_time(city_id, start_coord_lat, start_coord_long, event_ids, event_type, origin_city, origin_state):
'''
Get estimated travel time from google api.
Limit 1000 calls per day.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
google_ids = []
driving_time_list = []
walking_time_list = []
name_list = []
city_to_poi_id = str(city_id) + '0000'+str(event_ids[0])
if not check_city_to_poi(city_to_poi_id):
cur.execute("select name, coord_lat, coord_long from poi_detail_table_v2 where index = %s "%(event_ids[0]))
dest_name, dest_coord_lat, dest_coord_long = cur.fetchone()
orig_coords = str(start_coord_lat)+','+str(start_coord_long)
dest_coords = str(dest_coord_lat)+','+str(dest_coord_long)
google_driving_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=driving&language=en-EN&sensor=false&key={2}".\
format(orig_coords.replace(' ',''),dest_coords.replace(' ',''),my_key)
google_walking_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=walking&language=en-EN&sensor=false&key={2}".\
format(orig_coords.replace(' ',''),dest_coords.replace(' ',''),my_key)
driving_result= simplejson.load(urllib.urlopen(google_driving_url))
walking_result= simplejson.load(urllib.urlopen(google_walking_url))
orig_name = origin_city.upper().replace(' ','+').replace('-','+') + '+' + origin_state.upper().replace(' ','+').replace('-','+')
if driving_result['rows'][0]['elements'][0]['status'] == 'ZERO_RESULTS':
google_driving_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=driving&language=en-EN&sensor=false&key={2}".\
format(orig_name,dest_name.replace(' ','+').replace('-','+'),my_key)
driving_result= simplejson.load(urllib.urlopen(google_driving_url))
if walking_result['rows'][0]['elements'][0]['status'] == 'ZERO_RESULTS':
google_walking_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=walking&language=en-EN&sensor=false&key={2}".\
format(orig_name,dest_name.replace(' ','+').replace('-','+'),my_key)
walking_result= simplejson.load(urllib.urlopen(google_walking_url))
if (driving_result['rows'][0]['elements'][0]['status'] == 'NOT_FOUND') and (walking_result['rows'][0]['elements'][0]['status'] == 'NOT_FOUND'):
new_event_ids = list(event_ids)
new_event_ids.pop(0)
new_event_ids, event_type = db_outside_event_cloest_distance(start_coord_lat, start_coord_long, event_ids=new_event_ids, event_type = event_type)
return db_outside_google_driving_walking_time(city_id, start_coord_lat, start_coord_long, new_event_ids, event_type,origin_city, origin_state)
try:
city_to_poi_driving_time = driving_result['rows'][0]['elements'][0]['duration']['value']/60
except:
city_to_poi_driving_time = 9999
print origin_city, origin_state, dest_name, driving_result #need to debug for this
try:
city_to_poi_walking_time = walking_result['rows'][0]['elements'][0]['duration']['value']/60
except:
city_to_poi_walking_time = 9999
'''
Need to work on rest of it!
'''
cur.execute("select max(index) from google_city_to_poi_table")
index = cur.fetchone()[0]+1
driving_result = str(driving_result).replace("'",'"')
walking_result = str(walking_result).replace("'",'"')
orig_name = orig_name.replace("'","''")
dest_name = dest_name.replace("'","''")
cur.execute("INSERT INTO google_city_to_poi_table VALUES (%i, %s, %i, '%s','%s', '%s','%s', '%s', '%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s', '%s', %s, %s);" \
%(index, city_to_poi_id, city_id, origin_city.replace("'","''"), origin_state, orig_name, dest_name, event_ids[0], start_coord_lat, start_coord_long, dest_coord_lat,\
dest_coord_long, orig_coords, dest_coords, google_driving_url, google_walking_url,\
str(driving_result), str(walking_result), city_to_poi_driving_time,city_to_poi_walking_time))
conn.commit()
name_list.extend([orig_name+" to "+ dest_name,dest_name+" to "+ orig_name])
google_ids.extend([city_to_poi_id]*2)
driving_time_list.extend([city_to_poi_driving_time]*2)
walking_time_list.extend([city_to_poi_walking_time]*2)
else:
cur.execute("select orig_name, dest_name, city_to_poi_driving_time, city_to_poi_walking_time from google_city_to_poi_table \
where city_to_poi_id = %s " %(city_to_poi_id))
orig_name, dest_name, city_to_poi_driving_time, city_to_poi_walking_time = cur.fetchone()
name_list.extend([orig_name+" to "+ dest_name, dest_name+" to "+ orig_name])
google_ids.extend([city_to_poi_id]*2)
driving_time_list.extend([city_to_poi_driving_time]*2)
walking_time_list.extend([city_to_poi_walking_time]*2)
for i,v in enumerate(event_ids[:-1]):
id_ = str(v) + '0000'+str(event_ids[i+1])
result_check_travel_time_id = check_travel_time_id(id_)
if not result_check_travel_time_id:
cur.execute("select name, coord_lat, coord_long from poi_detail_table_v2 where index = %s"%(v))
orig_name, orig_coord_lat, orig_coord_long = cur.fetchone()
orig_idx = v
cur.execute("select name, coord_lat, coord_long from poi_detail_table_v2 where index = %s "%(event_ids[i+1]))
dest_name, dest_coord_lat, dest_coord_long = cur.fetchone()
dest_idx = event_ids[i+1]
orig_coords = str(orig_coord_lat)+','+str(orig_coord_long)
dest_coords = str(dest_coord_lat)+','+str(dest_coord_long)
google_driving_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=driving&language=en-EN&sensor=false&key={2}".\
format(orig_coords.replace(' ',''),dest_coords.replace(' ',''),my_key)
google_walking_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=walking&language=en-EN&sensor=false&key={2}".\
format(orig_coords.replace(' ',''),dest_coords.replace(' ',''),my_key)
driving_result= simplejson.load(urllib.urlopen(google_driving_url))
walking_result= simplejson.load(urllib.urlopen(google_walking_url))
if driving_result['rows'][0]['elements'][0]['status'] == 'ZERO_RESULTS':
google_driving_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=driving&language=en-EN&sensor=false&key={2}".\
format(orig_name.replace(' ','+').replace('-','+'),dest_name.replace(' ','+').replace('-','+'),my_key)
driving_result= simplejson.load(urllib.urlopen(google_driving_url))
if walking_result['rows'][0]['elements'][0]['status'] == 'ZERO_RESULTS':
google_walking_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=walking&language=en-EN&sensor=false&key={2}".\
format(orig_name.replace(' ','+').replace('-','+'),dest_name.replace(' ','+').replace('-','+'),my_key)
walking_result= simplejson.load(urllib.urlopen(google_walking_url))
if (driving_result['rows'][0]['elements'][0]['status'] == 'NOT_FOUND') and (walking_result['rows'][0]['elements'][0]['status'] == 'NOT_FOUND'):
new_event_ids = list(event_ids)
new_event_ids.pop(i+1)
new_event_ids, event_type = db_event_cloest_distance(event_ids=new_event_ids, event_type = event_type)
return db_outside_google_driving_walking_time(city_id, start_coord_lat, start_coord_long, new_event_ids, event_type, origin_city, origin_state)
try:
google_driving_time = driving_result['rows'][0]['elements'][0]['duration']['value']/60
except:
google_driving_time = 9999
print v, id_, driving_result #need to debug for this
try:
google_walking_time = walking_result['rows'][0]['elements'][0]['duration']['value']/60
except:
google_walking_time = 9999
cur.execute("select max(index) from google_travel_time_table")
index = cur.fetchone()[0]+1
driving_result = str(driving_result).replace("'",'"')
walking_result = str(walking_result).replace("'",'"')
orig_name = orig_name.replace("'","''")
dest_name = dest_name.replace("'","''")
cur.execute("INSERT INTO google_travel_time_table VALUES (%i, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s', '%s', %s, %s);"%(index, id_, orig_name, orig_idx, dest_name, dest_idx, orig_coord_lat, orig_coord_long, dest_coord_long,\
dest_coord_long, orig_coords, dest_coords, google_driving_url, google_walking_url,\
str(driving_result), str(walking_result), google_driving_time, google_walking_time))
conn.commit()
name_list.append(orig_name+" to "+ dest_name)
google_ids.append(id_)
driving_time_list.append(google_driving_time)
walking_time_list.append(google_walking_time)
else:
cur.execute("select orig_name, dest_name, google_driving_time, google_walking_time from google_travel_time_table \
where id_field = '%s'" %(id_))
orig_name, dest_name, google_driving_time, google_walking_time = cur.fetchone()
name_list.append(orig_name+" to "+ dest_name)
google_ids.append(id_)
driving_time_list.append(google_driving_time)
walking_time_list.append(google_walking_time)
conn.close()
return event_ids, google_ids, name_list, driving_time_list, walking_time_list
def db_outside_event_cloest_distance(coord_lat, coord_long, trip_locations_id=None,event_ids=None, event_type = 'add',new_event_id = None):
'''
Order events into a short tour (nearest neighbor + local search),
anchored at the given city coordinates.
'''
if new_event_id or not event_ids:
event_ids, event_type = get_event_ids_list(trip_locations_id)
if new_event_id:
event_ids.append(new_event_id)
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
points = np.zeros((len(event_ids), 3))
for i,v in enumerate(event_ids):
cur.execute("select index, coord_lat, coord_long from poi_detail_table_v2 where index = %i;"%(float(v)))
points[i] = cur.fetchone()
conn.close()
points = np.vstack((np.array([0, coord_lat, coord_long]),points))
n,D = mk_matrix(points[:,1:], geopy_dist)
if len(points) >= 3:
if event_type == 'add':
tour = nearest_neighbor(n, 0, D)
# create a greedy tour, visiting city 'i' first
z = length(tour, D)
z = localsearch(tour, z, D)
tour = np.array(tour[1:])-1
event_ids = np.array(event_ids)
return np.array(event_ids)[tour[1:]], event_type
#need to figure out other cases
else:
tour = nearest_neighbor(n, 0, D)
# create a greedy tour, visiting city 'i' first
z = length(tour, D)
z = localsearch(tour, z, D)
tour = np.array(tour[1:])-1
event_ids = np.array(event_ids)
return event_ids[tour], event_type
else:
return np.array(event_ids), event_type
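# The ordering above relies on mk_matrix/nearest_neighbor/localsearch from the
# local `distance` module. As a rough stand-in for the greedy step, assuming
# plain (lat, long) tuples and any dist(a, b) callable, the idea is:
def nearest_neighbor_sketch(points, dist, start=0):
    # greedy tour: repeatedly hop to the closest unvisited point
    unvisited = set(range(len(points))) - set([start])
    tour = [start]
    while unvisited:
        nearest = min(unvisited, key=lambda j: dist(points[tour[-1]], points[j]))
        tour.append(nearest)
        unvisited.remove(nearest)
    return tour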
def check_city_to_poi(city_to_poi_id):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select index from google_city_to_poi_table \
where city_to_poi_id = %s " %(city_to_poi_id))
a = cur.fetchone()
conn.close()
if bool(a):
return True
else:
return False
def db_remove_outside_extra_events(event_ids, driving_time_list,walking_time_list, max_time_spent=600):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
if len(event_ids) == 1:
cur.execute("SELECT DISTINCT SUM(adjusted_visit_length) FROM poi_detail_table_v2 WHERE index = %s;" %(event_ids[0]))
else:
cur.execute("SELECT DISTINCT SUM(adjusted_visit_length) FROM poi_detail_table_v2 WHERE index IN %s;" %(tuple(event_ids),))
total_travel_time = sum(np.minimum(np.array(driving_time_list),np.array(walking_time_list)))
time_spent = float(cur.fetchone()[0]) + float(total_travel_time)
conn.close()
if len(event_ids) == 1:
return event_ids, driving_time_list, walking_time_list, time_spent
if time_spent > max_time_spent:
update_event_ids = event_ids[:-1]
update_driving_time_list = driving_time_list[:-1]
update_walking_time_list = walking_time_list[:-1]
return db_remove_outside_extra_events(update_event_ids, update_driving_time_list, update_walking_time_list)
else:
return event_ids, driving_time_list, walking_time_list, time_spent
In [232]:
#Get events outside the city!!!
import numpy as np
import psycopg2
import ast
from sklearn.cluster import KMeans
from distance import *
'''
Outside trip table: user_id, outside_trip_id, route_ids, origin_city, state, direction, n_days, default, full_day, details
outside route table: route_id, event_id_lst, event_type, origin_city, state, direction, details, default,
'''
# target_direction = 'N'
# origin_city = 'San Francisco'
# origin_state = 'California'
# conn_str = "dbname='travel_with_friends' user='zoesh' host='localhost'"
def outside_trip_poi(origin_city, origin_state, target_direction = 'N', n_days = 1, \
full_day = True, regular = True, debug = True, user_id = 'admin'):
outside_trip_id = '-'.join([str(origin_state.upper().replace(' ','-')), str(origin_city.upper().replace(' ','-')), \
target_direction,str(int(regular)), str(n_days)])
if not check_outside_trip_id(outside_trip_id, debug):
furthest_len = 140
if n_days == 1:
furthest_len = 140
#possible city coords, target city coord_lat, target city coord_long
city_id, coords, coord_lat, coord_long = travel_outside_coords(origin_city, origin_state)
#coords: city, lat, long
check_cities_info = []
for item in coords:
direction = direction_from_orgin(coord_long, coord_lat, item[2], item[1])
if (target_direction == direction) and (geopy_dist((item[1], item[2]), (coord_lat, coord_long)) < furthest_len):
check_cities_info.append(item)
city_infos = []
for city, _, _ in check_cities_info:
county = None
#index, coord_lat, coord_long, adjusted_visit_length, ranking, review_score, num_reviews
city_info = db_start_location(county, origin_state, city)
city_infos.extend(city_info)
city_infos = np.array(city_infos)
poi_coords = city_infos[:,1:3]
n_routes = sum(1 for t in np.array(city_infos)[:,3] if t >= 120)/10
if (n_routes>1) and (city_infos.shape[0]>=10):
kmeans = KMeans(n_clusters=n_routes).fit(poi_coords)
elif (city_infos.shape[0]> 20) or (n_routes>1):
kmeans = KMeans(n_clusters=2).fit(poi_coords)
else:
kmeans = KMeans(n_clusters=1).fit(poi_coords)
route_labels = kmeans.labels_
n_routes = kmeans.n_clusters #keep route count in sync with the clusters actually fit
# print n_routes, len(route_labels), city_infos.shape
# print route_labels
outside_route_ids, outside_trip_details,event_id_lst =[],[],[]
for i in range(n_routes):
current_events, big_ix, med_ix, small_ix = [], [],[], []
for ix, label in enumerate(route_labels):
if label == i:
time = city_infos[ix,3]
event_ix = city_infos[ix,0]
current_events.append(event_ix)
if time > 180 :
big_ix.append(ix)
elif time >= 120 :
med_ix.append(ix)
else:
small_ix.append(ix)
big_ = sorted_events(city_infos, big_ix)
med_ = sorted_events(city_infos, med_ix)
small_ = sorted_events(city_infos, small_ix)
# need to update!!!!!!!!
event_ids, event_type = create_event_id_list(big_, med_, small_)
event_ids, event_type = db_outside_event_cloest_distance(coord_lat, coord_long, event_ids = event_ids, event_type = event_type)
event_ids, google_ids, name_list, driving_time_list, walking_time_list =db_outside_google_driving_walking_time(city_id, coord_lat,coord_long, event_ids, event_type, origin_city = origin_city, origin_state = origin_state)
#trim events that exceed the daily time budget
event_ids, driving_time_list, walking_time_list, total_time_spent = db_remove_outside_extra_events(event_ids, driving_time_list, walking_time_list)
outside_route_id = outside_trip_id + '-'+str(i)
# values = db_outside_route_trip_details(outside_route_id, event_ids, origin_city, origin_state, regular, full_day,n_days,i)
details = db_outside_route_trip_details(event_ids,i)
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute('select max(index) from outside_route_table;')
new_index = cur.fetchone()[0] + 1
cur.execute("insert into outside_route_table (index, outside_route_id, full_day, regular, origin_city, origin_state, target_direction, details, event_type, event_ids, route_num) \
VALUES (%s, '%s', %s, %s, '%s', '%s', '%s', '%s', '%s', '%s', %s);" \
%(new_index, outside_route_id, full_day, regular, origin_city, origin_state, target_direction, str(details).replace("'","''"), event_type, str(event_ids) , i))
conn.commit()
conn.close()
outside_route_ids.append(outside_route_id)
outside_trip_details.extend(details)
event_id_lst.extend(event_ids)
username = "zoesh"
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute('SELECT MAX(index) from outside_trip_table;')
new_index = cur.fetchone()[0] +1
cur.execute("INSERT into outside_trip_table(index, username, outside_trip_id, outside_route_ids, event_id_lst, origin_city, origin_state, target_direction, n_routes, regular, full_day, details) \
VALUES (%s,'%s', '%s', '%s','%s', '%s', '%s', '%s', %s,%s,%s,'%s');" \
%(new_index, username, outside_trip_id, str(outside_route_ids).replace("'","''"), str(event_id_lst), origin_city, origin_state, target_direction, n_routes, regular, full_day, str(outside_trip_details).replace("'","''")))
conn.commit()
conn.close()
print "finish update %s, %s, direction %s into database" %(origin_state, origin_city, target_direction)
return outside_trip_id, outside_trip_details
else:
print "ALERT: %s, %s, direction %s already in database" %(origin_state, origin_city, target_direction)
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("SELECT DISTINCT outside_trip_id, details FROM outside_trip_table WHERE outside_trip_id = '%s';" %(outside_trip_id))
outside_trip_id, details = cur.fetchone()
details = ast.literal_eval(details)
conn.close()
return outside_trip_id, details
In [234]:
# origin city/state as sketched in the commented defaults above
origin_city = 'San Francisco'
origin_state = 'California'
outside_trip_poi(origin_city, origin_state, target_direction = 'E', n_days = 1, \
full_day = True, regular = True, debug = True, user_id = 'zoesh')
Out[234]:
In [230]:
import psycopg2
import simplejson
import urllib
import ast
import numpy as np
from distance import *
from collections import Counter
# conn_str = "dbname='travel_with_friends' user='Gon' host='localhost'"
# my_key = 'AIzaSyDJh9EWCA_v0_B3SvjzjUA3OSVYufPJeGE'
# my_key = "AIzaSyCwIBKkWAkAcPjpO840fGPc1vnmK7h2UnY"
# my_key1 = "AIzaSyBrYcGsb8kIlIfa011bSbVF8X4NueqzZBo"
def check_valid_state(state):
'''
Only valid within the U.S.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
state = state.replace('_',' ')
cur.execute("select distinct state from poi_detail_table_v2 where state = '%s';" %(state.title()))
c = cur.fetchone()
conn.close()
return bool(c)
def check_valid_city(city,state):
'''
Only valid within the U.S.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
state = state.replace('_',' ')
city = city.replace('_',' ')
cur.execute("select distinct city, state from poi_detail_table_v2 where city = '%s' and state = '%s';" %(city.title(), state.title()))
c = cur.fetchone()
conn.close()
return bool(c)
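# The queries in this notebook splice values in with % string formatting.
# psycopg2 can bind parameters instead, which also handles quoting (e.g. city
# names containing apostrophes). A sketch of check_valid_city with bound
# parameters, same table and semantics:
def check_valid_city_bound(city, state):
    conn = psycopg2.connect(conn_str)
    cur = conn.cursor()
    # the %s placeholders here are psycopg2 bind parameters, not Python formatting
    cur.execute("select 1 from poi_detail_table_v2 where city = %s and state = %s limit 1;",
                (city.replace('_', ' ').title(), state.replace('_', ' ').title()))
    c = cur.fetchone()
    conn.close()
    return bool(c)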
def find_county(state, city):
'''
Only valid within the U.S.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
city = city.replace('_',' ')
cur.execute("select distinct county from county_table where city = '%s' and state = '%s';" %(city.title(), state.title()))
county = cur.fetchone()
conn.close()
if county:
return county[0]
else:
return None
def db_start_location(county, state, city):
'''
Get numpy array of county related POIs.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
if county:
cur.execute("select index, coord_lat, coord_long, adjusted_visit_length, ranking, review_score, num_reviews from poi_detail_table_v2 where county = '%s' and state = '%s'; "%(county.upper(), state.title()))
else:
cur.execute("select index, coord_lat, coord_long, adjusted_visit_length, ranking, review_score, num_reviews from poi_detail_table_v2 where city = '%s' and state = '%s'; "%(city.title(), state.title()))
a = cur.fetchall()
conn.close()
return np.array(a)
def get_event_ids_list(trip_locations_id):
'''
Input: trip_locations_id
Output: event_ids, event_type in ('big', 'med', 'small', 'add')
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select event_ids,event_type from day_trip_table where trip_locations_id = '%s' " %(trip_locations_id))
event_ids,event_type = cur.fetchone()
event_ids = ast.literal_eval(event_ids)
conn.close()
return event_ids,event_type
def db_event_cloest_distance(trip_locations_id=None,event_ids=None, event_type = 'add',new_event_id = None, city_name =None):
'''
Order events into a short tour (nearest neighbor + local search).
'''
if new_event_id or not event_ids:
event_ids, event_type = get_event_ids_list(trip_locations_id)
if new_event_id:
event_ids.append(new_event_id)
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
points=[]
# points = np.zeros((len(event_ids), 3))
for i,v in enumerate(event_ids):
cur.execute("select index, coord_lat, coord_long, city , ranking from poi_detail_table_v2 where index = %i;"%(float(v)))
points.append(cur.fetchone())
conn.close()
points = check_NO_1(points, city_name)
# print 'db_distance',points
n,D = mk_matrix(points[:,1:3], geopy_dist)
if len(points) >= 3:
if event_type == 'add':
tour = nearest_neighbor(n, 0, D)
# create a greedy tour, visiting city 'i' first
z = length(tour, D)
z = localsearch(tour, z, D)
return np.array(event_ids)[tour], event_type
#need to figure out other cases
else:
tour = nearest_neighbor(n, 0, D)
# create a greedy tour, visiting city 'i' first
z = length(tour, D)
z = localsearch(tour, z, D)
return np.array(event_ids)[tour], event_type
else:
return np.array(event_ids), event_type
def check_NO_1(poi_list, city_name):
if city_name:
city_name = city_name.replace('_',' ')
if len(poi_list)==1:
return np.array(poi_list)
for i, poi in enumerate(poi_list):
if (poi[3] == city_name) and (poi[4]==1):
number_one =poi_list.pop(i)
return np.vstack((np.array(number_one),np.array(poi_list)))
return np.array(poi_list)
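# e.g. with rows shaped (index, coord_lat, coord_long, city, ranking), the
# city's #1-ranked POI is moved to the front so the tour can start there
# (coordinates illustrative):
# pois = [(7, 37.80, -122.27, 'Oakland', 3), (9, 37.81, -122.26, 'Oakland', 1)]
# check_NO_1(pois, 'Oakland') # -> array with the index-9 row first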
def check_full_trip_id(full_trip_id, debug):
'''
Check whether the full trip id already exists.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select details from full_trip_table where full_trip_id = '%s'" %(full_trip_id))
a = cur.fetchone()
conn.close()
if bool(a):
if not debug:
return a[0]
else:
return True
else:
return False
def check_day_trip_id(day_trip_id, debug):
'''
Check whether the day trip id already exists.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select details from day_trip_table where trip_locations_id = '%s'" %(day_trip_id))
a = cur.fetchone()
conn.close()
if bool(a):
if not debug:
return a[0]
else:
return True
else:
return False
def check_travel_time_id(new_id):
'''
Check whether a google travel time already exists for the two-POI pair id.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select google_driving_time from google_travel_time_table where id_field = '%s'" %(new_id))
a = cur.fetchone()
conn.close()
if bool(a):
return True
else:
return False
#May need to improve by adding #reviews in this. :)
def sorted_events(info,ix):
'''
Take the event_id, ranking, review_score, num_reviews columns and sort
by ranking, then num_reviews (desc), then review_score (desc).
Return the sorted array.
'''
event_ = info[ix][:,[0,4,5,6]]
return np.array(sorted(event_, key=lambda x: (x[1], -x[3], -x[2])))
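# e.g. two POIs tied on ranking fall back to review count, descending
# (demo rows, columns as in the poi_detail_table_v2 selects above):
# info = np.array([[11, 0, 0, 60, 1, 4.0, 50],
#                  [12, 0, 0, 60, 1, 4.5, 200]])
# sorted_events(info, [0, 1])[:,0] # -> array([ 12., 11.])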
#Need to make this more efficient
def create_event_id_list(big_,medium_,small_):
# print big_,medium_,small_
event_type = ''
if big_.shape[0] >= 1:
if (medium_.shape[0] < 2) or (big_[0,1] <= medium_[0,1]):
if small_.shape[0] >= 6:
event_ids = list(np.concatenate((big_[:1,0], small_[0:6,0]),axis=0))
elif small_.shape[0]>0:
event_ids = list(np.concatenate((big_[:1,0], small_[:,0]),axis=0))
else:
event_ids = list(np.array(sorted(big_[0:,:], key=lambda x: (x[1],-x[2])))[:,0])
event_type = 'big'
else:
if small_.shape[0] >= 8:
event_ids = list(np.concatenate((medium_[0:2,0], small_[0:8,0]),axis=0))
elif small_.shape[0]>0:
event_ids = list(np.concatenate((medium_[0:2,0], small_[:,0]),axis=0))
else:
event_ids = list(np.array(sorted(medium_[0:,:], key=lambda x: (x[1],-x[2])))[:,0])
event_type = 'med'
elif medium_.shape[0] >= 2:
if small_.shape[0] >= 8:
event_ids = list(np.concatenate((medium_[0:2,0], small_[0:8,0]),axis=0))
elif small_.shape[0]>0:
event_ids = list(np.concatenate((medium_[0:2,0], small_[:,0]),axis=0))
else:
event_ids = list(np.array(sorted(medium_[0:,:], key=lambda x: (x[1],-x[2])))[:,0])
event_type = 'med'
else:
if small_.shape[0] >= 10:
if medium_.shape[0]==0:
event_ids = list(np.array(sorted(small_[0:10,:], key=lambda x: (x[1],-x[2])))[:,0])
else:
event_ids = list(np.array(sorted(np.vstack((medium_[:1,:], small_[0:10,:])), key=lambda x: (x[1],-x[2])))[:,0])
elif small_.shape[0] > 0:
if medium_.shape[0]==0:
event_ids = list(np.array(sorted(small_[0:,:], key=lambda x: (x[1],-x[2])))[:,0])
else:
event_ids = list(np.array(sorted(np.vstack((medium_, small_)), key=lambda x: (x[1],-x[2])))[:,0])
else:
event_ids = list(np.array(sorted(medium_[0:,:], key=lambda x: (x[1],-x[2])))[:,0])
event_type = 'small'
# else:
return event_ids, event_type
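# e.g. one long visit plus a couple of short ones yields a 'big' day; columns
# are (event_id, ranking, review_score, num_reviews) as built by sorted_events:
# big_ = np.array([[1, 1, 4.5, 100]]); med_ = np.empty((0, 4))
# small_ = np.array([[2, 2, 4.0, 10], [3, 3, 4.0, 10]])
# create_event_id_list(big_, med_, small_) # -> ([1.0, 2.0, 3.0], 'big')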
def db_google_driving_walking_time(event_ids, event_type):
'''
Get estimated travel time from google api.
Limit 1000 calls per day.
'''
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
google_ids = []
driving_time_list = []
walking_time_list = []
name_list = []
for i,v in enumerate(event_ids[:-1]):
id_ = str(v) + '0000'+str(event_ids[i+1])
result_check_travel_time_id = check_travel_time_id(id_)
if not result_check_travel_time_id:
cur.execute("select name, coord_lat, coord_long from poi_detail_table_v2 where index = %s"%(v))
orig_name, orig_coord_lat, orig_coord_long = cur.fetchone()
orig_idx = v
cur.execute("select name, coord_lat, coord_long from poi_detail_table_v2 where index = %s "%(event_ids[i+1]))
dest_name, dest_coord_lat, dest_coord_long = cur.fetchone()
dest_idx = event_ids[i+1]
orig_coords = str(orig_coord_lat)+','+str(orig_coord_long)
dest_coords = str(dest_coord_lat)+','+str(dest_coord_long)
google_driving_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=driving&language=en-EN&sensor=false&key={2}".\
format(orig_coords.replace(' ',''),dest_coords.replace(' ',''),my_key)
google_walking_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=walking&language=en-EN&sensor=false&key={2}".\
format(orig_coords.replace(' ',''),dest_coords.replace(' ',''),my_key)
driving_result= simplejson.load(urllib.urlopen(google_driving_url))
walking_result= simplejson.load(urllib.urlopen(google_walking_url))
if driving_result['rows'][0]['elements'][0]['status'] == 'ZERO_RESULTS':
google_driving_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=driving&language=en-EN&sensor=false&key={2}".\
format(orig_name.replace(' ','+').replace('-','+'),dest_name.replace(' ','+').replace('-','+'),my_key)
driving_result= simplejson.load(urllib.urlopen(google_driving_url))
if walking_result['rows'][0]['elements'][0]['status'] == 'ZERO_RESULTS':
google_walking_url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&mode=walking&language=en-EN&sensor=false&key={2}".\
format(orig_name.replace(' ','+').replace('-','+'),dest_name.replace(' ','+').replace('-','+'),my_key)
walking_result= simplejson.load(urllib.urlopen(google_walking_url))
if (driving_result['rows'][0]['elements'][0]['status'] == 'NOT_FOUND') and (walking_result['rows'][0]['elements'][0]['status'] == 'NOT_FOUND'):
new_event_ids = list(event_ids)
new_event_ids.pop(i+1)
new_event_ids, event_type = db_event_cloest_distance(event_ids=new_event_ids, event_type = event_type)
return db_google_driving_walking_time(new_event_ids, event_type)
try:
google_driving_time = driving_result['rows'][0]['elements'][0]['duration']['value']/60
except:
google_driving_time = 9999
print v, id_, driving_result #need to debug for this
try:
google_walking_time = walking_result['rows'][0]['elements'][0]['duration']['value']/60
except:
google_walking_time = 9999
# print 'google_driving time: ', google_driving_time
google_driving_url = google_driving_url.replace("'","%27")
google_walking_url = google_walking_url.replace("'","%27")
cur.execute("select max(index) from google_travel_time_table")
index = cur.fetchone()[0]+1
driving_result = str(driving_result).replace("'",'"')
walking_result = str(walking_result).replace("'",'"')
orig_name = orig_name.replace("'","''")
dest_name = dest_name.replace("'","''")
cur.execute("INSERT INTO google_travel_time_table VALUES (%i, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s', '%s', %s, %s);"%(index, id_, orig_name, orig_idx, dest_name, dest_idx, orig_coord_lat, orig_coord_long, dest_coord_lat,\
dest_coord_long, orig_coords, dest_coords, google_driving_url, google_walking_url,\
str(driving_result), str(walking_result), google_driving_time, google_walking_time))
conn.commit()
name_list.append(orig_name+" to "+ dest_name)
google_ids.append(id_)
driving_time_list.append(google_driving_time)
walking_time_list.append(google_walking_time)
else:
cur.execute("select orig_name, dest_name, google_driving_time, google_walking_time from google_travel_time_table \
where id_field = '%s'" %(id_))
orig_name, dest_name, google_driving_time, google_walking_time = cur.fetchone()
name_list.append(orig_name+" to "+ dest_name)
google_ids.append(id_)
driving_time_list.append(google_driving_time)
walking_time_list.append(google_walking_time)
conn.close()
# return event_ids, google_ids, name_list, driving_time_list, walking_time_list
return event_ids, driving_time_list, walking_time_list
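# Shape of the Distance Matrix response parsed above (abridged); durations
# come back in seconds, hence the /60 to get minutes:
# resp = {'rows': [{'elements': [{'status': 'OK',
#                                 'duration': {'value': 1800, 'text': '30 mins'}}]}]}
# resp['rows'][0]['elements'][0]['duration']['value']/60 # -> 30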
def db_remove_extra_events(event_ids, driving_time_list,walking_time_list, max_time_spent=600):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
if len(event_ids) > 1:
cur.execute("SELECT DISTINCT SUM(adjusted_visit_length) FROM poi_detail_table_v2 WHERE index IN %s;" %(tuple(event_ids),))
time_spent = cur.fetchone()[0]
conn.close()
else:
cur.execute("SELECT DISTINCT SUM(adjusted_visit_length) FROM poi_detail_table_v2 WHERE index = %s;" %(event_ids))
time_spent = cur.fetchone()[0]
conn.close()
travel_time = int(sum(np.minimum(np.array(driving_time_list),np.array(walking_time_list))))
time_spent = int(time_spent) + travel_time
if time_spent > max_time_spent:
update_event_ids = event_ids[:-1]
update_driving_time_list = driving_time_list[:-1]
update_walking_time_list = walking_time_list[:-1]
return db_remove_extra_events(update_event_ids, update_driving_time_list, update_walking_time_list)
else:
return event_ids, driving_time_list, walking_time_list, time_spent
def db_adjust_events(event_ids, driving_time_list,walking_time_list, not_visited_poi_lst, event_type, city, max_time_spent=600):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
if len(event_ids) > 1:
cur.execute("SELECT DISTINCT SUM(adjusted_visit_length) FROM poi_detail_table_v2 WHERE index IN %s;" %(tuple(event_ids),))
time_spent = cur.fetchone()[0]
conn.close()
else:
cur.execute("SELECT DISTINCT SUM(adjusted_visit_length) FROM poi_detail_table_v2 WHERE index = %s;" %(event_ids[0]))
time_spent = cur.fetchone()[0]
conn.close()
travel_time = int(sum(np.minimum(np.array(driving_time_list),np.array(walking_time_list))))
time_spent = int(time_spent) + travel_time
if time_spent > max_time_spent:
update_event_ids = event_ids[:-1]
update_driving_time_list = driving_time_list[:-1]
update_walking_time_list = walking_time_list[:-1]
not_visited_poi_lst.append(event_ids[-1])
return db_adjust_events(update_event_ids, update_driving_time_list, update_walking_time_list,not_visited_poi_lst, event_type, city)
elif (time_spent < max_time_spent - 240) and (len(not_visited_poi_lst)>1):
event_ids = list(event_ids)
event_ids.extend(not_visited_poi_lst)
event_ids, event_type = db_event_cloest_distance(event_ids = event_ids, event_type = event_type, city_name = city)
event_ids, driving_time_list, walking_time_list = db_google_driving_walking_time(event_ids, event_type)
return db_adjust_events(event_ids, driving_time_list, walking_time_list, [], event_type, city)
else:
return event_ids, driving_time_list, walking_time_list, time_spent, not_visited_poi_lst
def db_day_trip_details(event_ids, i):
conn=psycopg2.connect(conn_str)
cur = conn.cursor()
details = []
#details dict includes: id, name,address, day
for event_id in event_ids:
cur.execute("select index, name, address, coord_lat, coord_long from poi_detail_table_v2 where index = %s;" %(event_id))
a = cur.fetchone()
details.append(str({'id': a[0],'name': a[1],'address': a[2], 'day': i, 'coord_lat': a[3], 'coord_long': a[4]}))
conn.close()
return details
def check_address(index):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
cur.execute("select address from poi_detail_table_v2 where index = %s;"%(index))
a = cur.fetchone()[0]
conn.close()
if a:
return True
else:
return False
def db_address(event_ids):
conn = psycopg2.connect(conn_str)
cur = conn.cursor()
for i in event_ids[:-1]:
if not check_address(i):
cur.execute("select driving_result from google_travel_time_table where orig_idx = %s;" %(i))
a= cur.fetchone()[0]
add = ast.literal_eval(a)['origin_addresses'][0]
cur.execute("update poi_detail_table_v2 set address = '%s' where index = %s;" %(add, i))
conn.commit()
last = event_ids[-1]
if not check_address(last):
cur.execute("select driving_result from google_travel_time_table where dest_idx = %s;" %(last))
a= cur.fetchone()[0]
add = ast.literal_eval(a)['destination_addresses'][0]
cur.execute("update poi_detail_table_v2 set address = '%s' where index = %s;" %(add, last))
conn.commit()
conn.close()
def kmeans_leabels_day_order(day_labels):
return [k for k, v in Counter(day_labels).most_common()]
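# e.g. kmeans_leabels_day_order([1, 0, 1, 1, 2, 0]) -> [1, 0, 2]
# (days ordered by how many POIs each kmeans cluster received)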
In [ ]: