In [14]:
%load_ext autoreload
%autoreload 2
In [15]:
import os
# import pfile.to
# import pstr.to
from bs4 import BeautifulSoup
import requests
# import pstr
import re
# import pickle
# import urllib2
# from selenium import webdriver
# from selenium.webdriver.common.keys import Keys
import urlparse
import urllib
import time  # for time.sleep between requests
from datetime import datetime
In [16]:
##########
# SETTINGS
save_folder = os.path.join(os.environ['GD_FOLDER'], 'Shared/ms_otosense')
user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.5 Safari/532.2'
################################################
# UTILS
html_re = re.compile(r'\.html$')
def url_encode(query):
    # look at urlparse for cleaner ways to do this
    return urllib.urlencode({'': query})[1:]
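# e.g. (illustrative): url_encode('"smoke alarm" flash') -> '%22smoke+alarm%22+flash'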
def hms_message(msg=''):
    t = datetime.now().time()
    return "%02d:%02d:%02d - %s" % (t.hour, t.minute, t.second, msg)
def save_text_to_file(s, filepath):
    with open(filepath, "w") as text_file:
        text_file.write(s.encode('utf-8'))
def filename_from_url(url):
    # encode '/' and ':' so the url can be used as a flat file name
    return url.replace('/', '§').replace(':', '{') + '.html'
def url_from_filename(filename):
    return html_re.sub('', filename.replace('§', '/').replace('{', ':'))
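# e.g. (illustrative): filename_from_url('http://x.com/a') -> 'http{§§x.com§a.html',
# and url_from_filename recovers the original url from that name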
# def get_dogpile_html_from_query(query):
#     dogpile_url_prefix = 'http://www.dogpile.com/search/web'
#     response = requests.get(dogpile_url_prefix, params={'q': query})
#     if response:
#         return response.text
#     else:
#         return None
# def acquire_query_result_from_dogpile(query, save_folder=os.path.join(os.environ['MS_DATA'], 'misc')):
#     html = get_dogpile_html_from_query(query)
#     if html:
#         file_name = url_encode(query) + '.html'
#         file_path = os.path.join(save_folder, file_name)
#         save_html(html, file_path)
#     else:
#         raise ValueError("There was a problem in acquiring %s" % query)
################################################
# SPECIFIC METHODS
dogpile_base_url = 'http://www.dogpile.com'
dogpile_search_url = '/search/web?'
google_base_url = 'https://www.google.com'
google_search_url = '/search?'
gshop_default_params = {
    'tbm': 'shop',  # the param that makes it search google shopping
    'tbs': 'p_ord:rv',  # result ordering; could be 'vw:g,p_ord:rv' for gridded view
    'num': '100',  # number of results per page - maximum 100
    'start': '1'  # index of the first result to return
}
def qsi_from_result_page_number(page_number):
    return page_number * 10 + 1
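# e.g. (illustrative): qsi_from_result_page_number(0) -> 1, qsi_from_result_page_number(2) -> 21
# (qsi is the 1-based index of the first result on a 10-results-per-page dogpile page)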
def get_dogpile_request_url(query, result_page_number=0):
    '''
    returns a url for a dogpile result page of the given query
    '''
    first_item_number = qsi_from_result_page_number(result_page_number)
    return urlparse.urljoin(base=dogpile_base_url,
                            url=dogpile_search_url
                                + urllib.urlencode(query={'q': query, 'qsi': first_item_number}))
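# e.g. (illustrative): get_dogpile_request_url('deaf') ->
#   'http://www.dogpile.com/search/web?q=deaf&qsi=1' (query-param order may vary)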
def get_gshop_request_url(query, result_page_number=0, number_of_results_per_page=100):
    '''
    returns a url to get a google shopping result page
    '''
    start_result_number = "%d" % (result_page_number * number_of_results_per_page + 1)
    get_params = dict(gshop_default_params,
                      **{'start': start_result_number,
                         'num': number_of_results_per_page,
                         'q': query})  # google expects the query under 'q', not 'query'
    return urlparse.urljoin(base=google_base_url,
                            url=google_search_url
                                + urllib.urlencode(query=get_params))
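# e.g. (illustrative): get_gshop_request_url('deaf alarm clock') ->
#   'https://www.google.com/search?q=deaf+alarm+clock&num=100&start=1&tbm=shop&tbs=p_ord%3Arv'
#   (query-param order may vary)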
################################################
# GENERAL METHODS
# def get_url_from_seed(seed):
#     return get_dogpile_request_url(seed)
# def get_html_from_seed(seed):
#     url = get_url_from_seed(seed)
#     return get_html_of_url(url)
def url_slurper(url):
    # headers = {'User-Agent': user_agent}
    # response = requests.get(url=url, headers=headers)
    response = requests.get(url=url, verify=False)  # verify=False skips ssl certificate checks
    if response.ok:
        return response.text
    else:
        return None
def html_is_valid(html):
    return bool(html)
def log_progress(msg):
    print hms_message(msg)
def log_error(msg):
    print hms_message('ERROR: ' + msg)
def file_path_of_slurp(slurp_spec):
    return os.path.join(save_folder, filename_from_url(slurp_spec))
def save_html_of_slurp(html, slurp_spec):
    save_text_to_file(s=html, filepath=file_path_of_slurp(slurp_spec))
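In [ ]:
# illustrative sanity check (hypothetical url): a slurp's file name should map
# back to the url it was slurped from
test_url = 'http://example.com/page'
assert url_from_filename(os.path.basename(file_path_of_slurp(test_url))) == test_url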
In [17]:
# seed specification
seed_list = [
    '"smoke alarm" flash', '"fire alarm" flash', 'doorbell flash']
n_result_pages = 2
get_url = get_gshop_request_url
get_html_of_url = url_slurper
for i, seed in enumerate(seed_list[:2]):
    for j, result_page in enumerate(range(n_result_pages)):
        # slurp
        url = get_url(seed, result_page)
        log_progress('seed %d, %d: %s (slurping %s)' % (i, j, seed, url))
        try:
            html = get_html_of_url(url)
        except Exception as e:
            log_error('seed %d, %d (%s): get_html_of_url(%s)' % (i, j, seed, url))
            continue  # go on to the next result page
        # process
        if html_is_valid(html):
            save_html_of_slurp(html, url)
        else:
            log_error('seed %d, %d (%s): html not valid: %s' % (i, j, seed, url))
        time.sleep(7)  # be polite: pause between requests
In [19]:
seed_list = [
    '"smoke alarm" flash', '"fire alarm" flash', 'doorbell flash']
In [38]:
url = get_gshop_request_url(seed_list[0])
url
Out[38]:
In [39]:
html = url_slurper(url)
In [32]:
html = get_html_of_url('http://www.google.com/search?start=1&num=100&tbm=shop&query=%22smoke+alarm%22+flash&tbs=p_ord%3Arv')
In [34]:
import pstr.to
In [40]:
pstr.to.file(html, 'test.html')
In [ ]:
html = get_html_of_url('https://www.google.com/search?start=1&num=100&tbm=shop&query=%22smoke+alarm%22+flash&tbs=p_ord%3Arv')
In [191]:
def get_link_from_results(results_soup):
    # the target url is carried in the 'ru' query parameter of dogpile's redirect link
    urlpane = results_soup.find('div', attrs={'class': 'resultDisplayUrlPane'})
    href = urlpane.find('a', attrs={'class': 'resultDisplayUrl'}).attrs['href']
    return urlparse.parse_qs(urlparse.urlparse(href).query)['ru'][0]
def get_title_text_from_results(results_soup):
    return results_soup.find('div', attrs={'class': 'resultTitlePane'}).get_text()
def get_description_text_from_results(results_soup):
    return results_soup.find('div', attrs={'class': 'resultDescription'}).get_text()
def get_web_results_dict_from_results_soup(results_soup):
    return {
        'link': get_link_from_results(results_soup),
        'title': get_title_text_from_results(results_soup),
        'description': get_description_text_from_results(results_soup)
    }
def parse_dogpile_html(html):
    b = BeautifulSoup(html)
    result_tags = ['resultsAdsTop', 'resultsMain', 'resultsAdsBottom']
    parse_dict = {k: b.find('div', attrs={'id': k}) for k in result_tags}
    parse_dict['resultsAdsTop'] = parse_dict['resultsAdsTop'].findAll('div', attrs={'class': 'searchResult adResult'})
    parse_dict['resultsMain'] = parse_dict['resultsMain'].findAll('div', attrs={'class': 'searchResult webResult'})
    parse_dict['resultsAdsBottom'] = parse_dict['resultsAdsBottom'].findAll('div', attrs={'class': 'searchResult adResult'})
    parse_dict['resultsMain'] = [get_web_results_dict_from_results_soup(r) for r in parse_dict['resultsMain']]
    return parse_dict
def diagnose_parse_dict(parse_dict):
    print "parse_dict_keys: %s" % parse_dict.keys()
    print "number of resultsMain: %d" % len(parse_dict['resultsMain'])
    print parse_dict['resultsMain'][0]
In [192]:
d = parse_dogpile_html(html)
diagnose_parse_dict(d)
In [ ]:
url = 'https://www.google.com/search?hl=en&tbm=shop&q=deaf+alarm+clock'
In [3]:
'https://www.google.com/search?hl=en&tbm=shop&q=deaf+alarm+clock&tbs=vw:g,p_ord:rv'
Out[3]:
In [ ]:
'num=100'
In [ ]:
'https://www.google.com/search?hl=en&tbm=shop&q=deaf+alarm+clock&tbs=vw:g,p_ord:rv'
In [4]:
'https://www.google.com/search?tbm=shop&q=deaf+alarm+clock&tbs=vw:g,p_ord%3Arv&start=1&num=100'
In [14]:
default_gshop_params = {
    'tbm': 'shop',  # the param that makes it search google shopping
    'tbs': 'p_ord%3Arv',  # '%3A' is a url-encoded ':' - the final gshop_default_params above uses the raw 'p_ord:rv'
    'num': '100',  # number of results per page - maximum 100
    'start': '1'  # index of the first result to return
}
default_gshop_params
Out[14]:
In [13]:
dict(default_gshop_params, **{'start':'77', 'q':'adsf+asdf'})
Out[13]:
In [ ]:
# http://www.youtube.com/results?search_query=deaf&page=5
# linkto:http://www.gallaudet.edu/clerc_center/information_and_resources/info_to_go/hearing_and_communication_technology/alerting_devices/
In [3]:
searchterms_dict = dict()
searchterms_dict['who'] = ['deaf', '"hard of hearing"', '"hearing impaired"',
                           '"hearing impairment"', '"hearing loss"']
searchterms_dict['what'] = ['clock', '"baby monitor"', '"fire alarm"', '"smoke alarm"', 'alarm']
searchterms_dict['how'] = ['flash', 'flashing', 'vibrate', 'vibrating']
searchterms_dict['where'] = ['', 'site:facebook.com', 'site:youtube.com', 'site:twitter.com']
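# 5 who x 5 what x 4 how x 4 where = 400 query combinations in the product below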
import itertools
# strip() removes the trailing space left when 'where' is the empty string
query_list = [' '.join(x).strip() for x in itertools.product(searchterms_dict['who'],
                                                             searchterms_dict['what'],
                                                             searchterms_dict['how'],
                                                             searchterms_dict['where'])]
print len(query_list)
print query_list[:4]
In [26]:
for query in query_list: print query