In [3]:
from bs4 import BeautifulSoup
import string
import requests
import pandas as pd
import re
import pickle
from fake_useragent import UserAgent
ua = UserAgent()
In [4]:
def remove_punctuation(x):
    # Strip all punctuation characters from a string by mapping each one to None
    x = str(x)
    return x.translate(str.maketrans({a: None for a in string.punctuation}))
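In [ ]:
# Quick check of remove_punctuation (illustrative cell, not part of the original run):
# punctuation is deleted outright, so 'Death & Taxes!' becomes 'Death  Taxes'.
remove_punctuation('Death & Taxes!')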
In [5]:
def get_soup(url, max_attempts=5):
    # Rotate user agents so repeated requests look less like a single bot
    headers = {'User-Agent': ua.random}
    try:
        response = requests.get(url, headers=headers)
    except requests.RequestException:
        print("FAILED " + url)
        # Return an empty soup so callers can still call find() etc. safely
        return BeautifulSoup('', 'html.parser')
    attempts = 0
    while not response.ok:
        print(url + ' failed with code: ' + str(response.status_code))
        if attempts > max_attempts:
            return BeautifulSoup('', 'html.parser')
        response = requests.get(url, headers=headers)
        attempts += 1
    return BeautifulSoup(response.text, 'html.parser')
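In [ ]:
# Smoke test for get_soup (illustrative, not part of the original run): a network
# error or repeated bad status now yields an empty soup instead of crashing later.
test_soup = get_soup('http://www.beeradvocate.com/beer/style/')
test_soup.title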
In [ ]:
# Get all urls for each beer style
url = 'http://www.beeradvocate.com/beer/style/'
soup = get_soup(url)
beer_styles = {}
for style in soup.find('table').find_all('a'):
    beer_styles[style.get_text()] = style['href']
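In [ ]:
# Sanity check (illustrative, not part of the original run): confirm the style
# table was actually parsed into name -> href pairs.
len(beer_styles)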
In [ ]:
# Inspect the most-reviewed page of a single style
ba_url = 'http://www.beeradvocate.com'
style_url = list(beer_styles.values())[0]  # pick one style href to test with
style_suffix = '?sort=revsD&start=0'
soup = get_soup(ba_url+style_url+style_suffix)
In [ ]:
# Total number of beers in this style, parsed from the '(out of N)' header row
int(re.findall(r'(?<=\(out of )\d*',soup.find('tr').get_text())[0])
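In [ ]:
# Illustration of the lookbehind above on a made-up header string (the exact page
# text is an assumption; only the '(out of N)' fragment matters to the regex).
re.findall(r'(?<=\(out of )\d*', 'Beers: 1 to 50 (out of 1234) for this style')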
In [ ]:
# dict_items isn't subscriptable in Python 3, so wrap in list() to peek at one entry
list(beer_styles.items())[0]
In [ ]:
# Pulls the name and url of every beer with at least 25 'hads'
ba_url = 'http://www.beeradvocate.com'
style_suffix = '?sort=revsD&start='
columns = ['name','url']
temp = {}
for style in beer_styles.items():
    url = ba_url + style[1] + style_suffix + '0'
    print(url)
    soup = get_soup(url)
    num_beers = int(re.findall(r'(?<=\(out of )\d*',soup.find('tr').get_text())[0])
    print(num_beers)
    min_beer = False
    # Results are sorted by review count descending, 50 per page, so walk the
    # pages until a beer drops below the 25-review threshold
    for i in range(num_beers//50):
        if min_beer:
            break
        url = ba_url + style[1] + style_suffix + str(i*50)
        soup = get_soup(url)
        for row in soup.find_all('tr')[3:-1]:
            cells = row.find_all('td')
            if int(cells[4].get_text().replace(',','')) < 25:
                min_beer = True
                break
            temp[cells[0].find('a').get_text()] = cells[0].find('a')['href']
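In [ ]:
# Sketch only, not part of the original run: the loop above fires requests
# back-to-back, so a small fixed delay between pages is an easy way to stay
# polite with the server. get_soup_politely is a hypothetical wrapper.
import time
def get_soup_politely(url, delay=1.0):
    time.sleep(delay)
    return get_soup(url)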
In [7]:
# Same pull for a single style (id 149), used to test the loop
ba_url = 'http://www.beeradvocate.com'
style_suffix = '?sort=revsD&start='
columns = ['name','url']
temp = {}
url = ba_url + '/beer/style/149/' + style_suffix + '0'
soup = get_soup(url)
num_beers = int(re.findall(r'(?<=\(out of )\d*',soup.find('tr').get_text())[0])
print(num_beers)
min_beer = False
for i in range(num_beers//50):
    if min_beer:
        break
    url = ba_url + '/beer/style/149/' + style_suffix + str(i*50)
    soup = get_soup(url)
    for row in soup.find_all('tr')[3:-1]:
        cells = row.find_all('td')
        if int(cells[4].get_text().replace(',','')) < 25:
            min_beer = True
            break
        temp[cells[0].find('a').get_text()] = cells[0].find('a')['href']
temp
Out[7]:
In [ ]:
with open('beer_list.pkl','wb') as f:
    pickle.dump(temp, f)
In [ ]:
with open('beer_list.pkl','rb') as f:
    beer_urls = pickle.load(f)
In [ ]:
# Build a two-column frame from the scraped name -> url mapping; constructing
# from items() keeps each name paired with its own url
columns = ['name','url']
beers = pd.DataFrame(list(beer_urls.items()), columns=columns)
pd.to_pickle(beers,'beers.pkl')
In [8]:
beers = pd.read_pickle('beers.pkl')
In [9]:
# Spot-check a single beer; bracket indexing avoids clashes with DataFrame attributes
beers[beers['name'] == 'Death & Taxes Black Beer']
Out[9]:
In [18]:
def get_beer_soup(url):
    # The scraped beer urls are relative, so prepend the site root
    ba_url = 'http://www.beeradvocate.com'
    return get_soup(ba_url + url)
In [20]:
import sys
# BeautifulSoup trees are deeply nested, and pickling them below recurses through
# every node, so the default recursion limit must be raised substantially
sys.setrecursionlimit(100000000)
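In [ ]:
# Alternative sketch (not part of the original run): pickling the raw HTML string
# avoids the recursion-limit workaround entirely; re-parse with BeautifulSoup on
# load. get_beer_html is a hypothetical helper.
def get_beer_html(url):
    return str(get_beer_soup(url))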
In [31]:
from tqdm import tqdm  # needed here; the import in cell In [14] comes later
# Scrape in chunks of 230 beers and checkpoint each chunk to its own pickle.
# Note the iloc stop of i+230-1 keeps only 229 rows, so every 230th beer is
# skipped here; the cell below backfills those rows.
for i in tqdm(range(0, beers.shape[0], 230)):
    temp = beers.iloc[i:i+230-1, :].copy()  # .copy() avoids SettingWithCopyWarning
    temp['soup'] = temp.url.map(get_beer_soup)
    temp.to_pickle('beer_soup_'+str(i+230-1)+'.pkl')
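In [ ]:
# Reassembly sketch (not part of the original run): reload every chunk pickle
# written above and concatenate; filenames follow the 'beer_soup_<i+229>.pkl'
# pattern used by the loop.
chunks = [pd.read_pickle('beer_soup_'+str(i+229)+'.pkl')
          for i in range(0, beers.shape[0], 230)]
beer_soups = pd.concat(chunks)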
In [ ]:
# Backfill the every-230th rows skipped by the chunk loop above
temp = beers.iloc[229:beers.shape[0]:230].copy()
temp['soup'] = temp.url.map(get_beer_soup)
In [21]:
temp.head()
Out[21]:
In [22]:
temp.to_pickle('beer_soup_missing.pkl')
In [14]:
from tqdm import tqdm
# Sanity check: list the row indices the backfill slice visits
for i in tqdm(range(229, beers.shape[0], 230)):
    print(i)
In [ ]:
beer_reviews = {}
ba_url = 'http://www.beeradvocate.com'
# DataFrame.items() iterates over columns, not rows; itertuples yields one
# namedtuple per beer with .name and .url fields
for beer in beers.itertuples():
    soup = get_soup(ba_url + beer.url)
    print(beer.name)
    beer_reviews[beer.name] = get_beer_reviews(soup)
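In [ ]:
# get_beer_reviews is never defined in this notebook. A hypothetical sketch of
# what it might do, assuming each user review sits in a div whose id starts with
# 'rating_fullview' (the selector is a guess about BeerAdvocate's markup, not a
# confirmed detail of the site).
def get_beer_reviews(soup):
    reviews = soup.find_all('div', id=re.compile(r'^rating_fullview'))
    return [remove_punctuation(r.get_text()) for r in reviews]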