In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_pandas, tqdm_notebook
import re
import requests
from bs4 import BeautifulSoup
import glob
from fake_useragent import UserAgent
# Rotating real-browser User-Agent strings for each request (basic anti-bot evasion).
ua = UserAgent()
# Register tqdm with pandas so .progress_apply shows a progress bar.
tqdm_pandas(tqdm())
In [3]:
import sys
# BeautifulSoup trees are deeply nested; pickling them recurses once per tag,
# so the default limit (~1000) is far too low. Raised drastically to let the
# soup-bearing DataFrames below round-trip through to_pickle/read_pickle.
sys.setrecursionlimit(100000000)
In [6]:
def get_soup(url, timeout=5):
    """Fetch `url` and return it parsed as a BeautifulSoup (lxml) document.

    Retries on non-OK HTTP status up to `timeout` times (the name is
    historical -- it is a retry budget, not seconds). Always returns a
    BeautifulSoup object; on persistent failure an empty soup is returned
    so callers can still .find()/.find_all() safely.
    """
    headers = {'User-Agent': ua.random}  # spoofed UA to avoid naive blocking
    try:
        response = requests.get(url, headers=headers)
    except requests.exceptions.RequestException:
        # Network-level failure (DNS, connection reset, timeout, ...).
        # Was `return 0`, which crashed callers expecting a soup.
        print("FAILED " + url)
        return BeautifulSoup('', 'lxml')
    attempts = 0
    while not response.ok:
        if attempts > timeout:
            print(url + ' failed with code: ' + str(response.status_code))
            return BeautifulSoup('', 'lxml')
        # Retry with the same spoofed headers (the original retry dropped them,
        # making retries more likely to be blocked than the first attempt).
        response = requests.get(url, headers=headers)
        attempts += 1
    return BeautifulSoup(response.text, 'lxml')
In [7]:
def get_beer_stats(row):
    """Parse one BeerAdvocate beer page into flat stat columns.

    Expects row['soup'] to hold the page's BeautifulSoup tree; mutates and
    returns `row`, so it is intended for DataFrame.apply(axis=1). The
    selectors are tightly coupled to BA's page layout at scrape time.
    """
    soup = row['soup']
    stats = soup.find(id='item_stats').find('dl')
    row['ba_score'] = soup.find(class_='BAscore_big ba-score').get_text()
    row['num_reviews'] = int(stats.find(class_='ba-reviews').get_text().replace(',', ''))
    row['num_ratings'] = int(stats.find(class_='ba-ratings').get_text().replace(',', ''))
    row['ravg'] = float(stats.find(class_='ba-ravg').get_text().replace(',', ''))
    row['pdev'] = float(stats.find(class_='ba-pdev').get_text().replace(',', '').replace('%', ''))
    row['wants'] = int(stats.find(class_='ba-wants').get_text().replace(',', ''))
    row['gots'] = int(stats.find(class_='ba-gots').get_text().replace(',', ''))
    row['for_trade'] = int(stats.find_all('dt')[-1].get_text().replace(',', ''))
    # Right-hand info box: brewery links, style link, ABV and availability text.
    info_div = soup.find('div', style="float:right;width:70%;")
    info_links = info_div.find_all('a')
    row['brewery_name'] = info_links[0].get_text()
    # Key deliberately kept as 'brewery_loation' (sic): downstream pickles and
    # consumers already use the misspelled column name; renaming would break them.
    row['brewery_loation'] = info_links[1].get_text()
    try:
        row['brewery_website'] = info_links[3]['href']
    except (IndexError, KeyError):  # was a bare except: no website link on this page
        row['brewery_website'] = ''
    row['beer_style'] = info_links[-1].get_text()
    row['style_url'] = info_links[-1]['href']
    # When the brewery has no website, link index 3 can be the style link itself.
    if row['brewery_website'] == row['style_url']:
        row['brewery_website'] = ''
    info_text = info_div.get_text()  # hoisted: was re-located for each regex below
    try:
        row['abv'] = float(re.findall(r'(?<=\(ABV\): )\d+\.\d+', info_text)[0])
    except IndexError:  # was a bare except: ABV not listed for this beer
        row['abv'] = np.nan
    row['availability'] = re.findall(r'(?<=Availability: )[\w\-]*', info_text)[0]
    return row
In [159]:
# Load a previously scraped batch: one row per beer, with its page's
# BeautifulSoup tree stored in the 'soup' column.
beers = pd.read_pickle('beer_soup_229.pkl')
In [160]:
# Re-arm the pandas progress bar, then parse stat columns out of each page.
tqdm_pandas(tqdm())
beers = beers.progress_apply(get_beer_stats,axis=1)
In [8]:
def get_beer_df_reviews(row):
    """Collect up to ~100 top reviews for the beer in `row`.

    The first page of reviews is already present in row['soup']; any further
    pages (25 reviews each, sorted by top rating) are fetched live from
    beeradvocate.com via get_soup(). Mutates and returns `row`, so it is
    intended for DataFrame.apply(axis=1).
    """
    ba_url = 'http://www.beeradvocate.com'
    url_suffix = '?sort=topr&start='
    row['reviews'] = get_beer_reviews(row['soup'])
    if row['num_reviews'] > 25:
        # Cap at 100 reviews (4 pages) to bound scraping time per beer.
        num_reviews = min(row['num_reviews'], 100)
        # Ceiling division so a partial last page is still fetched; the
        # original floor division skipped it (e.g. 26 reviews fetched no
        # extra page and silently lost review #26).
        num_pages = -(-num_reviews // 25)
        for i in range(1, num_pages):
            url = ba_url + row['url'] + url_suffix + str(i * 25)
            soup = get_soup(url)
            row['reviews'] += get_beer_reviews(soup)
    return row
def get_beer_reviews(soup):
    """Extract plain-text review bodies from one review-listing page.

    Returns a list of strings, one per review block on the page.
    """
    reviews = []
    for rating in soup.find_all(id='rating_fullview_content_2'):
        # Remove metadata <span>s (scores, rDev badge, user info) so only
        # the free-text review body remains.
        for span in rating.find_all('span'):
            span.extract()
        review = rating.get_text().strip()
        # get_text() already yields str; the original encoded to bytes and
        # then called bytes.replace() with str arguments (a TypeError on
        # Python 3) and str()-wrapped the bytes, leaking b'...' literals.
        reviews.append(review.replace('rDev', ''))
    return reviews
In [162]:
# Fetch the remaining review pages for every beer (network-heavy step).
tqdm_pandas(tqdm())
beers = beers.progress_apply(get_beer_df_reviews,axis = 1)
beers.head()
Out[162]:
In [165]:
# Snapshot the enriched frame to disk.
beers.to_pickle('test.pkl')
In [180]:
# NOTE(review): leftover scratch cell -- `filename` is overwritten on every
# iteration and never used afterwards; this loop has no effect and can be
# deleted. (The real batch loop in the next cell recomputes `filename`.)
for pkl in glob.glob('data/*.pkl'):
    filename = pkl.split('/')[1]
In [181]:
tqdm_pandas(tqdm())
# Batch-process every scraped pickle in data/: parse the stat columns, pull
# the review text, and write the enriched frame to temp/ under the same name.
for pkl in tqdm(glob.glob('data/*.pkl')):
    temp = pd.read_pickle(pkl)
    temp = temp.apply(get_beer_stats, axis=1)
    temp = temp.apply(get_beer_df_reviews, axis=1)
    # NOTE(review): '/'-split assumes POSIX-style paths; os.path.basename(pkl)
    # would be the portable equivalent.
    filename = pkl.split('/')[1]
    temp.to_pickle('temp/'+filename)
In [6]:
# Spot-check one processed batch from temp/.
beers = pd.read_pickle(glob.glob('temp/*.pkl')[0])
In [11]:
# Re-process the batch of beers that were missed on the first scraping pass.
# The result stays in `temp` (with its 'soup' column) for the concat cell below.
pkl = 'data/beer_soup_missing.pkl'
temp = pd.read_pickle(pkl)
temp = temp.apply(get_beer_stats, axis=1)
temp = temp.apply(get_beer_df_reviews, axis=1)
In [12]:
dfs = [temp]
for pkl in tqdm(glob.glob('temp/*.pkl')):
temp = pd.read_pickle(pkl)
temp.drop('soup',axis='columns',inplace=True)
dfs.append(temp)
beer_reviews = pd.concat(dfs)
dfs = []
beer_reviews.to_pickle('all_beers_reviews.pkl')
In [20]:
In [ ]: