In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_pandas, tqdm_notebook
import re
import requests
from bs4 import BeautifulSoup
import glob
from fake_useragent import UserAgent
# Rotating real-browser User-Agent strings for each request (basic anti-bot evasion).
ua = UserAgent()
# Register tqdm with pandas so .progress_apply shows a progress bar.
tqdm_pandas(tqdm())
In [3]:
import sys
# BeautifulSoup trees are deeply nested; pickling them recurses once per tag,
# so the default limit (~1000) is far too low. Raised drastically to let the
# soup-bearing DataFrames below round-trip through to_pickle/read_pickle.
sys.setrecursionlimit(100000000)
In [6]:
def get_soup(url, timeout=5):
    """Fetch `url` and return it parsed as a BeautifulSoup (lxml) document.

    Retries on non-OK HTTP status up to `timeout` times (the name is
    historical -- it is a retry budget, not seconds). Always returns a
    BeautifulSoup object; on persistent failure an empty soup is returned
    so callers can still .find()/.find_all() safely.
    """
    headers = {'User-Agent': ua.random}  # spoofed UA to avoid naive blocking
    try:
        response = requests.get(url, headers=headers)
    except requests.exceptions.RequestException:
        # Network-level failure (DNS, connection reset, timeout, ...).
        # Was `return 0`, which crashed callers expecting a soup.
        print("FAILED " + url)
        return BeautifulSoup('', 'lxml')
    attempts = 0
    while not response.ok:
        if attempts > timeout:
            print(url + ' failed with code: ' + str(response.status_code))
            return BeautifulSoup('', 'lxml')
        # Retry with the same spoofed headers (the original retry dropped them,
        # making retries more likely to be blocked than the first attempt).
        response = requests.get(url, headers=headers)
        attempts += 1
    return BeautifulSoup(response.text, 'lxml')
In [7]:
def get_beer_stats(row):
    """Parse one BeerAdvocate beer page into flat stat columns.

    Expects row['soup'] to hold the page's BeautifulSoup tree; mutates and
    returns `row`, so it is intended for DataFrame.apply(axis=1). The
    selectors are tightly coupled to BA's page layout at scrape time.
    """
    soup = row['soup']
    stats = soup.find(id='item_stats').find('dl')
    row['ba_score'] = soup.find(class_='BAscore_big ba-score').get_text()
    row['num_reviews'] = int(stats.find(class_='ba-reviews').get_text().replace(',', ''))
    row['num_ratings'] = int(stats.find(class_='ba-ratings').get_text().replace(',', ''))
    row['ravg'] = float(stats.find(class_='ba-ravg').get_text().replace(',', ''))
    row['pdev'] = float(stats.find(class_='ba-pdev').get_text().replace(',', '').replace('%', ''))
    row['wants'] = int(stats.find(class_='ba-wants').get_text().replace(',', ''))
    row['gots'] = int(stats.find(class_='ba-gots').get_text().replace(',', ''))
    row['for_trade'] = int(stats.find_all('dt')[-1].get_text().replace(',', ''))
    # Right-hand info box: brewery links, style link, ABV and availability text.
    info_div = soup.find('div', style="float:right;width:70%;")
    info_links = info_div.find_all('a')
    row['brewery_name'] = info_links[0].get_text()
    # Key deliberately kept as 'brewery_loation' (sic): downstream pickles and
    # consumers already use the misspelled column name; renaming would break them.
    row['brewery_loation'] = info_links[1].get_text()
    try:
        row['brewery_website'] = info_links[3]['href']
    except (IndexError, KeyError):  # was a bare except: no website link on this page
        row['brewery_website'] = ''
    row['beer_style'] = info_links[-1].get_text()
    row['style_url'] = info_links[-1]['href']
    # When the brewery has no website, link index 3 can be the style link itself.
    if row['brewery_website'] == row['style_url']:
        row['brewery_website'] = ''
    info_text = info_div.get_text()  # hoisted: was re-located for each regex below
    try:
        row['abv'] = float(re.findall(r'(?<=\(ABV\): )\d+\.\d+', info_text)[0])
    except IndexError:  # was a bare except: ABV not listed for this beer
        row['abv'] = np.nan
    row['availability'] = re.findall(r'(?<=Availability: )[\w\-]*', info_text)[0]
    return row
In [159]:
# Load a previously scraped batch: one row per beer, with its page's
# BeautifulSoup tree stored in the 'soup' column.
beers = pd.read_pickle('beer_soup_229.pkl')
In [160]:
# Re-arm the pandas progress bar, then parse stat columns out of each page.
tqdm_pandas(tqdm())
beers = beers.progress_apply(get_beer_stats,axis=1)
In [8]:
def get_beer_df_reviews(row):
    """Collect up to ~100 top reviews for the beer in `row`.

    The first page of reviews is already present in row['soup']; any further
    pages (25 reviews each, sorted by top rating) are fetched live from
    beeradvocate.com via get_soup(). Mutates and returns `row`, so it is
    intended for DataFrame.apply(axis=1).
    """
    ba_url = 'http://www.beeradvocate.com'
    url_suffix = '?sort=topr&start='
    row['reviews'] = get_beer_reviews(row['soup'])
    if row['num_reviews'] > 25:
        # Cap at 100 reviews (4 pages) to bound scraping time per beer.
        num_reviews = min(row['num_reviews'], 100)
        # Ceiling division so a partial last page is still fetched; the
        # original floor division skipped it (e.g. 26 reviews fetched no
        # extra page and silently lost review #26).
        num_pages = -(-num_reviews // 25)
        for i in range(1, num_pages):
            url = ba_url + row['url'] + url_suffix + str(i * 25)
            soup = get_soup(url)
            row['reviews'] += get_beer_reviews(soup)
    return row
def get_beer_reviews(soup):
    """Extract plain-text review bodies from one review-listing page.

    Returns a list of strings, one per review block on the page.
    """
    reviews = []
    for rating in soup.find_all(id='rating_fullview_content_2'):
        # Remove metadata <span>s (scores, rDev badge, user info) so only
        # the free-text review body remains.
        for span in rating.find_all('span'):
            span.extract()
        review = rating.get_text().strip()
        # get_text() already yields str; the original encoded to bytes and
        # then called bytes.replace() with str arguments (a TypeError on
        # Python 3) and str()-wrapped the bytes, leaking b'...' literals.
        reviews.append(review.replace('rDev', ''))
    return reviews
In [162]:
# Fetch the remaining review pages for every beer (network-heavy step).
tqdm_pandas(tqdm())
beers = beers.progress_apply(get_beer_df_reviews,axis = 1)
beers.head()
Out[162]:
In [165]:
# Snapshot the enriched frame to disk.
beers.to_pickle('test.pkl')
In [180]:
# NOTE(review): leftover scratch cell -- `filename` is overwritten on every
# iteration and never used afterwards; this loop has no effect and can be
# deleted. (The real batch loop in the next cell recomputes `filename`.)
for pkl in glob.glob('data/*.pkl'):
    filename = pkl.split('/')[1]
In [181]:
tqdm_pandas(tqdm())
# Batch-process every scraped pickle in data/: parse the stat columns, pull
# the review text, and write the enriched frame to temp/ under the same name.
for pkl in tqdm(glob.glob('data/*.pkl')):
    temp = pd.read_pickle(pkl)
    temp = temp.apply(get_beer_stats, axis=1)
    temp = temp.apply(get_beer_df_reviews, axis=1)
    # NOTE(review): '/'-split assumes POSIX-style paths; os.path.basename(pkl)
    # would be the portable equivalent.
    filename = pkl.split('/')[1]
    temp.to_pickle('temp/'+filename)
In [6]:
# Spot-check one processed batch from temp/.
beers = pd.read_pickle(glob.glob('temp/*.pkl')[0])
In [11]:
# Re-process the batch of beers that were missed on the first scraping pass.
# The result stays in `temp` (with its 'soup' column) for the concat cell below.
pkl = 'data/beer_soup_missing.pkl'
temp = pd.read_pickle(pkl)
temp = temp.apply(get_beer_stats, axis=1)
temp = temp.apply(get_beer_df_reviews, axis=1)
In [12]:
dfs = [temp]
for pkl in tqdm(glob.glob('temp/*.pkl')):
temp = pd.read_pickle(pkl)
temp.drop('soup',axis='columns',inplace=True)
dfs.append(temp)
beer_reviews = pd.concat(dfs)
dfs = []
beer_reviews.to_pickle('all_beers_reviews.pkl')
In [20]:
In [ ]: