In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import re
%matplotlib inline

Database structures for chewy.com

categories:

cat id category link page range
1 Balls and Fetch Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A317 19
2 Chew Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A316 15
3 Plush Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A320 17
4 Interactive Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A319 8
5 Rope and Tug https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A321 9

For chewy.com, each toy listing page is the category link + '&page=N', where N runs from 1 to the category's page range.

for each toy, build a toy table:
toys:

toyid name link catid picture_link price reviews review_link review_page_num
114660 usa-bones-chews-cotton-rope-bones https://www.chewy.com/usa-bones-chews-cotton-rope-bones/dp/114660 5 https://img.chewy.com/is/catalog/86885_MAIN._AC_SL400_V1477926797_.jpg 5.69 568 /usa-bones-chews-cotton-rope-bones/product-reviews/114660?reviewSort=NEWEST&reviewFilter=ALL_STARS&pageNumber=1 58

also build a review table:

reviewid toyid toy_name user_name stars review_text time image image_id useful
9809823 114660 usa-bones-chews-cotton-rope-bones Steffy 5 We have... Sep2, 2017 http://chewy.ugc.bazaarvoice.com/0090-en_us/2540529/photo.jpg 0090-en_us_2540529 0

Also build an image table:

image_id image_link image_name
0090-en_us_2540529 http://chewy.ugc.bazaarvoice.com/0090-en_us/2540529/photo.jpg 0090-en_us_2540529.jpg

In [2]:
# Build the category data frame.
# Each tuple below is one category: (cat_id, category, link, page range).
# zip() transposes the rows into the four per-column lists.
cat_id, category, link, pagerange = map(list, zip(
    (1, 'Balls and Fetch Toys', 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A317', 19),
    (2, 'Chew Toys', 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A316', 15),
    (3, 'Plush Toys', 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A320', 17),
    (4, 'Interactive Toys', 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A319', 8),
    (5, 'Rope and Tug', 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A321', 9),
))
df_cat = pd.DataFrame({
    'cat_id': cat_id,
    'category': category,
    'link': link,
    'page range': pagerange,
})

In [8]:
# Upload the category table to S3 as CSV text (no index column).
s3_res = boto3.resource('s3')
df_data = df_cat.to_csv(index=False)
s3_res.Bucket('dogfaces').put_object(Body=df_data, Key='reviews/category.csv')


Out[8]:
s3.Object(bucket_name='dogfaces', key='reviews/category.csv')

In [124]:
# Reload the category table from the CSV stored in S3 and preview its first rows.
df_cat = pd.read_csv("s3://dogfaces/reviews/category.csv")
df_cat.head()


Out[124]:
cat_id category link page range
0 1 Balls and Fetch Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2... 19
1 2 Chew Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2... 15
2 3 Plush Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2... 17
3 4 Interactive Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2... 8
4 5 Rope and Tug https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2... 9

Start with rope and tug


In [14]:
def get_cat_link(cat_id):
    """Look up the listing URL and page count for one toy category.

    The category table is taken from a module-level ``DF_CAT`` when that name
    exists; otherwise it is read from ``s3://dogfaces/reviews/category.csv``.

    Parameters
    ----------
    cat_id : int
        Value matched against the table's ``cat_id`` column.

    Returns
    -------
    tuple
        ``(link, page_range)`` from the ``link`` and ``page range`` columns of
        the first matching row.

    Raises
    ------
    IndexError
        If no row has the requested ``cat_id``.
    """
    try:
        df = DF_CAT
    except NameError:
        # NOTE(review): the cells in this notebook name the frame ``df_cat``,
        # so unless ``DF_CAT`` is defined elsewhere every call re-reads S3.
        df = pd.read_csv("s3://dogfaces/reviews/category.csv")
    # Filter once and read both columns from the same row.
    matches = df.loc[df['cat_id'] == cat_id, ['link', 'page range']]
    if matches.empty:
        # Same exception type as the bare ``.values[0]`` lookup, but it names the id.
        raise IndexError("no category with cat_id={!r}".format(cat_id))
    link = matches['link'].values[0]
    page_range = matches['page range'].values[0]
    return link, page_range

In [17]:
# Example lookup: listing URL and page count for the category with cat_id 1.
link, page_range = get_cat_link(1)

In [109]:
def _review_count(item):
    """Return the review count shown on a product card, or None if the card has no rating element."""
    rating = item.select('p.rating.item-rating')
    count_tag = rating[0].find('span') if rating else None
    if count_tag is None:
        return None
    return int(count_tag.get_text())


def get_toys(cat_id):
    """Scrape every listing page of one category and collect its well-reviewed toys.

    The category's link and page count come from ``get_cat_link(cat_id)``.
    Pages ``1 .. page_range`` are fetched as ``link + '&page=N'``. A page whose
    HTTP status is not 200 is skipped. Only product cards with more than 10
    reviews are kept; a card with no rating element is skipped rather than
    aborting the scrape.

    Parameters
    ----------
    cat_id : int
        Category id passed to ``get_cat_link``.

    Returns
    -------
    list of dict
        One dict per kept toy, with keys ``num_reviews``, ``toy_link``,
        ``toy_id``, ``toy_name``, ``picture_link`` and ``price``.

    Raises
    ------
    requests.exceptions.RequestException
        If a page request fails or exceeds the 30 second timeout.
    """
    link, page_range = get_cat_link(cat_id)
    res = []
    # range() works on both Python 2 and 3; the page count is small.
    for page in range(1, page_range + 1):
        toys_url = link + '&page={}'.format(page)
        # Without a timeout one stalled connection blocks the scrape indefinitely.
        r = requests.get(toys_url, timeout=30)
        if r.status_code != 200:
            continue
        # NOTE(review): no parser is named, so bs4 picks whichever is installed;
        # pin one (e.g. 'html.parser') if results must be reproducible.
        soup = BeautifulSoup(r.content)
        for item in soup.select("article.product-holder.cw-card.cw-card-hover"):
            num_reviews = _review_count(item)
            if num_reviews is None or num_reviews <= 10:
                continue
            # presumably href looks like '/<toy-name>/dp/<toy-id>' — confirm against the live page
            href = item.select('a')[0]['href']
            parts = href.split('/')
            res.append({
                'num_reviews': num_reviews,
                'toy_link': "https://www.chewy.com" + href,
                'toy_id': parts[-1],
                'toy_name': parts[1],
                'picture_link': "https:" + item.select('img')[0]['src'],
                # keep only the first whitespace-separated token of the price text
                'price': item.select('p.price')[0].get_text().split()[0],
            })
    return res

In [121]:
# Scrape the category with cat_id 3 and tabulate the returned toy records.
temp = get_toys(3)
df_test = pd.DataFrame(temp)

In [122]:
# Show one scraped product link. The parenthesised single-argument form prints
# the same text under Python 2 and is also valid Python 3.
print(df_test['toy_link'][10])


https://www.chewy.com/kong-tuggerknots-moose-dog-toy/dp/43095

Scrape reviews and images


In [3]:
# Load the toy table from the CSV stored in S3.
df_toys = pd.read_csv("s3://dogfaces/reviews/toys.csv")

In [4]:
# Preview the first rows of the toy table.
df_toys.head()


Out[4]:
cat_id num_reviews picture_link price toy_id toy_link toy_name
0 1 800 https://img.chewy.com/is/catalog/62758_MAIN._A... $1.19 47728 https://www.chewy.com/kong-airdog-squeakair-ba... kong-airdog-squeakair-ball-dog-toy
1 1 127 https://img.chewy.com/is/catalog/80753._AC_SS1... $2.99 108582 https://www.chewy.com/mammoth-monkey-fist-bar-... mammoth-monkey-fist-bar-dog-toy
2 1 292 https://img.chewy.com/is/catalog/62850._AC_SS1... $3.39 47880 https://www.chewy.com/kong-squeakair-birthday-... kong-squeakair-birthday-balls-dog
3 1 1233 https://img.chewy.com/is/catalog/77643._AC_SS1... $9.39 105502 https://www.chewy.com/kong-jumbler-ball-dog-to... kong-jumbler-ball-dog-toy-color
4 1 538 https://img.chewy.com/is/catalog/53235_MAIN._A... $6.85 38371 https://www.chewy.com/chuckit-ultra-rubber-bal... chuckit-ultra-rubber-ball-medium-2

In [3]:
# Use the first toy's product page as a sample and append the review query string to it.
test_url = df_toys['toy_link'][0]
review_url = '?'.join([test_url, 'reviewSort=NEWEST&reviewFilter=ALL_STARS&pageNumber=1'])
# NOTE(review): the request goes to test_url, not review_url — confirm which was intended.
r = requests.get(test_url)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-5bd0eef431e2> in <module>()
----> 1 test_url = df_toys['toy_link'][0]
      2 review_url = test_url+'?'+'reviewSort=NEWEST&reviewFilter=ALL_STARS&pageNumber=1'
      3 r = requests.get(test_url)

NameError: name 'df_toys' is not defined

In [8]:
#|reviewid|toyid|toy_name | user_name| stars| review_text| time| image|image_id| useful|
num_reviews =


Out[8]:
200

In [9]:
# Display the review URL built from the product link.
review_url


Out[9]:
'https://www.chewy.com/kong-airdog-squeakair-ball-dog-toy/dp/47728?reviewSort=NEWEST&reviewFilter=ALL_STARS&pageNumber=1'

In [1]:
# Hand-written review URL: it uses the '/product-reviews/<id>' path instead of
# the '/dp/<id>' path of the product link, and asks for page 3.
new_url = 'https://www.chewy.com/kong-airdog-squeakair-ball-dog-toy/product-reviews/47728?reviewSort=NEWEST&reviewFilter=ALL_STARS&pageNumber=3'
new_url


Out[1]:
'https://www.chewy.com/kong-airdog-squeakair-ball-dog-toy/product-reviews/47728?reviewSort=NEWEST&reviewFilter=ALL_STARS&pageNumber=3'

In [2]:
# Inspect the '/'-separated pieces of the product URL.
test_url.split('/')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-443b98e89d08> in <module>()
----> 1 test_url.split('/')

NameError: name 'test_url' is not defined

In [ ]: