In [106]:
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import re
%matplotlib inline

Database table structures for chewy.com

categories:

cat id category link page range
1 Balls and Fetch Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A317 19
2 Chew Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A316 15
3 Plush Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A320 17
4 Interactive Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A319 8
5 Rope and Tug https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A321 9

For chewy.com, each toy listing page URL is: category link + '&page=N', where N runs from 1 to the category's page range.

for each toy, build a toy table:
toys:

toyid name link catid picture_link price reviews review_link review_page_num
114660 usa-bones-chews-cotton-rope-bones https://www.chewy.com/usa-bones-chews-cotton-rope-bones/dp/114660 5 https://img.chewy.com/is/catalog/86885_MAIN._AC_SL400_V1477926797_.jpg 5.69 568 /usa-bones-chews-cotton-rope-bones/product-reviews/114660?reviewSort=NEWEST&reviewFilter=ALL_STARS&pageNumber=1 58

also build a review table:

reviewid toyid toy_name user_name stars review_text time image image_id useful
9809823 114660 usa-bones-chews-cotton-rope-bones Steffy 5 We have... Sep2, 2017 http://chewy.ugc.bazaarvoice.com/0090-en_us/2540529/photo.jpg 0090-en_us_2540529 0

also build an image table:

image_id image_link image_name
0090-en_us_2540529 http://chewy.ugc.bazaarvoice.com/0090-en_us/2540529/photo.jpg 0090-en_us_2540529.jpg

In [2]:
# Category table: one row per chewy.com dog-toy category, written as
# (cat_id, category, link, page range).
_CATEGORY_ROWS = [
    (1, 'Balls and Fetch Toys', 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A317', 19),
    (2, 'Chew Toys', 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A316', 15),
    (3, 'Plush Toys', 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A320', 17),
    (4, 'Interactive Toys', 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A319', 8),
    (5, 'Rope and Tug', 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A321', 9),
]
# Transpose the rows into the per-column lists the frame is built from.
cat_id, category, link, pagerange = (list(column) for column in zip(*_CATEGORY_ROWS))
df_cat = pd.DataFrame({
    'cat_id': cat_id,
    'category': category,
    'link': link,
    'page range': pagerange,
})

In [8]:
# Upload the category table to the "dogfaces" bucket as CSV text
# (index column omitted) under the key reviews/category.csv.
s3_res = boto3.resource('s3')
df_data = df_cat.to_csv(index=False)
dogfaces_bucket = s3_res.Bucket('dogfaces')
dogfaces_bucket.put_object(Key='reviews/category.csv', Body=df_data)


Out[8]:
s3.Object(bucket_name='dogfaces', key='reviews/category.csv')

In [9]:
# Reload the category table from S3 (replacing the in-memory frame) and
# preview its first rows to confirm the upload round-trips.
category_csv_uri = "s3://dogfaces/reviews/category.csv"
df_cat = pd.read_csv(category_csv_uri)
df_cat.head(n=5)


Out[9]:
cat_id category link page range
0 1 Balls and Fetch Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2... 19
1 2 Chew Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2... 15
2 3 Plush Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2... 17
3 4 Interactive Toys https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2... 8
4 5 Rope and Tug https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2... 9

Start with rope and tug


In [14]:
def get_cat_link(cat_id, df=None):
    """Return the listing URL and page count for one toy category.

    Parameters
    ----------
    cat_id : int
        Value matched against the ``cat_id`` column of the category table.
    df : pandas.DataFrame, optional
        Category table with ``cat_id``, ``link`` and ``page range`` columns.
        When omitted, a table already held in the notebook namespace is used
        (``DF_CAT`` first, then ``df_cat``); only if neither exists is the
        table read from ``s3://dogfaces/reviews/category.csv``.

    Returns
    -------
    tuple
        ``(link, page_range)`` from the first row whose ``cat_id`` matches.

    Raises
    ------
    IndexError
        If no row of the category table has the requested ``cat_id``.
    """
    if df is None:
        # The original referenced only ``DF_CAT``; the visible cells name the
        # frame ``df_cat``, so that lookup always fell through to an S3
        # download. Check both names before paying for the network read.
        scope = globals()
        df = scope.get('DF_CAT')
        if df is None:
            df = scope.get('df_cat')
    if df is None:
        df = pd.read_csv("s3://dogfaces/reviews/category.csv")

    # Filter once and reuse the matching rows for both columns.
    matches = df[df['cat_id'] == cat_id]
    if matches.empty:
        # Same exception type the bare ``.values[0]`` raised, with a usable message.
        raise IndexError("no category with cat_id={!r}".format(cat_id))
    link = matches['link'].values[0]
    page_range = matches['page range'].values[0]
    return link, page_range

In [17]:
# Look up the listing link and page count for cat_id 1. This rebinds the
# module-level name `link`, which the category cell used for its list of URLs.
# NOTE(review): the note before this section says to start with Rope and Tug
# (cat_id 5 in the category table), but this call uses cat_id 1 — confirm.
link, page_range = get_cat_link(1)

In [109]:
def get_toys(cat_id, min_reviews=10, timeout=30):
    """Scrape every listing page of a category and collect its reviewed toys.

    Parameters
    ----------
    cat_id : int
        Category id passed to ``get_cat_link`` to obtain the listing URL and
        the number of result pages.
    min_reviews : int, optional
        Only toys with strictly more than this many reviews are kept
        (default 10, the previously hard-coded threshold).
    timeout : float, optional
        Seconds to wait for each page request (default 30). Without a timeout
        a stalled connection would block the scrape indefinitely; with one,
        ``requests`` raises a timeout exception instead.

    Returns
    -------
    list of dict
        One dict per kept toy with the keys ``num_reviews``, ``toy_link``,
        ``toy_id``, ``toy_name``, ``picture_link`` and ``price``.

    Notes
    -----
    Pages that do not answer with HTTP 200 are skipped, as are product cards
    whose review count is missing or not an integer.
    """
    link, page_range = get_cat_link(cat_id)
    res = []
    # ``range`` works on both Python 2 and 3; ``xrange`` exists only on Python 2.
    for page in range(1, page_range + 1):
        toys_url = link + '&page={}'.format(page)
        r = requests.get(toys_url, timeout=timeout)
        if r.status_code != 200:
            continue
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(r.content, 'html.parser')
        for item in soup.select("article.product-holder.cw-card.cw-card-hover"):
            num_reviews = _review_count(item)
            if num_reviews is None or num_reviews <= min_reviews:
                continue
            # href is expected to look like '/<toy-name>/dp/<toy-id>'.
            href = item.select('a')[0]['href']
            segments = href.split('/')
            toy = {}
            toy['num_reviews'] = num_reviews
            toy['toy_link'] = "https://www.chewy.com" + href
            toy['toy_id'] = segments[-1]
            toy['toy_name'] = segments[1]
            toy['picture_link'] = "https:" + item.select('img')[0]['src']
            toy['price'] = item.select('p.price')[0].get_text().split()[0]
            res.append(toy)
    return res


def _review_count(item):
    """Return the review count shown on a product card, or None if unavailable."""
    rating = item.select('p.rating.item-rating')
    if not rating:
        return None
    span = rating[0].find('span')
    if span is None:
        return None
    # Tolerate surrounding whitespace and thousands separators such as "1,234".
    text = span.get_text().strip().replace(',', '')
    try:
        return int(text)
    except ValueError:
        return None

In [121]:
# Scrape the category with cat_id 3 and tabulate the toy records,
# one row per toy dict.
temp = get_toys(3)
df_test = pd.DataFrame(temp)

In [122]:
# Spot-check one scraped record: the product link in the row labelled 10.
# The parenthesised form prints the same on Python 2 and is valid on Python 3.
print(df_test.loc[10, 'toy_link'])


https://www.chewy.com/kong-tuggerknots-moose-dog-toy/dp/43095

In [ ]: