In [106]:
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import re
%matplotlib inline
Database structures at chewy.com
categories:
cat id | category | link | page range |
---|---|---|---|
1 | Balls and Fetch Toys | https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A317 | 19 |
2 | Chew Toys | https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A316 | 15 |
3 | Plush Toys | https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A320 | 17 |
4 | Interactive Toys | https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A319 | 8 |
5 | Rope and Tug | https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A321 | 9 |
For chewy.com, each toy listing page is: category link + '&page=N', where N runs from 1 to the category's page range.
For each toy, build a toy table:
toys:
toyid | name | link | catid | picture_link | price | reviews | review_link | review_page_num |
---|---|---|---|---|---|---|---|---|
114660 | usa-bones-chews-cotton-rope-bones | https://www.chewy.com/usa-bones-chews-cotton-rope-bones/dp/114660 | 5 | https://img.chewy.com/is/catalog/86885_MAIN._AC_SL400_V1477926797_.jpg | 5.69 | 568 | /usa-bones-chews-cotton-rope-bones/product-reviews/114660?reviewSort=NEWEST&reviewFilter=ALL_STARS&pageNumber=1 | 58 |
Also build a review table:
reviewid | toyid | toy_name | user_name | stars | review_text | time | image | image_id | useful |
---|---|---|---|---|---|---|---|---|---|
9809823 | 114660 | usa-bones-chews-cotton-rope-bones | Steffy | 5 | We have... | Sep2, 2017 | http://chewy.ugc.bazaarvoice.com/0090-en_us/2540529/photo.jpg | 0090-en_us_2540529 | 0 |
Also build an image table:
image_id | image_link | image_name |
---|---|---|
0090-en_us_2540529 | http://chewy.ugc.bazaarvoice.com/0090-en_us/2540529/photo.jpg | 0090-en_us_2540529.jpg |
In [2]:
# Category lookup table for the chewy.com dog-toy scrape.
# The four lists are parallel: position i in each list describes the same category.
cat_id = list(range(1, 6))
category = [
    'Balls and Fetch Toys',
    'Chew Toys',
    'Plush Toys',
    'Interactive Toys',
    'Rope and Tug',
]
link = [
    'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A317',
    'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A316',
    'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A320',
    'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A319',
    'https://www.chewy.com/s?rh=c%3A288%2Cc%3A315%2Cc%3A321',
]
# Number of listing pages available for each category.
pagerange = [19, 15, 17, 8, 9]

# Assemble the frame; the column order is spelled out so it is explicit.
df_cat = pd.DataFrame(
    {
        'cat_id': cat_id,
        'category': category,
        'link': link,
        'page range': pagerange,
    },
    columns=['cat_id', 'category', 'link', 'page range'],
)
In [8]:
# Persist the category table: render it as CSV text (row index omitted) and
# store it under reviews/category.csv in the 'dogfaces' bucket.
s3_res = boto3.resource('s3')
df_data = df_cat.to_csv(index=False)
s3_res.Bucket('dogfaces').put_object(
    Body=df_data,
    Key='reviews/category.csv',
)
Out[8]:
In [9]:
# Reload the category table from its S3 copy and preview the first five rows.
# Note: this rebinds df_cat to the frame parsed back from the CSV.
df_cat = pd.read_csv("s3://dogfaces/reviews/category.csv")
df_cat.head(n=5)
Out[9]:
In [14]:
def get_cat_link(cat_id, df=None):
    """Look up the listing URL and page count for one toy category.

    Parameters
    ----------
    cat_id : int
        Category identifier, matched against the ``cat_id`` column.
    df : pandas.DataFrame, optional
        Category table with ``cat_id``, ``link`` and ``page range`` columns.
        When omitted, the table is taken from the notebook global ``DF_CAT``
        or ``df_cat`` if either is defined; otherwise it is read from
        ``s3://dogfaces/reviews/category.csv``.

    Returns
    -------
    tuple
        ``(link, page_range)`` taken from the first row whose ``cat_id``
        equals the requested id.

    Raises
    ------
    IndexError
        If no row of the table matches ``cat_id``.
    """
    if df is None:
        # Prefer a table that is already in memory over an S3 round trip.
        # The notebook binds the table to `df_cat`; `DF_CAT` is still honoured
        # for callers that define it.
        namespace = globals()
        df = namespace.get('DF_CAT', namespace.get('df_cat'))
    if df is None:
        df = pd.read_csv("s3://dogfaces/reviews/category.csv")

    # Filter once and read both fields from the same matching rows.
    rows = df[df['cat_id'] == cat_id]
    if rows.empty:
        # Same exception type as indexing an empty array, but with a message
        # that names the offending id.
        raise IndexError('no category with cat_id={!r}'.format(cat_id))

    link = rows['link'].values[0]
    page_range = rows['page range'].values[0]
    return link, page_range
In [17]:
# Fetch the listing URL and page count for category 1.
# Note: this rebinds `link` from the list of category URLs to a single URL.
link, page_range = get_cat_link(cat_id=1)
In [109]:
def _parse_review_count(item):
    """Return the review count shown on a product card, or 0 if it cannot be read."""
    rating = item.select('p.rating.item-rating')
    if not rating:
        # Presumably cards without any reviews omit the rating element;
        # TODO confirm against the live markup.
        return 0
    span = rating[0].find('span')
    if span is None:
        return 0
    # Tolerate surrounding whitespace and thousands separators.
    text = span.get_text().strip().replace(',', '')
    try:
        return int(text)
    except ValueError:
        return 0


def get_toys(cat_id, min_reviews=10):
    """Scrape the listing pages of one category and collect well-reviewed toys.

    Parameters
    ----------
    cat_id : int
        Category identifier passed to ``get_cat_link``.
    min_reviews : int, optional
        Only toys with strictly more reviews than this are kept (default 10).

    Returns
    -------
    list of dict
        One dict per kept toy with the keys ``num_reviews``, ``toy_link``,
        ``toy_id``, ``toy_name``, ``picture_link`` and ``price``.

    Notes
    -----
    Pages whose HTTP status is not 200 are skipped silently. Cards whose
    review count is missing or unparseable are treated as having 0 reviews.
    """
    link, page_range = get_cat_link(cat_id)
    res = []
    # Listing pages are numbered from 1; `range` works on Python 2 and 3.
    for page in range(1, page_range + 1):
        toys_url = link + '&page={}'.format(page)
        r = requests.get(toys_url)
        if r.status_code != 200:
            continue
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(r.content, 'html.parser')
        for item in soup.select("article.product-holder.cw-card.cw-card-hover"):
            num_reviews = _parse_review_count(item)
            if num_reviews <= min_reviews:
                continue
            # href is used as '/<toy-name>/dp/<toy-id>': segment 1 is the
            # name and the last segment is the id.
            href = item.select('a')[0]['href']
            parts = href.split('/')
            toy = {}
            toy['num_reviews'] = num_reviews
            toy['toy_link'] = "https://www.chewy.com" + href
            toy['toy_id'] = parts[-1]
            toy['toy_name'] = parts[1]
            # Image src is assumed to be protocol-relative ('//host/...').
            toy['picture_link'] = "https:" + item.select('img')[0]['src']
            # Keep only the first whitespace-separated token of the price text.
            toy['price'] = item.select('p.price')[0].get_text().split()[0]
            res.append(toy)
    return res
In [121]:
# Scrape category 3 and tabulate the result: one row per toy record.
temp = get_toys(cat_id=3)
df_test = pd.DataFrame(temp)
In [122]:
# Spot-check one scraped product URL (row label 10).
# Written as a call so the cell runs on both Python 2 and Python 3 kernels.
print(df_test['toy_link'][10])
In [ ]: