In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import re
import time
%matplotlib inline
In [2]:
# Load the toys table from S3; later cells read its 'toy_link' column.
toys_csv_uri = "s3://dogfaces/reviews/toys.csv"
df_toys = pd.read_csv(toys_csv_uri)
In [3]:
# Preview the first rows of df_toys.
df_toys.head()
Out[3]:
In [4]:
# Build the review-listing URL for the toy in the row labelled 0 and fetch one
# page of its reviews (newest first, all star ratings).
test_url = df_toys['toy_link'][0]
# Page of the review listing to request; named so it is easy to change.
page_number = 6
review_url = (
    test_url.replace('/dp/', '/product-reviews/')
    + '?reviewSort=NEWEST&reviewFilter=ALL_STARS&pageNumber={}'.format(page_number)
)
# Without a timeout, requests waits indefinitely on an unresponsive server.
r = requests.get(review_url, timeout=30)
# Fail here on a 4xx/5xx response instead of parsing an error page downstream.
r.raise_for_status()
In [5]:
# Show the URL that was requested. The call form of print gives the same
# output on Python 2 and also runs on Python 3.
print(review_url)
In [6]:
# Bare string literal, displayed as the cell output and not assigned to anything.
# NOTE(review): this URL has pageNumber=1, while the request cell builds its URL
# with pageNumber=6.
'https://www.chewy.com/kong-airdog-squeakair-ball-dog-toy/product-reviews/47728?reviewSort=NEWEST&reviewFilter=ALL_STARS&pageNumber=1'
Out[6]:
In [7]:
# Parse the fetched page body with the lxml parser.
soup = BeautifulSoup(r.content, features='lxml')
In [8]:
# Print the HTML attributes of every <li class="js-content"> element on the page.
# The loop variable `sec` is deliberately left bound afterwards: the following
# cells read fields from it, so it refers to the last matched element.
for sec in soup.select("li.js-content"):
    print(sec.attrs)
In [9]:
# Column layout sketch (presumably the intended review record schema — confirm):
# | reviewid | toyid | toy_name | user_name | stars | review_text | time | image | image_id | useful |
# `sec` is not defined in this cell; it is the loop variable of the
# `for sec in soup.select("li.js-content")` cell.
review_id = sec['data-content-id']
In [10]:
# Extract the star rating of the current review element `sec`.
# The rating is read from the srcset attribute of the <source> tag inside the
# stars span; the regex expects it to contain "rating-<value>.svg".
rating_raw = sec.select_one("span.ugc-list__list__stars").select_one("source")['srcset']
# Raw string so the regex escapes (\S, \.) are not treated as string escapes.
# The captured value is split on '_' and only the leading part is kept as an int.
rating = int(re.findall(r'rating-(\S*)\.svg', rating_raw)[0].split('_')[0])
print(rating)
In [11]:
# Text of the first <span itemprop="author"> inside the current review element.
# Raises IndexError if the element has no such span.
user_name = sec.find_all("span", {"itemprop": "author"})[0].get_text()
print(user_name)
In [13]:
# Text of the first <span itemprop="datePublished"> inside the current review
# element. Raises IndexError if the element has no such span.
date = sec.find_all("span", {"itemprop": "datePublished"})[0].get_text()
print(date)
In [177]:
# Text of the span.ugc-list__review__display element inside the current review.
review_text = sec.select_one("span.ugc-list__review__display").get_text()
print(review_text)
In [198]:
# Work out the picture fields for one review: the second-to-last
# <li class="js-content"> element on the page.
test_review_img = soup.select("li.js-content")[-2]
pic = test_review_img.select_one('a.js-open-modal.js-swap')

# Reset the outputs first so that a review without a picture leaves them as
# None, rather than undefined or holding values from an earlier run.
pic_link = pic_id = pic_name = None
if pic is not None:  # select_one returns None when nothing matches
    pic_link = pic["data-image"]
    pic_items = pic_link.split("/")
    # id: the two path segments before the file name, joined with '_'
    pic_id = "_".join(pic_items[-3:-1])
    # name: the same two segments plus the file name
    pic_name = "_".join(pic_items[-3:])
In [192]:
# Display the raw data-image attribute of the review's picture link.
# Raises TypeError if the review has no matching <a> (select_one returns None).
test_review_img.select_one("a.js-open-modal.js-swap")["data-image"]
Out[192]:
In [199]:
# Display the picture name built from the last three path segments of the link.
pic_name
Out[199]:
In [54]:
# NOTE(review): wildcard import. The cells below call get_review_content, get_df
# and fetch_and_store_reviews, which are not defined in this notebook and
# presumably come from list_reviews — confirm and switch to explicit imports.
from list_reviews import *
In [41]:
# Take the row at position 100 of the toys table as a sample input.
sample_position = 100
test_row = df_toys.iloc[sample_position]
In [42]:
# Time one call to get_review_content on the sample row (wall-clock seconds).
start_time = time.time()
# Returns two values, unpacked as reviews and pics; both are turned into
# DataFrames in later cells.
reviews, pics = get_review_content(test_row)
end_time = time.time()
In [43]:
# Elapsed wall-clock time of the get_review_content call, in seconds.
end_time - start_time
Out[43]:
In [44]:
# Size of the reviews object returned by get_review_content.
len(reviews)
Out[44]:
In [45]:
# Tabulate the returned reviews and preview the first rows.
df_temp = pd.DataFrame.from_dict(reviews, orient='columns')
df_temp.head(n=5)
Out[45]:
In [46]:
# Tabulate the returned picture records and show the last ten rows.
df_pic_temp = pd.DataFrame.from_dict(pics, orient='columns')
df_pic_temp.tail(n=10)
Out[46]:
In [51]:
# Show the picture URL stored in the row labelled 0. The call form of print
# gives the same output on Python 2 and also runs on Python 3.
print(df_pic_temp['pic_url'][0])
In [38]:
# (rows, columns) of the picture table.
df_pic_temp.shape
Out[38]:
In [52]:
# Display the current Unix timestamp (seconds since the epoch).
time.time()
Out[52]:
In [72]:
# Run the fetch-and-store step on a small test frame.
save_code = "test11"  # the S3 files read in the following cells have "test11" in their names
df_test = get_df(5)  # NOTE(review): the meaning of the argument 5 is not visible here — confirm in get_df
fetch_and_store_reviews(df_test, save_code)  # presumably writes the reviews/pictures CSVs read below — verify
In [60]:
# Read back the two review CSV parts stored under the "test11" save code.
reviews_part_1_uri = "s3://dogfaces/reviews/reviewstest11-1.csv"
reviews_part_2_uri = "s3://dogfaces/reviews/reviewstest11-2.csv"
df_temp_1 = pd.read_csv(reviews_part_1_uri)
df_temp_2 = pd.read_csv(reviews_part_2_uri)
In [61]:
# Last rows of the first review part.
df_temp_1.tail()
Out[61]:
In [67]:
# First rows of the second review part.
df_temp_2.head()
Out[67]:
In [65]:
# (rows, columns) of the second review part.
df_temp_2.shape
Out[65]:
In [69]:
# Read back the first pictures CSV part stored under the "test11" save code.
pictures_part_1_uri = "s3://dogfaces/reviews/picturestest11-1.csv"
df_temp_3 = pd.read_csv(pictures_part_1_uri)
In [70]:
# First rows of the pictures part.
df_temp_3.head()
Out[70]:
In [78]:
# Load the stored "cat5-final" reviews CSV and report its (rows, columns).
cat5_reviews_uri = "s3://dogfaces/reviews/reviews-1504809136_cat5-final-.csv"
df_test_5 = pd.read_csv(cat5_reviews_uri)
df_test_5.shape
Out[78]:
In [79]:
# First rows of the "cat5-final" reviews table.
df_test_5.head()
Out[79]:
In [75]:
# Total of the num_reviews column over df_test. The row count of df_test_5 is
# displayed a few cells above — presumably the two are meant to be compared; confirm.
sum(df_test.num_reviews.values)
Out[75]:
In [ ]: