In [1]:
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup # For HTML parsing
import requests
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
#from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import nltk
import pandas as pd # For converting results to a dataframe and bar chart plots
%matplotlib inline
In [2]:
import csv
import datetime
import time
In [3]:
import sqlalchemy
from sqlalchemy import create_engine
In [4]:
%load_ext watermark
In [9]:
%watermark
In [10]:
def getVacuumTypeUrl(vacuumType,pageNum=1):
    vcleaners={"central":11333709011,"canister":510108,"handheld":510114,"robotic":3743561,"stick":510112,"upright":510110,"wetdry":553022}
    url_type_base="https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_"+str(pageNum)+"?ie=UTF8&node="
    url=url_type_base+str(vacuumType)+"&page="+str(pageNum)
    print(url)
    return url
In [11]:
vcleaners={"central":11333709011,"canister":510108,"handheld":510114,"robotic":3743561,"stick":510112,"upright":510110,"wetdry":553022}
for key in vcleaners:
    print(key,vcleaners[key])
    getVacuumTypeUrl(vcleaners[key])
In [12]:
def getFinalPageNum(url,maxretrytime=20):
    passed=False
    cnt=0
    while(passed==False):
        cnt+=1
        print("iteration from getFinalPageNum=",cnt)
        if(cnt>maxretrytime):
            raise Exception("Error from getFinalPageNum(url)! Tried too many times but we are still blocked by Amazon.")
        try:
            with requests.Session() as session:
                session.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"}
                r=session.get(url)
                if (r.status_code==200):
                    soup=BeautifulSoup(r.content,"lxml")
                    if("Robot Check" in soup.text):
                        print("we are blocked!")
                    else:
                        tagsFinalPageNum=soup.select("span[class='pagnDisabled']")
                        finalPageNum=str(tagsFinalPageNum[0].text)
                        passed=True
                else:
                    print("Connection failed. Reconnecting...")
        except:
            print("Error from getFinalPageNum(url)! Probably due to connection time out")
    return finalPageNum
In [13]:
def InferFinalPageNum(vacuumType,pageNum=1,times=10):
    url=getVacuumTypeUrl(vacuumType,pageNum)
    list_finalpageNum=[]
    for j in range(times):
        finalpageNum=getFinalPageNum(url)
        list_finalpageNum.append(finalpageNum)
    FinalpageNum=min(list_finalpageNum)
    return FinalpageNum
In [14]:
FinalPageNum=InferFinalPageNum(510114,pageNum=1)
print('FinalPageNum=',FinalPageNum)
In [15]:
def urlsGenerator(typenode,FinalPageNum):
    # Note: 'typenode' and 'FinalPageNum' are both strings
    URLs=[]
    pageIdx=1
    while(pageIdx<=int(FinalPageNum)):
        url_Type="https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_"+str(pageIdx)+"?ie=UTF8&node="
        url=url_Type+str(typenode)+"&page="+str(pageIdx)
        #print(url)
        URLs.append(url)
        pageIdx+=1
    return URLs
For the moment, let us choose the vacuum type "handheld":
In [16]:
URLs=urlsGenerator(510114,FinalPageNum)
len(URLs)
for url in URLs:
    print(url)
In [ ]:
def soupGenerator(URLs,maxretrytime=20):
    soups=[]
    urlindex=0
    for URL in URLs:
        urlindex+=1
        print("urlindex=",urlindex)
        passed=False
        cnt=0
        while(passed==False):
            cnt+=1
            print("iteration=",cnt)
            if(cnt>maxretrytime):
                raise Exception("Error from soupGenerator(url,maxretrytime=20)! Tried too many times but we are still blocked by Amazon.")
            try:
                with requests.Session() as session:
                    session.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"}
                    r=session.get(URL)
                    if (r.status_code==200):
                        soup=BeautifulSoup(r.content,"lxml")
                        if("Robot Check" in soup.text):
                            print("we are blocked!")
                        else:
                            print("we are not blocked!")
                            soups.append(soup)
                            passed=True
                    else:
                        print("Connection failed. Reconnecting...")
            except:
                print("Error from soupGenerator(URLs,maxretrytime=20)! Probably due to connection time out")
    return soups
In [19]:
soups=soupGenerator(URLs,maxretrytime=20)
How many soups have we created?
In [20]:
print(len(soups))
In [181]:
example='''
<span class="abc">
<div>
<a href="http://123xyz.com"></a>
hello_div01
</div>
</span>
<span class="def">
<a href="http://www.go.123xyz"></a>
<div>hello_div02</div>
</span>
'''
In [182]:
mysoup=BeautifulSoup(example,"lxml")
In [183]:
print(mysoup.prettify())
In [187]:
mysoup.select(".abc a")
Out[187]:
In [288]:
mysoup.select(".abc > a")
Out[288]:
The symbol > indicates that we want to look for a tags that are direct children of the tag whose class is "abc".
If we use ".abc a" instead, we find all a-tag descendants (at any depth) of the tag whose class is "abc".
In [289]:
mysoup.select(".abc > div")
Out[289]:
In [290]:
mysoup.select("a[href^='http']")
Out[290]:
In [291]:
mysoup.select("a[href$='http']")
Out[291]:
In [185]:
mysoup.select(".abc a")[0]["href"]
Out[185]:
For more details on CSS attribute selectors, see https://developer.mozilla.org/en-US/docs/Web/CSS/Attribute_selectors
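The $= operator used above matches attribute values that end with a given string. As one extra sketch on the same toy soup (an illustrative addition, not a cell from the original session), the *= operator matches values that merely contain a substring:
# '*=' selects a tags whose href contains the substring '123xyz';
# in this toy example it matches both links.
mysoup.select("a[href*='123xyz']")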
In [581]:
sp=soups[70].select('li[id^="result_"]')[0]
print(sp)
for s in sp:
    try:
        print(s.span)
    except:
        print("error")
Print the link of the first page:
In [21]:
URLs=urlsGenerator(510114,FinalPageNum)
len(URLs)
print(URLs[0])
#for url in URLs:
# print(url)
We found that the Product URL of the first item can be extracted via:
In [22]:
soups[0].select('li[id^="result_"]')[0].select("a[class='a-link-normal s-access-detail-page a-text-normal']")[0]
Out[22]:
where we have used the fact that each item has one unique id.
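As a quick sanity check of that assumption (an illustrative cell, not from the original run), we can collect the result ids on the first page and confirm that none of them repeats:
# every search result sits in an li whose id starts with "result_";
# equal counts below mean each id appears exactly once on the page
ids = [li["id"] for li in soups[0].select('li[id^="result_"]')]
print(len(ids), len(set(ids)))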
In [265]:
csrev_tag=soups[0].select('li[id^="result_"]')[0].select("a[href$='customerReviews']")[0]
print(csrev_tag)
This means we are able to obtain the total number of customer reviews (10,106) as well as the review link of the selected item.
The above link will then be replaced by the following one,
which shows 50 customer reviews per page (instead of the default 10 reviews per page).
Now, let's look for more information, e.g. the price of the selected product. The tag we have just found sits near the end of a larger tag that contains all the information about a specific item, so to retrieve more of that item's details we will gradually work our way from the end of that container back towards the front.
In [266]:
csrev_tag.parent
Out[266]:
In [317]:
csrev_tag.parent.previous_sibling.previous_sibling
Out[317]:
In [326]:
pricetag=csrev_tag.parent.previous_sibling.previous_sibling
price=pricetag.select(".sx-price-whole")[0].text
fraction_price=pricetag.select(".sx-price-fractional")[0].text
print(price,fraction_price)
print(int(price)+0.01*int(fraction_price))
So we are able to obtain the price of the selected item.
In [333]:
pricetag.parent
Out[333]:
In [335]:
pricetag.previous_sibling.parent.select(".a-size-small")[2].text
Out[335]:
In [55]:
for j in range(30):
    try:
        #selected=soups[2].select('li[id^="result_"]')[j].select_one("span[class='a-declarative']")
        selected=soups[2].select('li[id^="result_"]')[j].select_one("i[class='a-icon a-icon-popover']").previous_sibling
        print(len(selected),selected.string.split(" ")[0])
    except:
        print("index= ",j,", 0 stars (no reviews yet)")
In [614]:
print(soups[10].select('li[id^="result_"]')[0].find_all("a")[2]["href"]) # 5 stars (although only 2 reviews)
In [615]:
print(soups[12].select('li[id^="result_"]')[0].find_all("a")[2]["href"]) # 0 stars (no customer reviews yet)
In [658]:
def items_info_extractor(soups):
    item_links=[]
    item_num_of_reviews=[]
    item_prices=[]
    item_names=[]
    item_ids=[]
    item_brands=[]
    item_avestars=[]
    for soup in soups:
        items=soup.select('li[id^="result_"]')
        for item in items:
            link_item=item.select("a[href$='customerReviews']")
            # ignore items which contain 0 customer reviews; those items are irrelevant to us
            if (link_item !=[]):
                price_tag=link_item[0].parent.previous_sibling.previous_sibling
                price_main_tag=price_tag.select(".sx-price-whole")
                price_fraction_tag=price_tag.select(".sx-price-fractional")
                link=link_item[0]["href"]
                # Ignore items which don't have normal price tags.
                # Those are items which are not sold by Amazon directly.
                # Also, remove items which are ads (3 ads are shown on each page).
                if((price_main_tag !=[]) & (price_fraction_tag !=[]) & (link.endswith("spons#customerReviews") == False)):
                    # extract the item's name and ID from the obtained link
                    item_name=link.split("/")[3]
                    item_id=link.split("/")[5]
                    # replace the obtained link by the link that leads to the customer reviews
                    base_url="https://www.amazon.com/"
                    link=base_url+item_name+"/product-reviews/"+item_id+"/ref=cm_cr_getr_d_paging_btm_" \
                        +str(1)+"?ie=UTF8&pageNumber="+str(1)+"&reviewerType=all_reviews&pageSize=1000"
                    # obtain the price of the selected single item
                    price_main=price_main_tag[0].text
                    price_fraction=price_fraction_tag[0].text
                    item_price=int(price_main)+0.01*int(price_fraction)
                    # obtain the brand of the selected single item
                    item_brand=price_tag.parent.select(".a-size-small")[1].text
                    if(item_brand=="by "):
                        item_brand=price_tag.parent.select(".a-size-small")[2].text
                    # obtain the number of reviews of the selected single item
                    item_num_of_review=int(re.sub(",","",link_item[0].text))
                    # obtain the averaged number of stars
                    starSelect=item.select_one("span[class='a-declarative']")
                    if((starSelect is None) or (starSelect.span is None)): # there are no reviews yet (hence, we see no stars at all)
                        item_avestar=0
                    else:
                        item_avestar=starSelect.span.string.split(" ")[0] # there are some reviews, so we can extract the averaged number of stars
                    # store the obtained variables into lists
                    item_links.append(link)
                    item_num_of_reviews.append(item_num_of_review)
                    item_prices.append(item_price)
                    item_names.append(item_name)
                    item_ids.append(item_id)
                    item_brands.append(item_brand)
                    item_avestars.append(item_avestar)
    return item_brands,item_ids,item_names,item_prices,item_num_of_reviews,item_links,item_avestars
In [659]:
item_brands,item_ids,item_names,item_prices,item_num_of_reviews,item_links,item_avestars=items_info_extractor(soups)
In [385]:
print(len(item_ids))
print(len(set(item_ids)))
In [386]:
print(len(item_names))
print(len(set(item_names)))
In [387]:
print(len(item_links))
print(len(set(item_links)))
The above results indicate that there are items that have the same product name but different links.
Cool. Let's find those products.
In [391]:
import collections
item_names_repeated=[]
for key in collections.Counter(item_names):
    if collections.Counter(item_names)[key]>1:
        print(key,collections.Counter(item_names)[key])
        item_names_repeated.append(key)
#print [item for item, count in collections.Counter(a).items() if count > 1]
In [392]:
print(item_names_repeated)
In [419]:
items_repeated=[]
for name,link,price,numrev in zip(item_names,item_links,item_prices,item_num_of_reviews):
    if name in item_names_repeated:
        #print(name,link,"\n")
        items_repeated.append((name,link,price,numrev))
Sort the list with sorted() (a "key" has to be given):
In [420]:
items_repeated=sorted(items_repeated, key=lambda x: x[0])
In [424]:
print("item name, item link, item price, total # of reviews of that item","\n")
for idx,(name,link,price,numrev) in enumerate(items_repeated):
    if((idx+1)%2==0):
        print(name,link,price,numrev,"\n")
    else:
        print(name,link,price,numrev)
What we found:
Reference: http://pbpython.com/pandas-list-dict.html
In [661]:
for id in item_ids:
    if("B006LXOJC0" in id):
        print(id)
In [664]:
df=pd.DataFrame.from_items([("pindex",item_ids),("type","handheld"),("pname",item_names),("brand",item_brands),("price",item_prices),("rurl",item_links),("totalRev",item_num_of_reviews),("avgStars",item_avestars)])
In [671]:
df.loc[:,["rurl","avgStars","totalRev"]]
Out[671]:
In [466]:
from sqlalchemy import create_engine,Table,Column,Integer,String,MetaData,ForeignKey,Date
import pymysql
engine=create_engine("mysql+pymysql://semantic:GbwSq1RzFb@104.199.201.206:13606/Tests?charset=utf8",echo=False, encoding='utf-8')
conn = engine.connect()
df.to_sql(name='amzProd', con=conn, if_exists = 'append', index=False)
conn.close()
In [469]:
df.to_csv("ProdInfo_handheld_26012017.csv", encoding="utf-8")
And load it:
In [474]:
pd.DataFrame.from_csv("ProdInfo_handheld_26012017.csv", encoding="utf-8")
Out[474]:
In [496]:
from sqlalchemy import create_engine,Table,Column,Integer,String,MetaData,ForeignKey,Date
import pymysql
import datetime
I found that the same pindex can appear more than once within a single dataframe. This leads to an error when we upload the data to MariaDB, since the primary key ought to be unique.
In [531]:
pd.set_option('max_colwidth', 800)
for idx,df in enumerate(dfs):
    print(idx,df.loc[df['pindex'] == 'B00SWGVICS'])
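A minimal sketch of one way to handle this before uploading (an illustrative addition, assuming we simply keep the first occurrence of each pindex in a given dataframe df):
# drop rows that share the same pindex so the primary-key constraint in MariaDB is not violated
df_unique = df.drop_duplicates(subset="pindex", keep="first")
print(len(df), len(df_unique))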
In [42]:
import os
from IPython.display import display
In [32]:
cwd=os.getcwd()
In [33]:
print(cwd)
Now it's time to get to know the pandas DataFrame better. I'd like to figure out how two dataframes can be merged horizontally.
In [58]:
test_col = pd.DataFrame.from_items([("test_column1",np.arange(10))])
test_col2 = pd.DataFrame.from_items([("test_column2",5+np.arange(10))])
display(test_col,test_col2)
In [59]:
result = pd.concat([test_col, test_col2], axis=1)
In [60]:
display(result)
In [7]:
date="2017-02-01"
prodTypes=["central","canister","handheld","robotic","stick","upright","wetdry"]
# put all the dataframes into a list
dfs=[pd.DataFrame.from_csv("data/ProdInfo_%s_%s.csv"%(prodType,date), encoding="utf-8") for prodType in prodTypes]
for idx,df in enumerate(dfs):
    cID=[j%7 for j in range(df.shape[0])]
    colCID=pd.DataFrame.from_items([( "cID",cID )])
    dfs[idx]=pd.concat([df, colCID], axis=1)
# concatenate dataframes
df=pd.concat(dfs).drop_duplicates("rurl")
df.to_csv("ProdInfo_all_%s.csv"%(date), encoding="utf-8")
In [5]:
date="2017-02-01"
date="2017-02-06"
prodTypes=["central","canister","handheld","robotic","stick","upright","wetdry"]
# put all the dataframes into a list
dfs=[pd.DataFrame.from_csv("data/ProdInfo_%s_%s.csv"%(prodType,date), encoding="utf-8") for prodType in prodTypes]
for idx,df in enumerate(dfs):
    cID=[j%7 for j in range(df.shape[0])]
    colCID=pd.DataFrame.from_items([( "cID",cID )])
    dfs[idx]=pd.concat([df, colCID], axis=1)
# concatenate dataframes
df=pd.concat(dfs).drop_duplicates("rurl")
# prepare the connection and connect to the DB
engine=create_engine("mysql+pymysql://semantic:GbwSq1RzFb@104.199.201.206:13606/Tests?charset=utf8",echo=False, encoding='utf-8')
conn = engine.connect()
# remove duplicates and upload the concatenated dataframe to the SQL DataBase
df.to_sql(name='amzProd', con=conn, if_exists = 'append', index=False)
# close the connection
conn.close()
In [111]:
len(df.iloc[974]["brand"])
Out[111]:
In [540]:
df.iloc[463]["pname"]
Out[540]:
In [543]:
!echo "Handheld-Vacuum-Cleaner-Abask-Vacuum-Cleaner-7-2V-60W-Ni-CD2200MA-3-5KPA-Suction-Portable-1-Accessories-Rechargeable-Cordless-Cleaner"| wc
The length of this string is larger than 100 characters. Therefore, I have to alter our schema, since the product-name column was set to a length of 100 by default.
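A minimal sketch of the schema change (assuming the product name lives in the pname column of the amzProd table used earlier; the new length of 255 is a guess, not a value taken from the original schema):
from sqlalchemy import create_engine

# widen the product-name column so that long names like the one above still fit
engine = create_engine("mysql+pymysql://semantic:GbwSq1RzFb@104.199.201.206:13606/Tests?charset=utf8", echo=False)
with engine.connect() as conn:
    conn.execute("ALTER TABLE amzProd MODIFY pname VARCHAR(255)")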